diff --git a/Cargo.lock b/Cargo.lock index 5a8f7fbb02ed8..5c2ef13bb7914 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7183,6 +7183,7 @@ dependencies = [ "itertools 0.14.0", "lgalloc", "libc", + "lz4_flex", "mz-ore", "mz-ore-proc", "native-tls", diff --git a/doc/developer/design/20260610_buffer_managed_state.md b/doc/developer/design/20260610_buffer_managed_state.md new file mode 100644 index 0000000000000..e3c6a00b9524d --- /dev/null +++ b/doc/developer/design/20260610_buffer_managed_state.md @@ -0,0 +1,671 @@ +# Buffer-managed dataflow state + +* Associated: [20260504_pager.md](20260504_pager.md) (the explicit pager this design succeeds), [CLU-65](https://linear.app/materializeinc/issue/CLU-65/pager). + +## The problem + +Materialize keeps dataflow state — merge-batcher chains, arrangement batches, upsert state — in resident memory, and treats disk as a reactive spill target. +The current mechanism is `mz_ore::pager`: a blob store that serializes a whole ~2 MiB columnar chunk and either hints it cold to the kernel (`Swap` backend) or writes it to a per-chunk scratch file (`File` backend). + +Production experience with this design surfaced four structural problems. + +First, the pager is a blob store, and the blob model caps what it can do. +Chunks page out whole and rehydrate whole; the partial-read API (`read_at_many`) has no production consumer. +Every page-in is a full deserialize-and-copy even when the reader needed a fraction of the data, and a resident chunk and its paged form are different types (`PagedColumn::Resident` vs `Paged` vs `Compressed`), so residency decisions are baked in at pageout time by a policy that can see only the chunk's size. + +Second, the file backend pays filesystem metadata per chunk. +One named file per 2 MiB chunk means create, writev, open, pread, unlink per chunk per merge generation. 
+Profiling showed the hot cost is unlink and inode eviction — journaled extent deallocation plus page-cache invalidation under the inode lock — at 35.6 s versus 4.3 s for opens in the measured workload. +This is structural: the cost scales with chunk turnover, and merge-heavy workloads turn chunks over constantly. + +Third, the swap backend trades control for laziness. +It wins when the working set mostly fits (data re-read before reclaim never touches disk, and translation is free) but under real pressure the kernel pages synchronously, per 4 KiB, on the worker thread: the pager design doc measured 64 s of sys time in a 65 s single-threaded merge, with TLB shootdowns and direct reclaim as user-visible latency. +The kernel also cannot know that a chunk consumed by a merge is dead, so it dutifully writes garbage to disk. + +Fourth, and most importantly, on the columnar path only the pre-seal batcher stash spills at all. +Sealed spine batches — the arrangements proper, the dominant long-lived memory — are fully resident and invisible to any spill budget. +Columnation-era arrangements retain a transparent disk story through lgalloc's file-backed mappings (see Background), but the columnar containers removed lgalloc by design, and nothing replaced it past the batcher. +Hydration of a large arrangement holds the entire state in RSS regardless of how much of it is actively needed. + +Buffer management for larger-than-memory state is an old and intensely studied problem in the database literature — not a solved one, as a decade of successive buffer-manager redesigns attests, but one rich in measured designs and documented failure modes to build on. 
+This design replaces the spill-a-blob model with a buffer-managed architecture in the style of Umbra and vmcache, adapted to the properties that make Materialize's problem easier than the general one: state is immutable once sealed, recreatable from persist (no durability requirement), and its lifecycle (just-built, sealed, queued-for-merge, dead) is known to the engine rather than guessed by a cache. + +## Success criteria + +The design succeeds when: + +* Resident access to a chunk costs one atomic load; no hash-table translation, no serialization, no copy. +* A chunk that dies before pressure forces it out never touches disk. +* Workers never enter kernel direct reclaim on the state path, and any state-path I/O a worker performs is explicit, bounded, and chunk-granular at a point the engine chose — never an unscheduled page fault. + Whether write-back runs on workers or dedicated spill threads is a measured choice (see I/O execution). +* Chunk turnover performs no per-chunk filesystem metadata operations: no create, no unlink, no inode churn. +* Sealed arrangement batches can be paged: a merge of two batches holds a bounded resident window rather than both batches; an arrangement cursor faults at most one leaf page per cold seek. +* Hydration of an arrangement larger than the memory budget completes with RSS bounded by the budget, not by state size. +* One budget pool covers batcher chains and arrangement batches; exceeding it triggers eviction of cold chunks rather than gating pageout decisions per chunk. +* The swap and file backends of `mz_ore::pager`, and the `PagedColumn` residency enum, are deleted at the end of the migration. + +## Out of scope + +* Durability and crash consistency. + State is recreatable from persist; the scratch volume is a cache. + No WAL, no manifest, no fsync anywhere in this design. +* Async restructuring of timely operators. + Cursors and containers stay synchronous; cold accesses stall the worker for one NVMe read. 
+ ForSt-style asynchronous state access is a separate project. +* Warm restart (reattaching to scratch state across process restarts) ships as a follow-up, not in the initial milestones. + The design keeps it reachable: the eager-backed format is versioned and self-describing from day one. +* Sharing on-disk format with persist parts. + A north star (see Future work), not a requirement. +* Key-value separation (WiscKey-style out-of-line values). + The value container stays abstract enough to add it; see Open questions. +* Non-Linux production support, as with the existing pager. + +## Background + +### Three generations of disk offload + +Materialize has approached larger-than-memory state three times. + +The first generation was [lgalloc](https://crates.io/crates/lgalloc) (integrated October 2023): a size-classed allocator that serves large allocations from memory-mapped files on the scratch volume. +Columnation-backed arrangement regions and persist's arrow buffers allocate through it (`enable_columnation_lgalloc`, default on), which is why columnation-era arrangements have a disk story today — but a transparent one: the kernel owns writeback and reclaim of the file-backed pages, including dirty-page writeback at its own discretion. +Operating it has meant approximating policy from below the allocation boundary: a background worker returning freed memory, eager-return and file-growth-dampener knobs, and eventually a disk usage limiter (May 2025) — all compensating for the fact that an allocator sees only `alloc` and `free`, never "this region is cold", "this run will be read sequentially next", or "this data is dead". + +The second generation was kernel swap: provision swap on the cluster nodes and let the kernel page anonymous heap memory under pressure. 
+Swap subsumes the allocator seam — every allocation is implicitly offloadable, nothing opts in — but it conflates dataflow state with arbitrary heap allocations and hands eviction to the kernel at anonymous-page granularity. +The pager design doc records the endpoint: direct reclaim running on worker threads, `pgscan_direct` spikes during hydration, synchronous per-4 KiB faults serializing single-threaded merges. + +The third generation is the explicit pager of [20260504_pager.md](20260504_pager.md): the application marks cold data and chooses a backend. +The columnar end-to-end project deliberately removed lgalloc from the columnar containers ([CLU-64](https://linear.app/materializeinc/issue/CLU-64/remove-lgalloc-from-columnar): `Column::Aligned` becomes a plain `Vec`) to create the seam the pager plugs into; the lgalloc copies in persist's arrow path are being removed in parallel. +The pager fixed the control problem at blob granularity but inherited the ceilings described under The problem. + +This trajectory matters to the present design in two ways. +First, the first two generations differ only in where the kernel's transparency attaches — file-backed mappings versus anonymous memory — and share a failure mode that the third generation answers only partially: the entity that owns eviction is not the entity that knows data lifecycle, and neither the kernel nor a size-only pageout policy can know it. +Second, lgalloc's address-space layout — size-classed regions over a scratch volume — is structurally Layer 2's layout; what changes is the direction of control, from file-backed mmap with kernel paging to anonymous memory with engine-scheduled explicit I/O. +In that sense Layer 2 is less a new mechanism than lgalloc with the kernel taken out of the loop.
+ +### The workload, from first principles + +The state this design serves has properties a general-purpose storage engine cannot assume: + +* **Immutable after seal.** Columnar chunks and arrangement batches never mutate; the only state transitions are residency transitions. + This deletes dirty tracking, write-back ordering, reader/writer latching on content, and torn-read hazards. +* **Recreatable.** Everything can be rebuilt from persisted sources, so every durability mechanism is deletable. +* **Lifecycle known to the engine.** The merge batcher knows which chains merge next; a consumed chunk is dead; hydration output is write-once-read-rarely by construction. + A generic buffer manager spends real machinery (LeanStore's cooling stage, LRU approximations) guessing at coldness the engine here simply knows. +* **Maintenance is sequential; the update path is random and delta-proportional.** + Merges, extraction, and hydration are linear scans and dominate bytes moved. + The latency-critical path has the opposite shape: incremental operators do work proportional to arriving deltas — a join probes arrangements at exactly the updated keys, upsert reads back the keys in the input batch — and interactive peeks are fully random. + How much each side matters is workload-dependent, and this is the assumption in this document most likely to draw disagreement: a design that follows the sequential framing too closely becomes a batch processor. + Several choices below (per-arrangement residency policy, per-arrangement leaf sizes, uncompressed lookup tiers, the record-cache open question) exist specifically to keep per-key probes cheap; see "Access patterns, read amplification, and policy" under Layer 3. +* **Already log-structured.** A differential spine is a tiered LSM: batches are immutable sorted runs, the fueled merge scheduler is the compactor, batcher chains are L0. 
+ The design question is not "adopt an LSM" but "give the existing LSM a paged run format and a buffer manager." + +Tuning within the current blob model has continued, and its best-known endpoint is instructive. +The strongest measured swap-backend strategy ([#36948](https://github.com/MaterializeInc/materialize/pull/36948), benchmarked in [CLU-108](https://linear.app/materializeinc/issue/CLU-108/correctionv2-pager-lz4-compress-spilled-chunks-madv-pageout-swap)) couples lz4 compression with an eager `MADV_PAGEOUT` over the compressed bytes at spill time: peak RSS holds at the budget (0.40 GiB versus 0.97 GiB with lazy hints, where RSS drifts to the cgroup cap and is relieved only at the kernel's pressure cliff). +The eager eviction pays precisely because compression shrinks the re-fault volume ~5.6×; on the uncompressed path the same hint is a measured net loss. +That result is this design's thesis expressed through kernel primitives — compress at the spill boundary, release physical memory eagerly, keep RSS honest against the budget — and it simultaneously marks the blob model's ceiling: re-access still faults synchronously per 4 KiB on worker threads, a chunk consumed moments after spill was evicted (and must be re-faulted) anyway because the kernel cannot know it was about to die, and the compress-and-evict decision remains irrevocable at pageout time. +Layer 2 keeps the converged policy — cold data compressed past the memory boundary, physical memory released eagerly — while replacing the mechanism with one the engine schedules, prioritizes, and can cancel. + +The disk-versus-memory question is not binary. +Treating disk as the first-class home of all state (every chunk written at seal) buys pressure-free eviction and warm restart but imposes a write-bandwidth floor proportional to merge traffic — fatal for young data that dies in seconds, and for EBS-class disks. 
+Treating disk as a pure spill target avoids the floor but makes pressure handling a write storm at the worst moment and leaves nothing on disk to reattach to. +The resolution is that both modes share one mechanism and differ in a single policy bit — when must a chunk be backed — and the spine's geometric level structure supplies a principled threshold: lazy backing for young, churning data; eager backing for sealed, deep, long-lived data. + +## Solution proposal + +Three layers, built bottom-up, each independently shippable. + +```mermaid +flowchart TB + subgraph L3["Layer 3: paged sealed batches"] + Header["resident header: fence keys, page table"] + Pages["column pages (leaves)"] + end + subgraph L2["Layer 2: buffer manager"] + Pool["size-class VM regions, pin-mediated access"] + State["per-chunk state word + epochs"] + IO["spill threads: write-behind, prefetch"] + end + subgraph L1["Layer 1: extent store"] + Extents["extent allocator: file extents or swap-backed anonymous"] + end + Batcher["merge batcher chains"] --> L2 + L3 --> L2 + L2 --> L1 +``` + +### Layer 1: extent store + +The extent store is an interface — allocate, write, read, free, in chunk-class-sized extents — not a single mechanism. +It needs two implementations, because production nodes today provision the entire disk as swap and mount no scratch filesystem: file extents have nowhere to live until volume topology changes, and the design must not wait for that. + +#### File extents + +Replace per-chunk scratch files with a few large preallocated files per worker (or `O_TMPFILE` inodes) and a userspace extent allocator. + +* Allocation rounds to the chunk size classes (see Layer 2), so the free list is per-class and fragmentation is bounded by construction. +* Free is a free-list push. + No unlink, no journal transaction, no inode eviction — the measured 35.6 s cost class becomes pointer arithmetic. 
+ Space is returned to the filesystem lazily and batched via `fallocate(FALLOC_FL_PUNCH_HOLE)` only if scratch-volume pressure demands it. +* I/O is `O_DIRECT`: the buffer pool (Layer 2) is the cache, and the kernel page cache would be a second copy of everything plus unpredictable writeback. + Aligned buffers are natural for the paged format. +* The DuckDB temp-file model is precedent: slotted, recycled temp files in native block format rather than create-unlink per object. + +This implementation is shippable behind the existing `Handle` API as a drop-in replacement for the file backend's storage, before any of Layer 2 exists. + +#### Swap-backed extents + +Where no filesystem exists, the extent store is anonymous memory the engine deliberately hands to kernel swap. +An extent is an extent-sized anonymous allocation: "write" compresses the evicted chunk into it and issues `MADV_PAGEOUT`, pushing the pages to the swap device; "read" issues `MADV_WILLNEED` ahead of need — asynchronous swap-in, the backend's readahead mechanism — then decompresses into the chunk's freshly allocated slot; "free" is a plain deallocation, with any swapped copy discarded for free. + +This is the lz4 + `MADV_PAGEOUT` strategy of [#36948](https://github.com/MaterializeInc/materialize/pull/36948) generalized from a spill-path special case into the pool's backing layer: the same measured costs, the same ~5.6× reduction in swap traffic from compression, now with the pool's write elision and lifecycle policy above it and the Layer 3 format readable through it. +The I/O executor choice applies unchanged — compress-plus-madvise runs on the evicting worker or on spill threads. + +The backend has no metadata costs at all; its weakness is the read path, where the kernel owns fault servicing and the old reclaim ceilings apply at low thread counts — mitigated by compression (5.6× fewer bytes to fault) and `MADV_WILLNEED` prefetch, but not eliminated. 
+Where both backends are available, the choice is a config knob over the same interface; milestone 2 benches them head to head. + +### Layer 2: buffer manager + +#### Address space and translation + +Reserve large anonymous virtual-memory regions per size class (Umbra's design): a 2 MiB class for batcher chunks and large column pages, hugepage-aligned, plus one or two smaller classes for Layer 3 pages and headers. +Virtual reservation is `MAP_NORESERVE`-cheap; physical memory materializes on use and is released on eviction with batched `MADV_DONTNEED`. +The pool memory is not file-backed: the pool is a cache over the extent store, not a view of it, and data moves between the two only through explicit, engine-scheduled I/O — the kernel never transfers a byte in either direction, unlike both lgalloc (where the mapping is the file) and swap (where anonymous memory is implicitly device-backed). + +Slots are scoped to residency: eviction returns a chunk's slot to a free list along with its physical pages, and fault-in allocates a fresh slot, so a chunk's address is stable only between a fault-in and the next eviction. +Pointers into a chunk are valid only under a pin (or within an epoch), which blocks eviction; nothing may cache a raw pointer across pins — the same discipline the borrow-safety rules below already require. +This is the classical buffer-manager position (frame and page identity decoupled, all access latch-mediated) rather than vmcache's, and the divergence is deliberate. +vmcache fixes every page's address for the lifetime of the *database* in order to serve two consumers Materialize does not have: optimistic lock-free readers (which must be restartable — differential's cursors hand out borrows that cannot retry, and timely's worker-sharded arrangements make reader-side contention structurally absent anyway) and arbitrary reference graphs without pointer fix-up (all access here re-derives from the handle under a pin). 
+What lifetime-stable addresses would cost is decisive at Materialize's scale: slot demand would track the *live* set — for backlog-shaped consumers, an unbounded un-drained backlog — putting a virtual-address ceiling on backlog size and, worse, accreting ~0.2% of peak backlog as unevictable page-table memory (vmcache budgets exactly this, ~2 GB DRAM per TB, and accepts it because its mapped entity is a bounded, purchased database; a transient queue inverts that economics). +Residency-scoped slots make slot demand track the budget instead: touched address space, and therefore page tables, are bounded by peak residency, and reservations can be absurdly generous (the prototype reserves 1 TiB per class) without ever costing anything. +Should an optimistic-read tier ever be wanted, relocation does not preclude it: regions stay mapped, so a speculative read of a reused slot returns wrong-but-safe bytes, and validation needs one slot-ownership check beyond the version compare. + +Translation while resident is arithmetic on the chunk handle. +There is no page table, no hash map, no latch on the resident path. + +#### Chunk states + +Residency is a state, not a type. +Each chunk carries one atomic state word (the vmcache per-page word, minus the dirty states immutability deletes): + +* `UnbackedResident` — lives only in the pool; no disk copy exists. +* `WriteInFlight` — write to an extent has been enqueued or issued; the chunk remains readable. +* `BackedResident` — clean; a disk copy exists; eviction is free (release physical pages, no I/O). +* `Evicted` — disk copy only; access faults it back in. +* `Faulting` — read in flight; concurrent accessors wait or retry. + +The `PagedColumn` enum and its `Resident`/`Paged`/`Compressed` variants disappear; callers hold a chunk handle and access it uniformly. + +#### Write-behind, and never writing dead data + +"Page out" becomes a state transition, not an I/O. 
+Under budget pressure (or eager-backing policy, below) a chunk transitions to `WriteInFlight` and its write-back is performed by the evicting worker or queued to spill threads (see I/O execution); on completion it is `BackedResident`; physical pages are released only when the budget actually demands it. + +This captures what makes the swap backend fast — laziness, free re-access before reclaim — while keeping what makes the file backend controllable, and adds the one thing neither backend can do: freeing an `UnbackedResident` chunk is a pure memory operation, and freeing a `WriteInFlight` chunk cancels the write. +In a merge-heavy workload most chunks die young; avoided writes are the largest available win, and only the engine knows liveness. + +#### Eviction policy + +Lifecycle hints drive eviction; a small second-chance FIFO is the backstop for unannotated chunks. + +* Dead chunks are freed immediately (not evicted — there is nothing to keep). +* Chunks in chains scheduled to merge soon are hot and treated as if pinned. +* Chunks in long chains awaiting more input are cold. +* Hydration-era output is write-once-read-rarely: eager-evict FIFO. + +LeanStore's cooling stage exists because a generic buffer manager must speculate about coldness; the engine here knows it, so the speculation machinery reduces to a backstop. +The budget is the existing `TieredPolicy` atomic pool reinterpreted: it bounds resident bytes, and exceeding it selects eviction victims rather than gating pageout decisions per chunk. + +#### Borrow safety: epochs + +Readers take no per-access locks. +Workers advance an epoch counter at operator yield points; the evictor releases physical pages only for chunks unpinned for a full epoch. +Borrows into pool memory never cross a yield (cursor positions stored across activations are indices; re-access goes through the container, which re-faults if needed), so epoch protection is sound with zero reader-side cost beyond the state-word load.
+ +This deliberately departs from Umbra's optimistic versioned latches: optimistic validate-and-restart requires restartable readers, and differential cursor consumers dereference borrows in arbitrary downstream code that cannot be re-executed. +The trade is explicit: we give up evicting a page out from under an active reader — which we do not need, because borrows are yield-bounded — for zero per-access synchronization. + +#### I/O execution + +Common to every option: on the file-extent backend, transfers are chunk-granular `pread`/`pwrite` with `O_DIRECT` (the swap-backed store substitutes compress-plus-`madvise`), and a worker that needs evicted data immediately faults it in synchronously (one bounded NVMe read, ~100 µs–1 ms for 2 MiB-class transfers depending on device). +The Haas/Leis NVMe results size the stack: io_uring with deep queues matters at 4 KiB OLTP page sizes; at ≥256 KiB transfers, synchronous calls at modest concurrency saturate the device, so 2 MiB chunks need no exotic submission machinery either way. + +Who performs write-back and readahead is a design choice with two candidates, decided by measurement at milestone 2. + +**Synchronous, on-worker — Umbra's model.** +The evicting worker writes the victim chunk itself; merge code prefetches by issuing reads early. +Published Umbra works exactly this way — synchronous `pread`/`pwrite` from worker threads throughout, chosen explicitly for simplicity — and the simplicity is just as real here: no queues, no completion tracking, no cross-thread chunk-state transitions, and natural backpressure, since the worker causing spill pays for spill. +The stall arithmetic may well be acceptable: a 2 MiB `O_DIRECT` write at device speed is roughly a millisecond (plus sub-millisecond lz4), bounded, chunk-granular, and taken at a point the engine chose — categorically different from swap's unbounded per-4 KiB fault storms even though both are "synchronous".
+The structural weakness is the read side: one thread cannot overlap I/O with compute, so a cold merge serializes read-then-process per chunk and runs at device latency rather than device bandwidth — a gap kernel readahead used to hide on the buffered path and `O_DIRECT` forfeits. + +**Asynchronous, off-worker — LeanStore's page-provider model.** +A small pool of dedicated spill threads consumes a write-behind queue and services readahead, so workers never perform eviction I/O and merges overlap fault-in with compute. +This is the LeanStore 2018 pattern; io_uring (which LeanStore adopted only in its 2024 NVMe redesign) stays deferred behind the same interface until transfer sizes shrink enough to need it. +The costs are the machinery — queues, completions, sizing knobs, cross-thread state-word transitions — and a longer cancellation path for dead chunks with writes in flight. + +A middle path exists if measurement splits the difference: workers take write stalls synchronously but submit their own readahead through a small per-worker io_uring, checking completions at access time — read-side overlap without dedicated threads. +The chunk state machine is identical in all three shapes; only the executor of the `WriteInFlight` and `Faulting` transitions differs, so the choice is contained behind one interface and revisitable. + +The staging prototype supplies the first data point (see Measured under Performance estimates): on the swap-backed store, where eviction cost is `MADV_PAGEOUT` page-table work rather than a device write, unguarded on-worker enforcement saturated CPUs, and moving eviction to off-worker spill threads, behind single-flight enforcement and a bounded queue, resolved it. +The on-worker option remains live for the file-extent backend, where the write is a single bounded `pwrite`. + +Eviction throughput is bounded by `madvise` page-table work and TLB shootdowns (the vmcache paper's measured ceiling; their fix, the exmap kernel module, is not shippable here).
+Mitigation is granularity: 2 MiB chunks are ~500× fewer page-table operations per byte than 4 KiB pages, and `MADV_DONTNEED` calls are batched. + +#### Compression + +Compression is a property of the extent, not the residency state. +Resident form is always uncompressed in its slot; lz4 (or stronger, see BtrBlocks under Prior art) is applied at write time by whichever thread performs the write (see I/O execution) and reversed at fault-in. +This keeps codec CPU off the access path — and off worker threads entirely in the off-worker option — and removes today's oddity where `CompressedInner::Memory` holds lz4 bytes in resident memory (softened by the eager `MADV_PAGEOUT` of [#36948](https://github.com/MaterializeInc/materialize/pull/36948), but still kernel-managed on the way back in). + +### Layer 3: paged sealed batches + +#### The integration seam + +Differential batch storage is generic over containers: `OrdValBatch` is parameterized by a `Layout` whose `KeyContainer`, `ValContainer`, offset, time, and diff containers each implement `BatchContainer`, and Materialize already substitutes its own containers in `mz_row_spine`. +A paged container — elements stored across Layer 2 chunks plus a small resident header — implements `BatchContainer` without forking `OrdValBatch` or cursor logic. +Pin/epoch protection makes this sound: `index` returns a borrow into pool memory, valid because eviction cannot reclaim or relocate a page mid-borrow, and accessors re-derive their view from the handle rather than caching pointers across borrows. +Differential's pending `Chunk` abstraction offers a cleaner, chunk-granular seam that this design prefers where available; see "Integration with differential's `Chunk` abstraction" below.
+ +#### What is free to vary, and what is not + +The index-structure literature is a zoo: update-in-place B+-trees, LSM variants, hybrids that buffer writes in tree leaves (Bf-Tree), record-granular hot/cold migration (2-Tree, anti-caching), key-value separation, per-run filters, learned indexes. +This design does not adjudicate the zoo, and its choices should not be mistaken for a claim to have done so. +The narrowing principle is that differential's spine semantics are load-bearing: operators rely on batches being immutable, sorted, timestamped, consolidated, and merged under frontier control, so the macro-structure — immutable sorted runs compacted by the fueled scheduler — is an input to this design, not a choice it gets to make. +Replacing it with an update-in-place tree or a record-migrating hybrid is a differential redesign, out of scope here. + +What remains genuinely free, per arrangement, and where the zoo maps onto it: + +* **Within-run layout**: leaf page size; key representation — inline, prefix-truncated, or out-of-line (`UpsertKey` is already an out-of-line key: a fixed-width hash standing in for the real key); value placement (inline vs WiscKey-style extents). +* **Cross-run read path**: resident fence keys always; per-run filters where probe traffic warrants them, with the LSM filter-allocation results (Monkey) applying directly since spine runs are levels; the record-granular hot cache as the explicitly open question. +* **Hot/cold mechanism**: page-granular residency is the committed mechanism; record-granular migration is the deferred alternative if probe traffic defeats page granularity. + +This narrowing is also the precise answer to "why not a B+-tree": not because B-trees lose some abstract benchmark, but because the spine already fixes the macro-structure, and a static B+-tree per run is simply what falls out of indexing an immutable sorted run. 
+ +#### Run format + +A paged batch is, structurally, a static bulk-loaded B+-tree: + +* **Resident header** (small, never evicted): fence keys — the first key of each page, owned — a page table mapping logical index ranges to chunk handles, and optionally a filter. + Resident overhead is tens of bytes per 2 MiB page, four orders of magnitude below the data. +* **Column pages** (leaves): the batch's parallel columns split at aligned boundaries. + Size classes per column density: small pages for offsets/times/diffs, large pages for key and val data. + +`seek_key` binary-searches fence keys with zero I/O and faults at most one leaf page. +This is Umbra's B+-tree read path with the write path amputated: because batches are immutable and built bottom-up at seal, there are no structure-modification operations, no latch coupling, no insert path — every hard problem in the mutable-tree literature is absent by construction. + +#### Access patterns, read amplification, and policy + +* **Spine compaction**: two linear scans; readahead keeps a bounded resident window, making GB-scale batch merges RSS-bounded. +* **Joins**: probe traffic is delta-proportional — each input batch probes the arrangement at its updated keys. + Probes arrive in key order but may touch an arbitrarily sparse subset of pages; dense probe sets approximate scans, sparse ones are point lookups in disguise. +* **Upsert feedback**: point lookups, batched and sortable before issue. +* **Interactive peeks**: genuinely random and latency-sensitive. + +Read amplification is the central tension for everything except compaction. +A cold probe faults a whole leaf page to read one row: at 2 MiB pages that is roughly four orders of magnitude of amplification, plus a decompress if the page carried a codec — "decode a megabyte to read a row" is exactly the failure mode a page-granular design courts, and a design tuned only for scan throughput would court it constantly. 
+Four mechanisms bound it, in escalating order: + +* Leaf size is a per-arrangement choice, not a constant: lookup-heavy arrangements use small leaves (64 KiB-class, ~30× less worst-case amplification) at the cost of more header entries; scan-heavy arrangements keep large leaves. +* Compression is per-page policy, not format: lookup-serving tiers skip the codec, so a faulted page is readable without decode. +* Per-run filters reject absent keys without faulting anything. +* The residency policy is per-arrangement, and the safety valve is total: an arrangement whose probe traffic makes paging a net loss is simply kept resident — exactly today's behavior. + The design degrades to the status quo, not below it. + +Batched seeks convert most sparse probes into anticipated reads. +Probe traffic arrives in batches — upsert feedback reads back the keys of an input batch, a join processes a batch of updates against the arrangement — and a cursor that receives the whole batch can see its future: plan against the resident fence keys and filters (zero I/O; a linear merge of two sorted sequences) the exact page set the probes touch, prefetch that set in one pass (pages in flight concurrently rather than faulted serially per key), then drain key by key against resident pages. +Differential's pending `Chunk` work plans exactly such a batched-key cursor variant, and the probe input being itself a sorted chunk makes the planning phase a chunk-against-fences merge. +This moves batched probe traffic out of the demand-miss bucket entirely and narrows the residency-policy burden to genuinely interactive single-key peeks. + +If sparse cold probes remain on the hot path after all of the above, page granularity itself is the wrong caching unit for that traffic, and the remaining option is record-granular caching above the page layer (the Bf-Tree mini-page idea) — an open question, not a committed mechanism. 
+Arrangements serving interactive peeks additionally keep recent batches and headers resident under a priority-aware budget (see Open questions). + +#### Hydration + +Build pages during hydration, seal, write-behind, evict eagerly. +Arrangement hydration proceeds with RSS bounded by the budget regardless of state size — the co-tenancy problem the stash-merge fueling work attacks from the demand side, solved from the supply side. + +#### Values + +Large `Row` values dominate some arrangements and are rewritten by every merge. +WiscKey-style full key-value separation conflicts with differential consolidation — merges compare `(key, val)` pairs, and out-of-line values would cost a dereference per comparison — but values are only compared when keys tie, so with mostly-unique keys the dereference rate may be low enough that separation wins for large rows. +The template is Umbra's string layout: small values inline in the leaf page, large values out-of-line in write-once extents referenced by pointer, threshold chosen by measurement. +The initial format keeps the val container abstract enough to add this without a format break. + +### Integration with differential's `Chunk` abstraction + +Differential [PR #744](https://github.com/TimelyDataflow/differential-dataflow/pull/744) introduces `trait Chunk`: a consolidated, sorted, `Rc`-shared run of `(data, time, diff)` updates with a size bound and a maximal-packing ("grading") invariant, designed so one abstraction backs the containers collections transit, the merge batcher's chains, and — as a `Vec` plus a time description — a whole batch (`ChunkBatch`). +The harness code (binary merger, fueled batch merger, compaction queue, builder) is generic; all layout-aware work lives behind the trait. +If it lands, it is the natural differential-side landing zone for Layer 3, and several pieces of this design simplify into it. 
+ +#### The mapping + +* **`ChunkBatch` is the run format of this design, minus the paging.** + It carries per-chunk `first_keys` / `last_keys` / `first_vals` / `last_vals` containers — the resident header's fence keys, including the val-level fences that boundary-straddling `(key, val)` runs need — and its cursor binary-searches that index before opening a within-chunk cursor, handling boundary spills explicitly without touching chunk contents. +* **The fault boundary is the trait.** + Everything a seek needs is resident in the `ChunkBatch` (fence containers are copied out of chunks at construction), and all data access funnels through `Chunk::cursor` and the inner cursor's accessors. + Chunk-granular paging therefore matches the pool's unit exactly, and is a cleaner seam than paging inside `BatchContainer`: the integration-seam subsection above describes the container-level fallback, but `Chunk` is the preferred target. +* **The fueled batch merger is the bounded-window consumer.** + It reads sources by cloning chunks (refcount bumps, never consuming), holds at most two heads plus graded output, and its source indices announce exactly which chunks fault next — the readahead driver comes for free. + Its implementor contract (output bounded by input consumed; recycle drained-input storage as output buffers) is the write-behind and pool-slot-recycling discipline of Layer 2, stated independently. +* **Batched-seek cursors are the probe prefetch story.** + A planned cursor variant accepting a whole chunk of probe keys lines up exactly with the paged read path's plan/prefetch/drain shape: the resident fence containers answer which chunks the entire probe set touches without I/O, prefetch covers that set in one pass, and the drain proceeds against resident data (see "Access patterns, read amplification, and policy" above). 
+* **One representation across the lifecycle.** + The same chunk transits the collection, sits in batcher chains, and lands in the sealed batch as an `Rc` move — eliminating the re-serialization at seal that today's pipeline pays between batcher and spine containers, and making Layer 2's zero-copy end state (builders filling pool memory directly) reachable across the whole path. +* **The trait formalizes "what is free to vary."** + Within-chunk layout is opaque to the harness, so the index-structure zoo is contested per-implementor, exactly as the narrowing argument above wants. + +#### A `PagedChunk` implementor + +The integration is a third `Chunk` implementor whose backing is a Layer 2 pool handle rather than `Rc`: + +* `Clone` is a handle refcount bump, satisfying the cheap-clone contract. +* `cursor()` and the inner accessors fault evicted backing in through interior mutability and return borrows tied to `&self` — sound because pin/epoch protection blocks eviction and relocation during the borrow, and every access re-derives from the handle; this consumer is what makes that machinery earn its keep. +* `bounds()` serves from an owned resident copy captured at seal time (the trait already demands cheap endpoint access, and `ChunkBatch::new` calls it while the chunk is naturally resident); after construction, seeks touch no evicted chunk until an inner cursor opens. +* `prune` overrides the default copy-merge with a range adjustment over shared storage, which the trait documentation already anticipates. + +#### Frictions + +* **Grading counts updates; paging wants bytes.** + `TARGET` is an update count serving merge-suspension granularity and index size; a paged chunk wants byte-targeted sizing for I/O efficiency, and variable-size rows make the two diverge. + The columnar implementor already tolerates oversized chunks at val boundaries, so the invariant bends; byte-based grading is feedback for the PR rather than a blocker. 
+* **The columnar implementor's v1 merge decompresses and recompresses**, materializing owned keys and vals — flagged in the PR as a known limitation. + Paged columnar chunks want range-copy merging for the same reason `ord_neu`'s merger has it; one fix serves both. +* **Boundary spill-walks fault mid-iteration.** + `map_times` and val-stepping open cursors on neighbor chunks inside operator closures — accesses the readahead API cannot easily predict. + Acceptable under synchronous fault-in; they belong on the fault-point inventory that milestone 3 audits. + +The consequence for sequencing: milestone 3 re-targets from a paged `BatchContainer` to a paged `Chunk`, shrinking from "prototype a paged container and audit borrow lifetimes across the cursor stack" to "implement one trait and audit the inner-cursor fault points." + +### Backing policy: lazy and eager + +One policy bit per chunk — when must it be backed — with the data's position in the implicit LSM choosing the value: + +* **Batcher chunks and young/small spine batches**: lazy. + Write-behind under pressure only, with die-young elision. + This is the memtable/L0 of the LSM; eager backing here is waste, and even maximally disk-first storage engines keep their write buffers in memory. +* **Sealed batches past a size or level threshold**: eager. + A deep batch survives long, is read sporadically, and merges rarely; one write at seal is cheap against its lifetime, it is exactly the data that should leave RSS, and it is what makes warm restart cover the bytes that matter. + The spine's geometric structure supplies the threshold. +* **Hydration-era output**: eager, always. + +Eager backing converts pressure response from a write storm at the worst moment into "release clean pages," a pure memory operation: the degradation curve goes from a cliff to a slope. 
+ +### Configuration + +Dyncfg-driven, mirroring the existing pager flags: + +* enable flags per consumer (compute batchers, storage upsert stash, paged sealed batches), so rollout is independent per surface; +* resident-bytes budget (the reinterpreted `TieredPolicy` pool); +* eager-backing threshold (batch size or spine level); +* I/O executor selection (and spill-thread count where applicable), codec, scratch sizing. + +The existing `ColumnPager`/`PagingPolicy` seam is the Layer 2 integration point for batchers; live reconfiguration semantics carry over. + +## Performance estimates + +Measured numbers come from the pager design doc's benches ([20260504_pager.md](20260504_pager.md), reproduced where load-bearing), the CLU-108 benchmarks behind [#36948](https://github.com/MaterializeInc/materialize/pull/36948), and the upsert-hydration profile that motivated Layer 1. +Estimates for this design derive from device arithmetic and are marked as such; milestone 2 exists to replace them with measurements. + +### Cost model + +Per 2 MiB chunk, on the pager doc's two reference boxes (single encrypted NVMe at ~1.4 GB/s sustained; r8gd striped instance NVMe at ~7 GB/s): + +* `O_DIRECT` read or write: ~0.3 ms (striped) to ~1.4 ms (encrypted single disk). +* lz4: ~2–3 ms to compress (≈0.7–1 GB/s per core), ~0.4–0.5 ms to decompress (≈4–5 GB/s); CLU-108 measured ~5.6× ratio on arrangement data. + Note the asymmetry: on-worker compression costs more than the write itself, which weights the codec decision toward the off-worker executor or toward compressing only eager-backed (cold) tiers. +* Kernel swap path for the same 2 MiB: 512 synchronous 4 KiB faults; the pager doc measured 2.12 M page-ins for an 8 GiB working set with 64 of 65 wall-seconds spent in the kernel. 
+* Per-chunk filesystem metadata (today's file backend): dominant at scale — 35.6 s of unlink/inode-eviction against 4.3 s of opens over one measured hydration; the extent allocator's equivalent is a free-list push, effectively zero. + +### Headline comparison + +"Swap" is the current swap backend (`MADV_COLD`, plus the lz4+`MADV_PAGEOUT` variant of #36948 where noted); "lgalloc" is kernel-paged file-backed mmap; "file" is today's per-chunk scratch files; "this design" is Layer 2 with extents. + +| Dimension | Swap | lgalloc | File (today) | This design (estimate) | +|---|---|---|---|---| +| Hot re-access, resident | memory speed, unless already reclaimed (refault) | memory speed | full round trip every time (~1–4 ms/chunk) | pointer deref + one atomic load | +| Merge throughput, 1 thread, 2–4× pressure | 0.12–0.15 GiB/s (measured) | ≈ swap (same fault path) | 0.36–0.50 GiB/s (measured) | ~0.6 GiB/s sync executor; ~0.7 GiB/s (device/2) with read overlap | +| Merge throughput, 16–64 threads, fast disk | ~1.5 GiB/s overall, ~2.5 merge-phase (measured) | ≈ swap | 1.73 GiB/s, disk-bound (measured) | ≥ file; same disk ceiling raised by write elision and, where data compresses, by the lz4 ratio (5.6× measured) on disk traffic | +| RSS under pressure | pins to cgroup cap; 0.40 GiB with lz4+PAGEOUT (measured) | kernel-managed, opaque | working window, ~376 MB at 64 threads (measured) | budget, by construction | +| Worker stall profile | unbounded; 97% sys time single-threaded (measured) | ≈ swap, plus dirty-page writeback at kernel discretion | bounded but eager: every pageout pays serialize+write | ≤ ~1 ms bounded per eviction (sync executor); ~0 (off-worker) | +| Per-chunk metadata | none | none | dominant at scale (measured, see model) | none (free-list) | +| Cold point lookup into sealed state | n/a today (resident); per-touched-4 KiB fault if it paged | per-touched-4 KiB fault | whole-chunk rehydrate minimum | one page read, ~0.1–1 ms (Layer 3) | +| Hydration RSS | 
working set (kernel may lag) | working set, kernel-paced relief | unbounded on the columnar path | budget, with build at device write bandwidth (Layer 3) | + +No dedicated lgalloc benchmarks exist in our record; its column is inferred from sharing the kernel fault path with swap, with the added caveat of file-backed dirty writeback. +Treat it as qualitatively-swap rather than independently measured. + +The "this design" column assumes file extents. +On the swap-backed extent store (see Layer 1), write costs match the measured lz4+`MADV_PAGEOUT` line (compress plus synchronous reclaim of the compressed range), and the read path replaces the `O_DIRECT` `pread` with `MADV_WILLNEED`-prefetched swap-in over 5.6× fewer bytes — kernel-serviced, so the low-thread reclaim ceilings soften the merge-throughput rows toward the swap column while the RSS, stall-bounding, metadata, and elision rows hold. + +### What the estimates assume, and where they could be wrong + +* **Write elision rate.** + The merge-throughput gains assume a meaningful fraction of chunks die unbacked (lazy tier). + In steady-state merging this fraction is high (chains turn over continuously); under eager backing it is zero by definition. + The staging prototype observed the mechanism directly (see Measured, below): early-hydration churn produced spill-cancellation rates of 100–400/s, each an avoided compress-and-pageout. +* **Compression ratio.** + The 5.6× figure is one workload's arrangement data; the disk-ceiling multiplier scales directly with it and drops to 1× on incompressible data. +* **Eviction overhead.** + Batched `MADV_DONTNEED` at 2 MiB granularity is assumed cheap; the vmcache results say page-table work serializes at high eviction rates, and our margin comes from chunk granularity (~500× fewer operations per byte than 4 KiB paging). + A workload that thrashes the budget boundary could still expose this. 
+* **Epoch latency.** + Eviction waits one epoch (one yield cycle) per chunk; a worker stuck in a long-running operator step delays reclaim process-wide. + This is an accounting hazard, not a throughput one, but resident bytes can transiently overshoot the budget while reclaim waits on the epoch. +* **Single-thread sync-executor estimate.** + The ~0.6 GiB/s figure assumes read-process-write serialization per chunk with no overlap; it degrades toward the file backend's numbers if compression runs on-worker (see cost model) and improves toward device/2 with the io_uring middle path. + +### Measured: the staging prototype (June 2026) + +Milestone 2's swap-backed prototype ran on staging under the upsert-v2 source stash — a hydration workload that accumulates backlog while persist catches up — and replaced several estimates above with measurements. + +* **Bounded accumulation holds, at ~3000× past the budget.** + The workload accumulated ~395 GiB of logical stash (70.6 GiB of lz4 extents on the swap device; the 5.6× ratio reproduced on production-shaped data) while pool residency held at the configured 128 MiB floor and the process's anonymous RSS stayed under ~8.5 GiB. + Of that, ~3 GiB is the merge engine's working set, flat as the backlog grew — it scales with chunk size and concurrency, not state size. +* **The kernel never reclaimed.** + Cgroup `pgscan` stayed zero across the run, and the pool VMAs showed `Swap: 0` throughout: the engine evicts ahead of pressure by construction, and slots are only ever discarded, never kernel-paged. +* **Page tables tracked the compressed ledger at ~0.2–0.4%** (82–250 MiB against 20–70 GiB of extents) — the slot-per-resident economics the Address space section argues for, observed. +* **Die-young elision is real and visible.** + Early in hydration the chains are small and merge constantly, so chunks die younger than the spill queue's latency: cancellations ran at 100–400/s — each an avoided write — decaying to zero as chains matured and lifetimes stretched. 
+ The same churn saturated the bounded spill queue and pushed eviction inline onto workers; growable spill-thread counts and churn-aware victim selection (skip the youngest chains) are the identified follow-ups. +* **Eviction must not run on workers unguarded.** + A first cut let every worker attempt budget enforcement concurrently; `madvise` plus lock contention pinned CPUs. + Single-flight enforcement over a resident-only queue plus dedicated spill threads resolved it — for the swap-backed store, where page-table work dominates eviction cost, the off-worker executor is the prototype's de facto answer to the I/O execution question. +* **Extents must be sized to the compressed payload.** + Allocating lz4's worst case inflated swap writeback ~5.6×; at hydration eviction rates this backed up device writeback and bloated the working set with in-flight pages. + Compressing into reused scratch and allocating exact (page-rounded) extents fixed it. +* **Size classes must cover the chunk shapes the ship heuristic actually produces.** + The batcher's capacity heuristic yields bimodal chunks (≈1.8–2.0 and 3.6–4.0 MiB serialized); without 4 and 8 MiB classes the large mode silently fell back to unpageable heap. +* **Boundedness is phase-scoped, and drains are the sharp edge.** + Accumulation is budget-bounded; the drain initially was not: `Batcher::seal` materialized the entire ship side before the drain consumed it, an O(backlog) rehydration observed live at ~80 GiB RSS when persist caught up. + The stash drain now rehydrates sealed chunks one at a time; the arrangement-build path inside `arrange_core` retains the same materialization (see Open questions). +* **Working-set metrics misread this architecture.** + `MADV_PAGEOUT` leaves clean swap-cache copies that the kernel, absent pressure, never drops; cgroup working-set charges them, so dashboards show the ledger as linear "memory growth" that is actually droppable cache. + Anonymous RSS is the honest health signal. 
+ The cache also earns its keep: ~99% of observed fault-ins were served from it as minor faults — an accidental free middle tier between the pool and the device. + +## Minimal viable prototype and milestones + +1. **Extent store under the existing file backend.** + Same `Handle` API; per-chunk files replaced by pooled extents. + Validates: inode-churn elimination (re-run the workload that measured 35.6 s of unlink), `O_DIRECT` alignment plumbing, allocator fragmentation under merge churn. + Exercisable in CI and on dev boxes regardless of production volume topology; production nodes today mount no filesystem (the whole disk is swap), so this backend deploys per cluster class as scratch volumes appear, and milestone 2's initial production backend is the swap-backed store. +2. **Buffer pool under the batcher.** + Size-class regions, state words, write-behind, lifecycle eviction; integrated behind `ColumnPager` so the merge batcher is unchanged above the seam. + Initial production backend is the swap-backed extent store, generalizing the measured lz4+`MADV_PAGEOUT` path; the file backend benches head to head wherever hardware allows. + Validates: RSS bounded by budget under the pager design doc's merge benches; dead-chunk write elision rate; worker threads never in reclaim (`pgscan_direct` flat); throughput at least matching the better of today's two backends at 1, 16, and 64 threads. + Also decides the I/O execution model: run the same benches under the on-worker and off-worker executors, measuring operator-step time inflation from on-worker stalls and cold-merge throughput with and without read overlap. + At this milestone the pager's swap and file backends are deletable — its only two consumers route through this seam. + Node-level swap is unaffected; it remains the backstop for everything outside the pool (see "Incremental migration: coexisting with swap"). +3. 
**Borrow-safety prototype for Layer 3.** + A paged `BatchContainer` for one container type plus an audit (and assertion machinery) that no consumer holds a container borrow across a yield. + If differential's `Chunk` abstraction lands first, this milestone re-targets to a paged `Chunk` implementor and the audit narrows to the inner-cursor fault points (see "Integration with differential's `Chunk` abstraction"). + This is the step most likely to send the design back for revision; do it before committing to the full format. +4. **Paged sealed batches for one spine.** + `ValRowSpine` (upsert feedback) first: single consumer, batched lookups, no interactive peeks. + Validates: bounded-RSS hydration end to end; merge throughput with readahead; cold-seek latency. +5. **General arrangements and peek-aware policy.** + Eager-backing thresholds, per-dataflow priorities, rollout to compute indexes. + +## Incremental migration: coexisting with swap + +"Swap" names two different things, and the migration story differs for each. + +The *pager's swap backend* is an implementation detail with exactly two consumers (the compute batchers and the storage upsert stash), both already behind the `ColumnPager` seam and per-consumer flags. +The pool replaces it consumer by consumer, each flip independent, dyncfg-driven, and reversible; the backend is deletable when the last consumer flips. + +*Node-level swap* is the process-wide backstop under every anonymous allocation: lgalloc-backed columnation arrangements, persist's arrow buffers, operator heap state, allocator headroom. +It must remain provisioned until every source of large allocations has a different mechanism — a long tail that includes projects outside this design's scope, and one this design must coexist with rather than wait for. +Nothing here assumes swap is absent; nothing here breaks when it is present. 
+ +### Coexistence semantics + +Pool-resident pages are ordinary anonymous memory, so under global pressure the kernel may swap them — engine-managed and kernel-managed reclaim overlap, with two consequences. + +The first is wasted swap write-out: the kernel may page out pool-resident chunks the engine would have dropped (dead soon) or written more cheaply (compressed, to an extent). +This is bounded by keeping the pool budget comfortably under the container limit, so kswapd rarely finds pool pages in its reclaim scans; pressure signals (PSI, `pgscan` rates) feed the existing dyncfg machinery to shrink the budget when the rest of the process grows. + +The second would be double I/O on fault-in — a `pread` into pages the kernel swapped out triggers swap-in of data about to be overwritten — but fault-in is overwrite-by-construction, so the engine issues `MADV_DONTNEED` on the destination range first: any swap copy is discarded, the range refills as zero pages, and no swap-in occurs. +The lifecycle-knowledge advantage survives coexistence intact. + +`mlock`ing the pool would partition cleanly — pool memory engine-managed only, everything else kernel-managed — at the cost of an `RLIMIT_MEMLOCK`/capability dependency in the container environment. +It is a hardening option, not a requirement; budget headroom plus pre-fault `MADV_DONTNEED` covers the common case without it. + +Swap also keeps a role this design is glad to have: defense in depth. +A misconfigured budget degrades into kernel paging rather than an OOM kill, and operators retain the existing knob while confidence in the pool's accounting builds. + +### No filesystem yet: deployment starts swap-backed + +Production nodes currently provision the entire disk as swap, so the file extent store has nowhere to live on day one, and the deployment order inverts the layer order: Layers 2 and 3 ship first on the swap-backed extent store, and file extents light up per cluster class as scratch volumes appear. 
+This is less of a detour than it looks. +The swap-backed store is a refactor and generalization of the already-measured lz4+`MADV_PAGEOUT` strategy, not new I/O infrastructure, and everything above the extent interface — the pool, the state machine, the policy, the Layer 3 format — is backend-agnostic by construction. +Switching a cluster from swap-backed to file extents is a configuration change, not a migration; the bytes are recreatable, so the switch does not even need to preserve them. +It also reframes the volume-topology question for operators: provisioning a scratch filesystem becomes a per-cluster-class performance decision (buying the explicit read path and `O_DIRECT` writes) rather than a prerequisite for the architecture. + +### What shrinks, and when + +Each consumer the pool absorbs leaves the swap working set: first the batcher chunks (milestone 2), then sealed columnar batches (milestones 3–5). +Columnation-era arrangements stay on lgalloc until the columnar path subsumes them — a separate project — and persist's buffers and operator heap state have their own timelines. +Swap provisioning shrinks correspondingly, from working-set-sized toward insurance-sized; turning it off is a per-cluster operational decision for when monitoring shows negligible swap traffic, not a milestone of this design. + +## Remote extents: the object-store direction + +The extent store's contract — write compressed bytes, get an opaque handle; explicit, non-faulting reads; free — is already an object-store contract, and nothing in Layers 2–3 assumes the backing device is local. +An S3-backed extent store is therefore an extension point of this design rather than a successor to it. +This section records where the literature sits so the option stays deliberately reachable; none of it is committed work. + +The relevant literature splits into four veins. 
+ +* **Cloud-native database tiering** (Snowflake, NSDI 2020; Socrates/SQL Hyperscale, SIGMOD 2019; Neon) is unanimous on architecture: object storage is never placed directly behind a synchronous miss; a local cache tier — Snowflake's ephemeral SSD file cache, Socrates' RBPEX buffer-pool extension — always sits between. + Translated here: S3 extents would not replace swap or file extents but sit below them, a third rung on the ladder (RAM pool → local extents → object segments). +* **Object-store I/O economics** (AnyBlob — Durner, Leis & Neumann, VLDB 2023) quantifies the device: ~10–30 ms first-byte latency regardless of size, bandwidth effectively free, requests priced. + Saturating throughput takes ~8–16 MiB requests at dozens-to-hundreds of concurrent in-flight GETs under an engine-integrated download scheduler. + Two consequences: compressed extents are far too small to be objects and must coalesce into multi-extent **segments** with an extent → (segment, offset) indirection; and the batched-seek cursor (see Layer 3) stops being an optimization and becomes the API that lets a download manager turn a probe set into few, large, parallel requests. +* **Far memory** (Infiniswap, NSDI 2017; Fastswap; Hermit, NSDI 2023; versus AIFM, OSDI 2020) settles the interface question: transparent page-granular remote paging amplifies unacceptably over high-latency links, and application-integrated, object-granular access with explicit dereference scopes wins decisively. + AIFM's deref scope is this design's pin, one network hop further out — the handle-and-pin discipline adopted in Layer 2 (and unavailable to lgalloc-style always-valid pointers) is precisely what makes a remote backend viable at all. 
+* **Log-structured lifecycle** (LFS and its cleaning literature; RAMCloud's log cleaner; RocksDB-Cloud) names the new problem the tier brings: immutable remote segments turn `free` from a deallocation into garbage accumulation, so live-ratio tracking and segment compaction — machinery the swap backend's plain `dealloc` made unnecessary — arrive with the tier. + +What changes if this ships: the synchronous executor dies (a 10+ ms remote stall is not a sub-millisecond decompress, so the asynchronous prefetch path stops being a measured choice and becomes mandatory), and `free` becomes refcounted segment GC. +What does not change: the pool, the budget, the chunk states, pins, and the extent seam itself. + +Two pragmatics temper the ambition. +First, a zero-code intermediate exists: swapping to network block storage (EBS-class) extends capacity past the local device with no engine changes at all — the kernel does not care that the swap device is network-attached — and should be benched before any of this is built. +Second, Materialize already operates an S3-backed, locally-cached, compacted store of immutable blobs: persist. +Remote extents would reconstruct persist's bottom half (blob interface, cache, GC) for pre-consolidation state, so the load-bearing design question is not how to talk to S3 but whether cold extents become a tenant of persist's blob and cache layers or a deliberately separate mini-store. +The prize that makes the question worth eventually answering is durability of state across process death: today a restart re-ingests the backlog from the source, where re-attaching to remote segments would turn restart into cache warming — the Snowflake/Neon elasticity story, and the reason warm restart sits in Out of scope as a follow-up rather than a non-goal. + +## Prior art + +The design is an application of the modern buffer-manager literature to a workload with unusually favorable properties; the mapping is deliberate and close. 
+ +* **Umbra** (Neumann & Freitag, CIDR 2020, [pdf](https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf)). + Size-class anonymous VM regions, `MADV_DONTNEED` release, variable-size pages so large objects stay contiguous, and the stance that all data structures share one buffer-managed budget — Layers 1–2 adopt all four. + Umbra's versioned latches and pointer swizzling are replaced by pins/epochs over residency-scoped slots, because immutability and yield-bounded borrows make them unnecessary. +* **LeanStore** (Leis et al., ICDE 2018, [pdf](https://db.in.tum.de/~leis/papers/leanstore.pdf); NVMe redesign VLDB 2024). + The governing insight: make the resident path free and pay translation only at the disk boundary. + Its cooling-FIFO replacement policy survives here only as a backstop, because the engine has lifecycle knowledge LeanStore must speculate about. +* **vmcache** (Leis et al., SIGMOD 2023, [pdf](https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/_my_direct_uploads/vmcache.pdf)). + Per-page atomic state words and explicit fault/evict over anonymous memory; Layer 2's chunk model is vmcache minus dirty states. + Its lifetime-stable page addresses, however, are deliberately *not* adopted: vmcache maps virtual memory one-to-one with storage (paying a budgeted ~2 GB of page tables per TB) to serve optimistic restartable readers and pointer graphs, neither of which exists here, and backlog-shaped state inverts the economics — see "Address space and translation". + Its companion measurement — eviction throughput ceilinged by `madvise`/TLB-shootdown costs, fixed there by the exmap kernel module — is the constraint our 2 MiB granularity is designed around. +* **mmap critique** (Crotty et al., CIDR 2022, [pdf](https://db.cs.cmu.edu/papers/2022/cidr2022-p13-crotty.pdf)). + Why kernel-controlled paging (including our swap backend) fails: unschedulable synchronous faults, single-threaded reclaim, TLB shootdowns. 
+ The pager design doc's swap-backend measurements independently reproduce these failure modes. +* **NVMe I/O stack** (Haas & Leis, VLDB 2023, [pdf](https://vldb.org/pvldb/vol16/p2090-haas.pdf)). + `O_DIRECT` plus engine-managed buffering; deep async queues matter at 4 KiB pages, synchronous threads suffice at large transfers. + Grounds the spill-thread choice and the io_uring deferral. +* **LSM design space** (Dong et al., CIDR 2017; Dayan & Idreos, *Dostoevsky*, SIGMOD 2018; Lu et al., *WiscKey*, FAST 2016, [pdf](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf)). + Compaction math and key-value separation. + With durability deleted, an LSM reduces to exactly what a differential spine already is; WiscKey informs the value-separation option, tempered by the consolidation conflict noted above. +* **DuckDB** ([memory management](https://duckdb.org/2024/07/09/memory-management)). + Native-format spill (eviction = unpin, no serialization) and recycled temp-file slots — Layer 1's direct precedent. +* **Flink on RocksDB** ([guide](https://flink.apache.org/2021/01/18/using-rocksdb-state-backend-in-apache-flink-when-and-how/)); **ForSt** (Mei et al., VLDB 2025, [pdf](https://www.vldb.org/pvldb/vol18/p4846-mei.pdf)). + The negative example: a byte-API KV store under a dataflow engine pays serialization on every access and fights two compaction schedulers and two caches. + ForSt's disaggregated architecture (durable state remote, local disk a cache) is the architecture Materialize already has with persist; its async state access is out of scope here. +* **BtrBlocks** (Kuschewski et al., SIGMOD 2023). + Compression applied at the pool/storage boundary with hot data uncompressed in buffer-managed pages; grounds the compression placement and is the natural format conversation for the persist-alignment north star. +* **B-tree alternatives** (LMDB; Wang et al.
on the Bw-tree, SIGMOD 2018; Callaghan's amplification framework; Hao & Chandramouli, *Bf-Tree*, VLDB 2024, [pdf](https://vldb.org/pvldb/vol17/p3442-hao.pdf)). + An update-in-place or COW disk B-tree pays random-write amplification to buy point-read latency that immutable runs plus resident fence-key indexes can serve at one read per cold seek. + Bf-Tree's variable-length mini-pages — simultaneously a record cache and a write buffer — are the precedent for the record-granular caching open question. +* **Hot/cold hybrids and filter sizing** (Zhou et al., *2-Tree*, CIDR 2023; DeBrabant et al., *Anti-Caching*, VLDB 2013; Dayan, Athanassoulis & Idreos, *Monkey*, SIGMOD 2017). + Record-granular hot/cold migration (2-Tree, anti-caching) is the deferred alternative if probe traffic defeats page-granular residency; Monkey's optimal per-level filter allocation applies directly to per-run filters, since spine runs are LSM levels in all but name. + +## Alternatives + +### Embed RocksDB (or any general-purpose KV store) + +Rejected. +Flink's experience is the controlled experiment: per-access serialization across a byte-oriented API, a second compaction scheduler fighting differential's fueled merges, a second cache (block cache) double-buffering against ours, and opaque memory accounting. +Differential already owns sorted-run maintenance; the missing piece is run storage, not a storage engine. + +### File-backed mmap of scratch files (the lgalloc architecture) + +This alternative is not hypothetical: lgalloc is its production-tested instance, and it still backs columnation arrangements today (see Background). +Rejected for new work per Crotty et al., our own swap-backend data, and the operational record: faults are synchronous and unschedulable on worker threads, reclaim is kernel-paced, writeback timing is invisible, and policy has to be approximated from below the allocation boundary with an accumulating set of knobs. 
+This design keeps what lgalloc got right — size-classed regions over a scratch volume — and keeps virtual memory as a translation mechanism (anonymous regions, explicit I/O) while rejecting kernel-controlled paging: the vmcache position. + +### Keep the two-backend pager and optimize it + +Extent pooling (Layer 1) fixes the file backend's inode churn, and `MADV_PAGEOUT` batching could soften swap's reclaim storms, but the blob model itself caps the ceiling: whole-chunk rehydration on every access, residency decided irrevocably at pageout time, no path to paging sealed batches, and two backends each accidentally good at half the workload. +Layer 2 subsumes both backends' strengths in one mechanism. + +### Pure disk-first (every chunk written at seal) + +Rejected as a universal policy, adopted as a per-level policy. +Universal eager backing imposes a write floor proportional to merge traffic — most batcher chunks die in seconds, and differential rewrites each record O(log n) times — which is unaffordable on EBS-class scratch and wasteful everywhere. +The eager/lazy threshold (see Backing policy) keeps disk-first's benefits where they are real: deep, sealed, long-lived state. + +### Optimistic latches instead of epochs + +Umbra's readers validate a version counter and restart on conflict, which permits eviction under active readers. +Differential cursor consumers receive borrows and cannot be restarted, so validation has nowhere to jump back to. +Epochs cost nothing on the read path and only delay eviction by one yield cycle; the flexibility lost is flexibility this workload cannot use. + +## Open questions + +* **Epoch mechanics.** + Per-worker epoch counters advanced at yield, with the evictor taking a min — or a pin-count fallback for any consumer found to hold borrows across yields? + Milestone 3 exists to answer this empirically; the assertion machinery should ship regardless. +* **Owned vs. 
pinned API for the first cut of Layer 2.** + Keeping `take`-style owned rehydration (one memcpy out of the pool) eases migration; pinned borrows are the end state; builders allocating pool memory directly (zero-copy seal) is the end-end state. + Recommend owned-first, with the chunk handle designed so pinning is additive. +* **Budget topology.** + One global pool with per-dataflow priorities, or partitioned pools per cluster replica role? + Peek-serving arrangements need protection from hydration floods either way; the simplest sufficient mechanism is preferred. +* **I/O execution model.** + On-worker synchronous (Umbra's simplicity), dedicated spill threads (LeanStore's page provider), or the middle path of on-worker io_uring readahead with synchronous writes? + Milestone 2 decides on measured operator-step inflation and cold-merge throughput; if threads win, count and placement (per-process vs per-worker, NUMA) follow as second-order questions. +* **Eager-backing threshold.** + Spine level, batch byte size, or batch age? + Level is the principled choice; byte size is the robust one when spines are shallow. +* **Value separation threshold.** + Inline-vs-out-of-line cutoff for large rows, pending the dereference-rate measurement during merges of real upsert state. +* **Record-granular hot caching.** + If sparse cold probes dominate some workloads even with small leaves, filters, and uncompressed lookup tiers, the page is the wrong caching unit for that traffic, and a record cache above the pool (Bf-Tree's mini-pages are the precedent) becomes worth its complexity. + Decide on milestone 4 evidence rather than speculation. +* **Drain-side materialization in `arrange_core`.** + Differential's `Batcher::seal` returns a materialized `Vec` of chunks and `Builder::seal` consumes it whole, so arrangement construction rehydrates an entire sealed run at once — measured on staging as the dominant drain-phase RSS spike once the stash drain went chunk-at-a-time. 
+ Bounding it requires an iterator-shaped seam between batcher and builder in differential's trait pair (or the `Chunk`-based path, where a paged chunk transits by handle and nothing rehydrates); sizing data from the fixed-stash runs decides urgency. +* **Scratch exhaustion.** + When the extent store fills: stop evicting and let RSS grow (current behavior, in effect), or backpressure ingestion? + Needs an answer before eager backing ships, since eager mode writes more. diff --git a/misc/python/materialize/mzcompose/__init__.py b/misc/python/materialize/mzcompose/__init__.py index f50b5d96f5086..fb60815fe0b87 100644 --- a/misc/python/materialize/mzcompose/__init__.py +++ b/misc/python/materialize/mzcompose/__init__.py @@ -500,6 +500,8 @@ def get_default_system_parameters( "column_paged_batcher_budget_fraction", "column_paged_batcher_lz4", "column_paged_batcher_swap_pageout", + "column_paged_batcher_use_pool", + "column_paged_batcher_pool_spill_threads", "enable_upsert_paged_spill", "enable_lgalloc_eager_reclamation", "lgalloc_background_interval", diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 664b62099a591..29c84b67a766c 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -1645,6 +1645,12 @@ def __init__( self.flags_with_values["column_paged_batcher_swap_pageout"] = ( BOOLEAN_FLAG_VALUES ) + self.flags_with_values["column_paged_batcher_use_pool"] = BOOLEAN_FLAG_VALUES + self.flags_with_values["column_paged_batcher_pool_spill_threads"] = [ + "0", + "2", + "4", + ] self.flags_with_values["enable_upsert_paged_spill"] = BOOLEAN_FLAG_VALUES # If you are adding a new config flag in Materialize, consider using it diff --git a/src/compute-types/src/dyncfgs.rs b/src/compute-types/src/dyncfgs.rs index f49cf1ebbee28..85a5c3d0b7659 100644 --- a/src/compute-types/src/dyncfgs.rs +++ b/src/compute-types/src/dyncfgs.rs @@ -75,8 
+75,9 @@ pub const ENABLE_COLUMN_PAGED_BATCHER_SPILL: Config = Config::new( pub const COLUMN_PAGED_BATCHER_BUDGET_FRACTION: Config = Config::new( "column_paged_batcher_budget_fraction", 0.05, - "Fraction of replica memory the column-paged batcher's tiered policy may hold resident \ - before spilling to the backend. Total budget = max(mem_limit * fraction, 128 MiB).", + "Fraction of physical RAM the column-paged batcher may hold resident before spilling \ + (resident budgets derive from RAM, never from announced limits that include swap). \ + Total budget = max(ram * fraction, 128 MiB).", ); /// Compress chunks the column-paged batcher spills, using lz4. Only @@ -115,6 +116,70 @@ pub const COLUMN_PAGED_BATCHER_SWAP_PAGEOUT: Config = Config::new( meaningful when `column_paged_batcher_lz4 = true` and the swap backend is active.", ); +/// Route column-paged batcher spill through the buffer pool +/// (`mz_ore::pool`, swap-backed extents) instead of the tiered pager +/// backends. The pool owns residency: chunks stay resident at stable +/// addresses until its budget (the same fraction-derived total as the +/// tiered policy's) forces compression into swap-backed extents, and +/// chunks consumed before eviction never cost a write at all. The backend +/// and lz4 configs are ignored in pool mode; the pool always compresses at +/// the eviction boundary. Falls back to the tiered path if the pool's +/// virtual reservation fails. +pub const COLUMN_PAGED_BATCHER_USE_POOL: Config = Config::new( + "column_paged_batcher_use_pool", + false, + "Route column-paged batcher spill through the buffer pool (swap-backed extents) instead of \ + the tiered pager backends. Only meaningful when `enable_column_paged_batcher_spill = true`.", +); + +/// Number of buffer-pool spill threads performing eviction I/O (lz4 +/// compression plus the synchronous-reclaim `MADV_PAGEOUT`) off the threads +/// that trip the budget. 
Zero evicts inline on the calling thread, which +/// measurably convoys workers behind eviction I/O at hydration eviction +/// rates. Thread spawning is once per process: raising the value later has +/// no effect beyond re-enabling, and lowering it to zero falls back to +/// inline eviction while spawned threads idle. +pub const COLUMN_PAGED_BATCHER_SPILL_WORKER_COUNT: Config = Config::new( + "column_paged_batcher_spill_worker_count", + 2, + "Buffer-pool spill threads for off-worker eviction I/O; 0 evicts inline on the caller.", +); + +/// Eagerly compress unbacked buffer-pool chunks to `BackedResident` on idle +/// spill threads (write-behind). The chunk stays readable in its slot while +/// a compressed extent accumulates on the swap device, so budget-driven +/// eviction becomes a pure page release instead of a compression. Trades +/// background CPU (compression of chunks that may die before pressure +/// reaches them) for near-free pressure response. +pub const COLUMN_PAGED_BATCHER_EAGER_BACKING: Config = Config::new( + "column_paged_batcher_eager_backing", + false, + "Eagerly compress buffer-pool chunks to compressed-but-resident on idle spill threads, so \ + budget-driven eviction is a pure page release. Only meaningful in pool mode with spill \ + workers.", +); + +/// Ceiling on the buffer pool's total RSS, as a fraction of *physical RAM* +/// (never the announced limit, which includes swap on swap-provisioned +/// nodes). The compressed-but-resident extent tier is the headroom above the +/// slot budget and warm cap: chunks evicted from the budget stay in RAM +/// compressed (~5.6x denser; reads decompress without faulting) until this +/// ceiling forces the oldest extents out to the swap device via +/// `MADV_PAGEOUT`. Zero collapses the tier: extents page out as soon as +/// they are written. 
+/// +/// The default pairs with the 0.05 budget default to leave ~20% of RAM for +/// the compressed tier — the same share zswap's default compressed pool +/// takes, and roughly RAM-sized logical coverage at the measured ~5.6x +/// ratio — while keeping three quarters of RAM for everything else in the +/// process. +pub const COLUMN_PAGED_BATCHER_POOL_RSS_TARGET_FRACTION: Config = Config::new( + "column_paged_batcher_pool_rss_target_fraction", + 0.25, + "Ceiling on the buffer pool's total RSS as a fraction of physical RAM; the headroom above \ + the slot budget holds compressed-but-resident extents. Zero pages extents out immediately.", +); + /// Whether rendering should use `mz_join_core` rather than DD's `JoinCore::join_core`. pub const ENABLE_MZ_JOIN_CORE: Config = Config::new( "enable_mz_join_core", @@ -536,4 +601,8 @@ pub fn all_dyncfgs(configs: ConfigSet) -> ConfigSet { .add(&COLUMN_PAGED_BATCHER_BUDGET_FRACTION) .add(&COLUMN_PAGED_BATCHER_LZ4) .add(&COLUMN_PAGED_BATCHER_SWAP_PAGEOUT) + .add(&COLUMN_PAGED_BATCHER_USE_POOL) + .add(&COLUMN_PAGED_BATCHER_SPILL_WORKER_COUNT) + .add(&COLUMN_PAGED_BATCHER_EAGER_BACKING) + .add(&COLUMN_PAGED_BATCHER_POOL_RSS_TARGET_FRACTION) } diff --git a/src/compute/src/compute_state.rs b/src/compute/src/compute_state.rs index 82e223f58b8d3..2f1144ec22860 100644 --- a/src/compute/src/compute_state.rs +++ b/src/compute/src/compute_state.rs @@ -324,21 +324,33 @@ impl ComputeState { // available, swap otherwise. 
{ use mz_ore::pager::Backend; - use mz_timely_util::column_pager::{Codec, apply_tiered_config}; + use mz_timely_util::column_pager::{ + Codec, PoolPagerConfig, apply_pool_config, apply_tiered_config, + }; let enabled = ENABLE_COLUMN_PAGED_BATCHER_SPILL.get(config); + let use_pool = COLUMN_PAGED_BATCHER_USE_POOL.get(config); + let spill_threads = COLUMN_PAGED_BATCHER_SPILL_WORKER_COUNT.get(config); + let eager_backing = COLUMN_PAGED_BATCHER_EAGER_BACKING.get(config); let codec = COLUMN_PAGED_BATCHER_LZ4.get(config).then_some(Codec::Lz4); let swap_pageout = COLUMN_PAGED_BATCHER_SWAP_PAGEOUT.get(config); - // Budget derivation: fraction × announced memory limit, with a - // 128 MiB floor so the no-pressure case doesn't page per chunk. - // Falls back to a 4 GiB assumption if no limit was announced - // (e.g. dev environments). + // Budget derivation: fraction × physical RAM, with a 128 MiB + // floor so the no-pressure case doesn't page per chunk. Resident + // budgets derive from RAM — never from the announced memory + // limit, which on swap-provisioned nodes deliberately includes + // swap for the memory limiter's purposes. Falls back to a 4 GiB + // assumption if detection fails. The pool and tiered paths share + // the derivation: the budget bounds resident bytes either way. 
const MIB: usize = 1024 * 1024; - const DEFAULT_MEM_LIMIT: usize = 4 * 1024 * MIB; - let mem_limit = crate::memory_limiter::get_memory_limit().unwrap_or(DEFAULT_MEM_LIMIT); + const DEFAULT_RAM: usize = 4 * 1024 * MIB; + let ram = mz_ore::memory::physical_memory_bytes().unwrap_or(DEFAULT_RAM); let fraction = COLUMN_PAGED_BATCHER_BUDGET_FRACTION.get(config).max(0.0); - let total = usize::cast_lossy(f64::cast_lossy(mem_limit) * fraction).max(128 * MIB); + let total = usize::cast_lossy(f64::cast_lossy(ram) * fraction).max(128 * MIB); + let target_fraction = COLUMN_PAGED_BATCHER_POOL_RSS_TARGET_FRACTION + .get(config) + .max(0.0); + let rss_target = usize::cast_lossy(f64::cast_lossy(ram) * target_fraction); let backend = if self.context.scratch_directory.is_some() { Backend::File @@ -346,17 +358,50 @@ impl ComputeState { Backend::Swap }; - debug!( + let pool_config = PoolPagerConfig { enabled, - ?backend, - ?codec, - swap_pageout, - fraction, - mem_limit, - budget_bytes = total, - "column-paged batcher: applying tiered config", - ); - apply_tiered_config(enabled, total, backend, codec, swap_pageout); + budget_bytes: total, + spill_threads, + eager_backing, + rss_target_bytes: rss_target, + }; + if use_pool && apply_pool_config(pool_config) { + // Keep the tiered singleton configured even though the pool + // is the installed mechanism: consumers that captured a + // tiered pager (boot-time config ordering between the + // compute and storage protocols is unconstrained) must see + // the operator's budget and codec, not the singleton's + // zero-budget, codec-less boot state. 
+ mz_timely_util::column_pager::tiered_policy().reconfigure(total, backend, codec); + info!( + enabled, + fraction, + ram, + budget_bytes = total, + spill_threads, + eager_backing, + rss_target_bytes = rss_target, + "column-paged batcher: applying pool config", + ); + } else { + if use_pool { + warn!( + "column-paged batcher: buffer pool unavailable; \ + falling back to tiered config", + ); + } + info!( + enabled, + ?backend, + ?codec, + swap_pageout, + fraction, + ram, + budget_bytes = total, + "column-paged batcher: applying tiered config", + ); + apply_tiered_config(enabled, total, backend, codec, swap_pageout); + } } // Remember the maintenance interval locally to avoid reading it from the config set on diff --git a/src/compute/src/server.rs b/src/compute/src/server.rs index b263db122cd3d..071709c22a6aa 100644 --- a/src/compute/src/server.rs +++ b/src/compute/src/server.rs @@ -101,10 +101,7 @@ pub async fn serve( assert_eq!(storage_log_readers.len(), workers_per_process); storage_log_readers.into_iter().map(Some).collect() }; - mz_timely_util::column_pager::metrics::register( - metrics_registry, - mz_timely_util::column_pager::tiered_policy(), - ); + mz_timely_util::column_pager::metrics::register(metrics_registry); let config = Config { persist_clients, diff --git a/src/compute/src/sink/correction_v2.rs b/src/compute/src/sink/correction_v2.rs index 3dbc39c2fa076..7a4b4b36ee5c0 100644 --- a/src/compute/src/sink/correction_v2.rs +++ b/src/compute/src/sink/correction_v2.rs @@ -2038,32 +2038,16 @@ mod tests { ); } - /// A [`PagingPolicy`] that always spills to the swap backend, uncompressed. - /// - /// The default global pager keeps every chunk resident; installing this drives the actual - /// spill path so the tests exercise [`Chunk::column`]'s page-in through [`mz_ore::pager`]. 
- /// - /// [`PagingPolicy`]: column_pager::PagingPolicy - struct ForceSwap; - - impl column_pager::PagingPolicy for ForceSwap { - fn decide(&self, _hint: column_pager::PageHint) -> column_pager::PageDecision { - column_pager::PageDecision::Page { - backend: mz_ore::pager::Backend::Swap, - codec: None, - } - } - fn record(&self, _event: column_pager::PageEvent) {} - } - - /// Install a global pager that spills every chunk to swap for the duration of `f`, then - /// restore the default (disabled) pager. The global pager is process-wide; concurrent tests - /// only ever observe a correct round-trip regardless of backend, so racing on it is benign. + /// Configure the global pager to spill every chunk to swap (uncompressed) for the duration + /// of `f`, then restore the default (disabled) pager. A zero tiered budget makes every + /// `decide` answer `Page`, driving the actual spill path so the tests exercise + /// [`Chunk::column`]'s page-in through [`mz_ore::pager`]. The pager configuration is + /// process-wide; concurrent tests only ever observe a correct round-trip regardless of + /// backend, so racing on it is benign. 
fn with_swap_pager(f: impl FnOnce() -> R) -> R { - use std::sync::Arc; - column_pager::set_global_pager(column_pager::ColumnPager::new(Arc::new(ForceSwap))); + column_pager::apply_tiered_config(true, 0, mz_ore::pager::Backend::Swap, None, false); let result = f(); - column_pager::set_global_pager(column_pager::ColumnPager::disabled()); + column_pager::apply_tiered_config(false, 0, mz_ore::pager::Backend::Swap, None, false); result } diff --git a/src/ore/Cargo.toml b/src/ore/Cargo.toml index c23b2cfe46542..8e4fa25c55405 100644 --- a/src/ore/Cargo.toml +++ b/src/ore/Cargo.toml @@ -37,6 +37,7 @@ ipnet.workspace = true itertools.workspace = true lgalloc = { workspace = true, optional = true } libc = { workspace = true, optional = true } +lz4_flex = { workspace = true, optional = true } mz-ore-proc = { path = "../ore-proc", default-features = false } num.workspace = true num-traits = { workspace = true, optional = true } @@ -146,7 +147,7 @@ assert-no-tracing = ["ctor"] assert = ["assert-no-tracing", "ctor", "tracing"] proptest = ["dep:proptest", "proptest-derive"] overflowing = ["assert"] -pager = ["dep:bytemuck", "libc", "rand", "dep:tracing"] +pager = ["dep:bytemuck", "libc", "rand", "dep:tracing", "dep:lz4_flex"] [[test]] name = "future" diff --git a/src/ore/src/lib.rs b/src/ore/src/lib.rs index 6c879d27160ac..6fed3fe198d76 100644 --- a/src/ore/src/lib.rs +++ b/src/ore/src/lib.rs @@ -50,6 +50,7 @@ pub mod hint; pub mod id_gen; pub mod iter; pub mod lex; +pub mod memory; #[cfg_attr(nightly_doc_features, doc(cfg(feature = "metrics")))] #[cfg(feature = "metrics")] pub mod metrics; @@ -70,6 +71,9 @@ pub mod pager; pub mod panic; pub mod path; pub mod permutations; +#[cfg_attr(nightly_doc_features, doc(cfg(all(feature = "pager", unix))))] +#[cfg(all(feature = "pager", unix))] +pub mod pool; #[cfg(feature = "process")] pub mod process; #[cfg(feature = "region")] diff --git a/src/ore/src/memory.rs b/src/ore/src/memory.rs new file mode 100644 index 
/// Returns the physical memory available to this process in bytes: the
/// host's RAM, clamped by the cgroup (v2) memory limit when one is set.
/// If only one of the two can be determined, that one is reported; `None`
/// if neither can.
///
/// Deliberately distinct from any *announced* memory limit: on nodes whose
/// disk is provisioned as swap, the announced limit includes swap so the
/// memory limiter can bound total heap. Budgets that bound *resident* bytes
/// must instead derive from memory that can be resident, which is what this
/// reports.
pub fn physical_memory_bytes() -> Option<usize> {
    clamp_to_limit(host_memory_bytes(), cgroup_memory_max())
}

/// Combines the host RAM size with an optional cgroup limit: the smaller of
/// the two when both are known, otherwise whichever one is known.
fn clamp_to_limit(host: Option<usize>, limit: Option<usize>) -> Option<usize> {
    match (host, limit) {
        (Some(host), Some(limit)) => Some(host.min(limit)),
        (host, limit) => host.or(limit),
    }
}

/// Extracts `MemTotal` from the text of `/proc/meminfo` and converts it from
/// KiB to bytes. `None` if the line is missing, malformed, or the byte count
/// does not fit in `usize`.
fn parse_meminfo_total(meminfo: &str) -> Option<usize> {
    let rest = meminfo
        .lines()
        .find_map(|line| line.strip_prefix("MemTotal:"))?;
    let kib: usize = rest.split_whitespace().next()?.parse().ok()?;
    // Checked: a wrapped product would silently report a tiny memory size.
    kib.checked_mul(1024)
}

/// Parses the content of a cgroup v2 `memory.max` file: a byte count, or the
/// literal `max` (no limit), which like any other non-numeric content yields
/// `None`.
fn parse_cgroup_limit(raw: &str) -> Option<usize> {
    raw.trim().parse().ok()
}

#[cfg(target_os = "linux")]
fn host_memory_bytes() -> Option<usize> {
    let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?;
    parse_meminfo_total(&meminfo)
}

// NOTE(review): `libc` is declared as an optional dependency of this crate
// while this module is compiled unconditionally; confirm that macOS builds
// always enable it.
#[cfg(target_os = "macos")]
fn host_memory_bytes() -> Option<usize> {
    let mut size: u64 = 0;
    let mut len = std::mem::size_of::<u64>();
    // SAFETY: `sysctlbyname` reads into an out-buffer of the size we report;
    // `hw.memsize` is a `u64` and `len` matches.
    let ret = unsafe {
        libc::sysctlbyname(
            c"hw.memsize".as_ptr(),
            std::ptr::from_mut(&mut size).cast::<libc::c_void>(),
            &mut len,
            std::ptr::null_mut(),
            0,
        )
    };
    if ret == 0 {
        usize::try_from(size).ok()
    } else {
        None
    }
}

#[cfg(not(any(target_os = "linux", target_os = "macos")))]
fn host_memory_bytes() -> Option<usize> {
    None
}

/// The cgroup v2 memory limit, if this process runs under one. The file
/// holds `max` (no limit) or a byte count; non-numeric content yields `None`.
#[cfg(target_os = "linux")]
fn cgroup_memory_max() -> Option<usize> {
    let raw = std::fs::read_to_string("/sys/fs/cgroup/memory.max").ok()?;
    parse_cgroup_limit(&raw)
}

#[cfg(not(target_os = "linux"))]
fn cgroup_memory_max() -> Option<usize> {
    None
}
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Prototype buffer pool for dataflow state. See +//! `doc/developer/design/20260610_buffer_managed_state.md`. +//! +//! The pool is the cache: size-class anonymous virtual-memory regions whose +//! slots hold resident chunks. Slots are scoped to residency — eviction +//! returns a chunk's slot to the free list along with its physical pages, and +//! fault-in allocates a fresh one — so slot demand tracks the resident set +//! (bounded by the budget), not the potentially unbounded live backlog, and +//! a chunk's address is stable only between a fault-in and the next eviction. +//! Pointers into a chunk are valid only under a [`PinGuard`], which blocks +//! eviction; nothing may cache a pointer across pins. The backing is the +//! swap-backed extent store of the design's Layer 1: a page-aligned anonymous +//! allocation holding the chunk's lz4-compressed bytes, pushed to the swap +//! device with `MADV_PAGEOUT`. +//! +//! Residency is a state, not a type. Fault-in is synchronous on the pinning +//! caller (the design's `Faulting` transition collapses into the call), while +//! eviction I/O runs on spill threads when enabled — `WriteInFlight` marks a +//! chunk whose compression a spill thread owns — and inline on the evicting +//! caller otherwise. Chunks are immutable after [`Pool::insert`], which is +//! 
/// Tuning knobs for a [`Pool`].
#[derive(Clone, Copy, Debug)]
pub struct PoolConfig {
    /// Budget for resident bytes. [`Pool::enforce_budget`] keeps evicting
    /// until the uncompressed bytes of resident chunks are at or below it.
    pub budget_bytes: usize,
    /// Virtual address space reserved for each size class. The reservation
    /// is purely virtual: physical memory appears only for slots in use, and
    /// slots are scoped to residency, so the value has to exceed the largest
    /// plausible *resident* set per class (the budget plus pinned and
    /// in-flight slack), not the backlog. The default is deliberately
    /// enormous, since address space costs nothing and touched pages are
    /// bounded by peak residency, so that no realistic budget on any
    /// machine size reaches the heap-fallback path.
    pub class_capacity_bytes: usize,
}

impl Default for PoolConfig {
    /// A 256 MiB resident budget with a 1 TiB virtual reservation per class.
    fn default() -> Self {
        const MIB: usize = 1 << 20;
        const TIB: usize = 1 << 40;
        Self {
            budget_bytes: 256 * MIB,
            class_capacity_bytes: TIB,
        }
    }
}

/// Where a chunk's bytes currently live.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Residency {
    /// In the pool only, with no extent copy. Freeing the chunk never
    /// touches the backing store.
    UnbackedResident,
    /// In the pool, with an identical copy in an extent; eviction releases
    /// the physical pages and performs no I/O.
    BackedResident,
    /// In the pool and readable, while compression into an extent is
    /// scheduled on a spill thread. On completion the chunk becomes
    /// [`Residency::Evicted`], or [`Residency::BackedResident`] if pins
    /// appeared in the meantime; a free or a pin observed at dequeue
    /// cancels the write instead.
    WriteInFlight,
    /// In an extent only; the chunk holds no slot. Access faults it back
    /// into a freshly allocated slot, so its address may differ from the
    /// one it had when last resident.
    Evicted,
    /// Too large for the largest size class; kept as a plain heap
    /// allocation that is always resident. A limitation of the prototype
    /// rather than a state of the design.
    Oversize,
}

/// Point-in-time copy of the pool's counters.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct PoolStats {
    /// Number of chunks inserted.
    pub inserts: u64,
    /// Number of chunks freed, by dropping the handle or by
    /// [`ChunkHandle::take`].
    pub frees: u64,
    /// Backing writes that never happened: chunks freed while still
    /// `UnbackedResident`, before any compression or extent write.
    pub writes_elided: u64,
    /// Evictions that had to compress the chunk into a new extent.
    pub evictions_compress: u64,
    /// Evictions of `BackedResident` chunks, which only release pages and
    /// do no I/O.
    pub evictions_cheap: u64,
    /// Number of `Evicted` chunks faulted back in.
    pub faults: u64,
    /// Total compressed bytes written into extents.
    pub extent_bytes_written: u64,
    /// Evictions passed to spill threads.
    pub spill_scheduled: u64,
    /// Scheduled evictions abandoned before compression because the chunk
    /// was freed or pinned by the time it was dequeued.
    pub spill_cancelled: u64,
    /// Entries that spill threads have queued or are processing right now.
    pub spill_in_flight: u64,
    /// Inserts placed on the heap because their size class had no free slot
    /// (the live set outgrew the class reservation). Such chunks behave
    /// like oversize ones: always resident, never paged.
    pub slot_exhausted_fallbacks: u64,
    /// Live size-classed chunks over all classes, in any residency state.
    /// For backlog-shaped consumers this follows the un-drained backlog, in
    /// chunks. (Slots are scoped to residency, so what exhausts a class
    /// reservation is the resident subset, which the budget bounds.)
    pub live_chunks: u64,
    /// Uncompressed bytes of the chunks that are resident now, oversize
    /// chunks included.
    pub resident_bytes: u64,
    /// Uncompressed bytes of the live oversize chunks.
    pub oversize_bytes: u64,
    /// Class bytes of free slots that are kept warm, meaning their pages
    /// stay resident so reuse does not fault. Capped at a fraction of the
    /// budget; RSS can exceed `resident_bytes` by up to this much.
    pub warm_bytes: u64,
    /// Slot allocations satisfied from the warm list, which faulted no
    /// pages and avoided the kernel's page zeroing.
    pub warm_reuses: u64,
    /// Chunks that idle spill threads compressed ahead of pressure to
    /// `BackedResident` (write-behind). They remain readable in their
    /// slots and their eviction is already paid for.
    pub eager_backs: u64,
    /// Allocation bytes of the compressed extents that are resident now:
    /// the compressed-but-resident middle tier. The RSS target bounds it;
    /// going over pages the oldest extents out to the swap device.
    pub extent_resident_bytes: u64,
    /// Extents that RSS-target enforcement pushed to the swap device.
    pub extent_pageouts: u64,
}

/// Atomic cells named after fields of [`PoolStats`]. `spill_in_flight` and
/// `live_chunks` have no cell here.
// NOTE(review): presumably these are the live counters that a `PoolStats`
// snapshot is read from; confirm against the snapshot code.
#[derive(Debug, Default)]
struct Counters {
    inserts: AtomicU64,
    spill_scheduled: AtomicU64,
    spill_cancelled: AtomicU64,
    slot_exhausted_fallbacks: AtomicU64,
    frees: AtomicU64,
    writes_elided: AtomicU64,
    evictions_compress: AtomicU64,
    evictions_cheap: AtomicU64,
    faults: AtomicU64,
    extent_bytes_written: AtomicU64,
    resident_bytes: AtomicU64,
    oversize_bytes: AtomicU64,
    warm_bytes: AtomicU64,
    warm_reuses: AtomicU64,
    eager_backs: AtomicU64,
    extent_resident_bytes: AtomicU64,
    extent_pageouts: AtomicU64,
}
+ extent_queue: Mutex>>, + /// Number of live size-classed chunks (whatever their residency), which + /// is the number of non-stale queue entries; [`PoolInner::prune_queue`] + /// compacts the queue against it. + live_chunks: AtomicU64, + /// Single-flight claim for budget enforcement. + enforcing: Mutex<()>, + counters: Counters, + spill: Spill, +} + +/// Hand-off point between budget enforcement and spill threads. Eviction I/O +/// (compression and the synchronous-reclaim `pageout`) runs on spill threads +/// when enabled, keeping multi-millisecond work off the threads that trip the +/// budget; with no spill threads, eviction runs inline on the caller. +#[derive(Debug, Default)] +struct Spill { + /// Chunks in `WriteInFlight`, awaiting a spill thread. + queue: Mutex>>, + cv: std::sync::Condvar, + /// Whether evictions are handed to spill threads. Set when threads are + /// first spawned; cleared to fall back to inline eviction. + enabled: std::sync::atomic::AtomicBool, + /// Whether idle spill threads eagerly compress unbacked chunks to + /// `BackedResident` (write-behind): the chunk stays readable in its slot + /// while a compressed extent accumulates on the swap device, so a later + /// budget-driven eviction is a pure page release instead of a + /// compression. Costs CPU on chunks that die before eviction would have + /// reached them; pays at every pressure event. + eager: std::sync::atomic::AtomicBool, + /// Number of spill threads spawned (spawn-once; later config changes + /// only toggle `enabled`). + threads: AtomicU64, + /// Queued plus currently-processing entries; `quiesce` waits on zero. + in_flight: AtomicU64, +} + +/// Beyond this many queued spill entries, eviction degrades to inline on the +/// caller: bounded memory overshoot under burst beats an unbounded queue of +/// still-resident chunks. +const SPILL_QUEUE_MAX: usize = 64; + +/// What a spill thread does with a chunk once compressed. 
#[derive(Clone, Copy, PartialEq, Eq)]
enum SpillKind {
    /// Budget-driven: release the slot, leaving the chunk `Evicted`. Pins
    /// observed at dequeue cancel the work — a chunk being read is
    /// demonstrably hot and should not be evicted.
    Evict,
    /// Eager write-behind: keep the slot, leaving the chunk
    /// `BackedResident`. Pins are irrelevant — concurrent reads of the
    /// immutable slot coexist with compression, and the slot stays put.
    Back,
}

/// Location of a chunk's pool slot.
#[derive(Debug, Clone, Copy)]
struct Slot {
    /// Index into [`SIZE_CLASSES`] and `PoolInner::regions`.
    class: usize,
    /// Slot index within the region.
    index: u32,
}

/// Shared, immutable description of one chunk plus its lock-protected
/// mutable state. Owned by the [`ChunkHandle`]; the pool's queues refer to
/// it through `Weak`/`Arc` clones.
#[derive(Debug)]
struct ChunkMeta {
    /// The pool this chunk belongs to.
    pool: Arc<PoolInner>,
    /// Length in `u64` words; immutable.
    len: usize,
    /// Size class for slot allocations; `None` for empty chunks and payloads
    /// beyond the largest class. Immutable: the chunk's *slot* comes and goes
    /// with residency, but it is always drawn from this class.
    class: Option<usize>,
    /// Residency, pins and backing storage; see [`ChunkState`].
    state: Mutex<ChunkState>,
}

/// Mutable per-chunk state, guarded by `ChunkMeta::state`.
#[derive(Debug)]
struct ChunkState {
    residency: Residency,
    pins: u32,
    /// Second-chance bit, set on pin and cleared (in lieu of eviction) when
    /// the budget enforcer first visits the chunk.
    touched: bool,
    /// Set when the owning handle is dropped, so a queue entry upgraded
    /// concurrently with the free cannot touch a recycled slot.
    freed: bool,
    /// Whether the chunk currently has an entry in the eviction queue. The
    /// queue holds resident chunks only: entries are dropped when a visit
    /// finds the chunk evicted, and fault-in re-enqueues. The flag is queue
    /// hygiene, not a safety invariant — duplicate entries would be benign
    /// (visits are idempotent); it exists so fault-hot chunks cannot grow
    /// the queue without bound between enforcement passes.
    queued: bool,
    /// The chunk's slot, held exactly while the chunk occupies pool memory
    /// (the resident states and `WriteInFlight`). Eviction returns the slot
    /// to the region free list; fault-in allocates a fresh one, so a chunk's
    /// address is stable only between a fault-in and the next eviction.
    /// Pointers into the slot are valid only under a pin, which blocks
    /// eviction.
    slot: Option<Slot>,
    /// The backing copy; present exactly in the `BackedResident` and
    /// `Evicted` states.
    extent: Option<SwapExtent>,
    /// The payload of an `Oversize` chunk.
    oversize: Option<Vec<u64>>,
}

impl ChunkMeta {
    /// Payload length in bytes (`len` counts `u64` words).
    fn len_bytes(&self) -> usize {
        self.len * std::mem::size_of::<u64>()
    }
}

/// Handle to one immutable chunk in a [`Pool`]. Dropping the handle frees the
/// chunk: the slot (if resident) returns to the region free list with its
/// physical pages released, and the extent (if any) is deallocated,
/// discarding any swapped copy for free. Releasing the pages keeps RSS
/// aligned with the `resident_bytes` gauge the budget enforcer trusts;
/// without it, freed slots would hold warm pages the enforcer cannot see.
#[derive(Debug)]
pub struct ChunkHandle {
    meta: Arc<ChunkMeta>,
}

/// Pins a chunk resident for the guard's lifetime; derefs to the chunk's
/// contents. Pinned chunks are never evicted.
#[derive(Debug)]
pub struct PinGuard<'a> {
    meta: &'a ChunkMeta,
    ptr: *const u64,
    len: usize,
}

impl Pool {
    /// Creates a pool, reserving one virtual region per size class.
+ pub fn new(cfg: PoolConfig) -> std::io::Result { + let regions = SIZE_CLASSES + .iter() + .map(|&class_size| Region::new(class_size, cfg.class_capacity_bytes)) + .collect::>>()?; + Ok(Pool(Arc::new(PoolInner { + budget_bytes: AtomicU64::new(u64::cast_from(cfg.budget_bytes)), + rss_target_bytes: AtomicU64::new(0), + regions, + queue: Mutex::new(VecDeque::new()), + extent_queue: Mutex::new(VecDeque::new()), + live_chunks: AtomicU64::new(0), + enforcing: Mutex::new(()), + counters: Counters::default(), + spill: Spill::default(), + }))) + } + + /// Copies `data` into a pool slot of the smallest class that fits and + /// clears `data`, preserving its capacity. The returned handle starts + /// `UnbackedResident`. Empty input returns a length-0 handle holding no + /// slot; input larger than the largest class falls back to a plain heap + /// allocation ([`Residency::Oversize`]), a prototype limitation. + pub fn insert(&self, data: &mut Vec) -> ChunkHandle { + let handle = self.insert_with(data.len(), |dst| dst.copy_from_slice(data.as_slice())); + data.clear(); + handle + } + + /// Allocates a chunk of `len` words and fills it in place: `fill` + /// receives the chunk's slot memory directly and must overwrite all of + /// it (the slot's prior contents are unspecified). Payloads beyond the + /// largest size class fall back to a heap allocation, as in + /// [`Pool::insert`]. + /// + /// This is the zero-staging insert: serialization can write its single + /// copy straight into pool memory, paying one page population instead of + /// staging through caller-side buffers that fault their own pages and + /// die immediately after. 
+ pub fn insert_with(&self, len: usize, fill: impl FnOnce(&mut [u64])) -> ChunkHandle { + let inner = &self.0; + inner.counters.inserts.fetch_add(1, Ordering::Relaxed); + let len_bytes = len * std::mem::size_of::(); + if len == 0 { + fill(&mut []); + return ChunkHandle { + meta: Arc::new(ChunkMeta { + pool: Arc::clone(inner), + len: 0, + class: None, + state: Mutex::new(ChunkState { + residency: Residency::UnbackedResident, + pins: 0, + touched: false, + freed: false, + queued: false, + slot: None, + extent: None, + oversize: None, + }), + }), + }; + } + let class = SIZE_CLASSES.iter().position(|&c| c >= len_bytes); + // A class with no free slot degrades to the heap path below: the + // resident set outgrew the class reservation, and an unpageable chunk + // beats a dead replica. Warn once; the fallback counter tracks scale. + let slot = class.and_then(|class| { + let (index, warm) = inner.regions[class].alloc()?; + if warm { + inner.note_warm_reuse(inner.regions[class].class_size()); + } + Some(Slot { class, index }) + }); + if slot.is_none() && class.is_some() { + inner.note_slot_exhausted(len_bytes); + } + let meta = match slot { + Some(slot) => { + let region = &inner.regions[slot.class]; + // SAFETY: the freshly allocated slot is at least `len_bytes` + // long (the class fits the payload) and is exclusively owned + // by this not-yet-shared chunk, so the mutable borrow is + // unique; region memory is mapped and writable, and `u64` has + // no validity requirements beyond size, so exposing the + // unspecified prior contents through `&mut [u64]` is sound. 
+ let dst = unsafe { + std::slice::from_raw_parts_mut(region.slot_ptr(slot.index).cast::(), len) + }; + fill(dst); + inner + .counters + .resident_bytes + .fetch_add(u64::cast_from(len_bytes), Ordering::Relaxed); + ChunkMeta { + pool: Arc::clone(inner), + len, + class, + state: Mutex::new(ChunkState { + residency: Residency::UnbackedResident, + pins: 0, + touched: false, + freed: false, + queued: true, + slot: Some(slot), + extent: None, + oversize: None, + }), + } + } + None => { + let mut payload = vec![0u64; len]; + fill(&mut payload); + inner + .counters + .resident_bytes + .fetch_add(u64::cast_from(len_bytes), Ordering::Relaxed); + inner + .counters + .oversize_bytes + .fetch_add(u64::cast_from(len_bytes), Ordering::Relaxed); + ChunkMeta { + pool: Arc::clone(inner), + len, + class: None, + state: Mutex::new(ChunkState { + residency: Residency::Oversize, + pins: 0, + touched: false, + freed: false, + queued: false, + slot: None, + extent: None, + oversize: Some(payload), + }), + } + } + }; + let meta = Arc::new(meta); + if meta.class.is_some() { + inner.live_chunks.fetch_add(1, Ordering::Relaxed); + inner + .queue + .lock() + .expect("pool queue poisoned") + .push_back(Arc::downgrade(&meta)); + } + inner.enforce_budget(); + ChunkHandle { meta } + } + + /// Snapshot of the pool's counters. 
+ pub fn stats(&self) -> PoolStats { + let c = &self.0.counters; + PoolStats { + inserts: c.inserts.load(Ordering::Relaxed), + frees: c.frees.load(Ordering::Relaxed), + writes_elided: c.writes_elided.load(Ordering::Relaxed), + evictions_compress: c.evictions_compress.load(Ordering::Relaxed), + evictions_cheap: c.evictions_cheap.load(Ordering::Relaxed), + faults: c.faults.load(Ordering::Relaxed), + extent_bytes_written: c.extent_bytes_written.load(Ordering::Relaxed), + resident_bytes: c.resident_bytes.load(Ordering::Relaxed), + oversize_bytes: c.oversize_bytes.load(Ordering::Relaxed), + warm_bytes: c.warm_bytes.load(Ordering::Relaxed), + warm_reuses: c.warm_reuses.load(Ordering::Relaxed), + eager_backs: c.eager_backs.load(Ordering::Relaxed), + extent_resident_bytes: c.extent_resident_bytes.load(Ordering::Relaxed), + extent_pageouts: c.extent_pageouts.load(Ordering::Relaxed), + spill_scheduled: c.spill_scheduled.load(Ordering::Relaxed), + spill_cancelled: c.spill_cancelled.load(Ordering::Relaxed), + spill_in_flight: self.0.spill.in_flight.load(Ordering::Relaxed), + slot_exhausted_fallbacks: c.slot_exhausted_fallbacks.load(Ordering::Relaxed), + live_chunks: self.0.live_chunks.load(Ordering::Relaxed), + } + } + + /// Enables or disables off-worker eviction I/O. The first call with + /// `threads > 0` spawns that many spill threads (spawn-once: later calls + /// only toggle participation); `threads == 0` falls back to inline + /// eviction on the caller for subsequent victims, letting any queued + /// work drain. 
+ pub fn set_spill_threads(&self, threads: usize) { + if threads == 0 { + self.0.spill.enabled.store(false, Ordering::Relaxed); + return; + } + let spawned = self.0.spill.threads.load(Ordering::Relaxed); + if spawned == 0 { + let to_spawn = u64::cast_from(threads); + if self + .0 + .spill + .threads + .compare_exchange(0, to_spawn, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { + for i in 0..threads { + let inner = Arc::clone(&self.0); + std::thread::Builder::new() + .name(format!("pool-spill-{i}")) + .spawn(move || inner.spill_worker()) + .expect("spawn pool spill thread"); + } + } + } + self.0.spill.enabled.store(true, Ordering::Relaxed); + } + + /// Enables or disables eager backing: when on, idle spill threads + /// compress unbacked chunks to `BackedResident` ahead of pressure, so + /// budget-driven eviction becomes a pure page release. Only meaningful + /// with spill threads spawned. + pub fn set_eager_backing(&self, eager: bool) { + self.0.spill.eager.store(eager, Ordering::Relaxed); + if eager { + self.0.spill.cv.notify_all(); + } + } + + /// Test hook: performs one eager-backing step on the calling thread. + /// Returns whether progress was made. + #[doc(hidden)] + pub fn back_step(&self) -> bool { + self.0.back_one() + } + + /// Test hook: waits until the spill queue is empty and no entry is being + /// processed, so tests observe deterministic post-eviction states. + #[doc(hidden)] + pub fn quiesce_spill(&self) { + while self.0.spill.in_flight.load(Ordering::Relaxed) > 0 { + std::thread::yield_now(); + } + } + + /// Test hook: enables spill scheduling without spawning threads, so tests + /// drive the queue deterministically via [`Pool::spill_step`]. + #[doc(hidden)] + pub fn enable_spill_without_threads(&self) { + self.0.spill.enabled.store(true, Ordering::Relaxed); + } + + /// Test hook: processes one queued spill entry on the calling thread. + /// Returns whether an entry was processed. 
+ #[doc(hidden)] + pub fn spill_step(&self) -> bool { + let popped = self + .0 + .spill + .queue + .lock() + .expect("spill queue poisoned") + .pop_front(); + let Some(meta) = popped else { + return false; + }; + self.0.spill_process(&meta, SpillKind::Evict); + self.0.spill.in_flight.fetch_sub(1, Ordering::Relaxed); + true + } + + /// Evicts cold chunks until resident bytes fall to the budget or every + /// queued chunk has been visited once. Runs automatically on every insert + /// and fault-in; explicit calls are for tests and pressure hooks. + pub fn enforce_budget(&self) { + self.0.enforce_budget(); + } + + /// Retunes the resident-bytes budget in place and enforces it. Live + /// handles share the new value immediately through their `Arc`; + /// a shrink takes effect by evicting on this call, a grow simply leaves + /// more headroom for future inserts and fault-ins. + pub fn set_budget(&self, budget_bytes: usize) { + let new = u64::cast_from(budget_bytes); + let prev = self.0.budget_bytes.swap(new, Ordering::Relaxed); + // Config application calls this per worker per tick; only a change + // warrants an enforcement pass (a grow needs none, and inserts and + // fault-ins enforce continuously anyway). + if new < prev { + self.0.enforce_budget(); + } + } + + /// Retunes the ceiling on the pool's total RSS — slots plus warm slots + /// plus compressed-resident extents. The compressed tier's capacity is + /// the gap above the budget and warm cap; zero (the default) collapses + /// the tier, paging extents out as soon as they are written. A shrink + /// takes effect by paging out the oldest extents on this call. + pub fn set_rss_target(&self, target_bytes: usize) { + let new = u64::cast_from(target_bytes); + let prev = self.0.rss_target_bytes.swap(new, Ordering::Relaxed); + if new < prev { + self.0.enforce_rss_target(); + } + } + + /// Test-only: the number of entries in the second-chance queue, live and + /// stale. 
+ #[cfg(test)] + fn queue_len(&self) -> usize { + self.0.queue.lock().expect("pool queue poisoned").len() + } + + /// Explicitly evicts one chunk. No-op if the chunk is pinned, already + /// evicted, in flight, empty, or oversize. With spill threads enabled the + /// compression is handed off and completes asynchronously (observable via + /// [`Residency::WriteInFlight`]); without them it runs inline. + pub fn evict(&self, handle: &ChunkHandle) { + let meta = &handle.meta; + let mut state = meta.state.lock().expect("chunk state poisoned"); + if !meta.pool.spill_handoff(meta, &mut state) { + meta.pool.evict_locked(meta, &mut state); + } + drop(state); + meta.pool.enforce_or_defer_rss_target(); + } + + /// Test hook: overwrites every free slot's bytes with `0xDE`. The free + /// list can hand a faulting chunk the very slot it occupied before + /// eviction, still holding its old bytes on platforms where + /// `MADV_DONTNEED` keeps contents (macOS); poisoning lets tests prove + /// that fault-in decompresses from the extent rather than passing stale + /// slot memory through. + #[doc(hidden)] + pub fn poison_free_slots(&self) { + for region in &self.0.regions { + region.poison_free_slots(); + } + } +} + +impl PoolInner { + /// Drops queue entries whose chunk has been freed, detected by their + /// `Weak` no longer holding a live chunk. The compaction runs only when + /// stale entries outnumber live chunks (plus a small floor), so its cost + /// amortizes to a constant per insert and the queue length stays + /// proportional to the number of live slotted chunks even when the pool + /// never comes under budget pressure. 
+ fn prune_queue(&self) { + let live = usize::cast_from(self.live_chunks.load(Ordering::Relaxed)); + let mut queue = self.queue.lock().expect("pool queue poisoned"); + if queue.len() > 2 * live + 16 { + queue.retain(|weak| weak.strong_count() > 0); + } + } + + fn enforce_budget(&self) { + // Single-flight: enforcement runs synchronously on whichever thread + // trips it (every insert and fault-in), and concurrent passes would + // convoy on the queue mutex doing redundant scans of the same + // candidates. One pass at a time reaches the budget just as well; + // skipped callers rely on the in-progress pass. A poisoned claim + // means a prior pass panicked; recover and keep enforcing rather + // than silently disabling the budget for the process's lifetime. + let guard = match self.enforcing.try_lock() { + Ok(guard) => guard, + Err(std::sync::TryLockError::WouldBlock) => return, + Err(std::sync::TryLockError::Poisoned(poisoned)) => poisoned.into_inner(), + }; + self.enforce_budget_inner(); + drop(guard); + // Inline evictions above may have grown the compressed tier. + self.enforce_or_defer_rss_target(); + } + + fn enforce_budget_inner(&self) { + self.prune_queue(); + let resident = |counters: &Counters| counters.resident_bytes.load(Ordering::Relaxed); + // The queue holds resident chunks only (evicted chunks leave it and + // fault-in re-enqueues), so a full pass is proportional to the + // resident set. Visit each queued chunk at most twice per call: a + // first visit may only clear the second-chance bit, so a second is + // needed before an over-budget call is guaranteed to evict every + // unpinned chunk it saw. The bound keeps a queue of pinned chunks + // from spinning this loop forever. 
+ let mut remaining = self + .queue + .lock() + .expect("pool queue poisoned") + .len() + .saturating_mul(2); + while remaining > 0 && resident(&self.counters) > self.budget_bytes.load(Ordering::Relaxed) + { + remaining -= 1; + let popped = self.queue.lock().expect("pool queue poisoned").pop_front(); + let Some(weak) = popped else { + break; + }; + let Some(meta) = weak.upgrade() else { + continue; + }; + let requeue = { + // `try_lock`: a chunk mid-eviction or mid-fault holds its + // lock for milliseconds; skipping it beats convoying every + // budget enforcer in the process behind one chunk's I/O. + let Ok(mut state) = meta.state.try_lock() else { + self.queue + .lock() + .expect("pool queue poisoned") + .push_back(weak); + continue; + }; + if state.freed { + state.queued = false; + false + } else if matches!(state.residency, Residency::Evicted | Residency::Oversize) { + // Nothing to evict: drop the entry. A fault-in re-enqueues + // the chunk, so the queue stays proportional to the + // resident set rather than accumulating every chunk ever + // evicted. + state.queued = false; + false + } else if state.pins > 0 { + true + } else if state.touched { + state.touched = false; + true + } else if self.spill_handoff(&meta, &mut state) { + // Stays queued while in flight; once the spill commits to + // `Evicted`, the next visit drops the entry. 
+ true + } else { + self.evict_locked(&meta, &mut state); + if state.residency == Residency::Evicted { + state.queued = false; + false + } else { + true + } + } + }; + if requeue { + self.queue + .lock() + .expect("pool queue poisoned") + .push_back(weak); + } + } + } + + fn evict_locked(&self, meta: &Arc, state: &mut ChunkState) { + let Some(slot) = state.slot else { + return; + }; + if state.pins > 0 || state.freed { + return; + } + let region = &self.regions[slot.class]; + match state.residency { + Residency::UnbackedResident => { + // SAFETY: the slot belongs to this live chunk, the state lock + // is held and `pins == 0`, so nothing writes the slot while + // this borrow is live; resident contents are initialized + // (written at insert or fault-in) and `len` fits the class. + let data = unsafe { + std::slice::from_raw_parts( + region.slot_ptr(slot.index).cast_const().cast::(), + meta.len, + ) + }; + let extent = SwapExtent::write(data); + self.counters + .extent_bytes_written + .fetch_add(u64::cast_from(extent.comp_len()), Ordering::Relaxed); + self.counters + .evictions_compress + .fetch_add(1, Ordering::Relaxed); + self.note_extent_resident(meta, extent.alloc_size()); + state.extent = Some(extent); + } + Residency::BackedResident => { + self.counters + .evictions_cheap + .fetch_add(1, Ordering::Relaxed); + } + Residency::WriteInFlight | Residency::Evicted | Residency::Oversize => return, + } + // `release_slot`'s precondition holds: `pins == 0` and `!freed`, + // checked above under the held state lock. + self.release_slot(meta, state); + state.residency = Residency::Evicted; + } + + /// Records and (once) warns about a size-class slot exhaustion forcing a + /// heap fallback. 
+ fn note_slot_exhausted(&self, len_bytes: usize) { + self.counters + .slot_exhausted_fallbacks + .fetch_add(1, Ordering::Relaxed); + static EXHAUSTED_ONCE: std::sync::Once = std::sync::Once::new(); + EXHAUSTED_ONCE.call_once(|| { + tracing::warn!( + len_bytes, + "buffer pool size class exhausted; falling back to heap chunks \ + (raise PoolConfig::class_capacity_bytes)", + ); + }); + } + + /// Whether the next eviction should be handed to spill threads: enabled, + /// and the queue is below the backpressure bound (beyond it, callers + /// evict inline rather than growing an unbounded queue of still-resident + /// chunks). + fn spill_eligible(&self) -> bool { + self.spill.enabled.load(Ordering::Relaxed) + && usize::cast_from(self.spill.in_flight.load(Ordering::Relaxed)) < SPILL_QUEUE_MAX + } + + /// Hands a `WriteInFlight` chunk to the spill threads. + fn spill_schedule(&self, meta: Arc) { + self.counters + .spill_scheduled + .fetch_add(1, Ordering::Relaxed); + self.spill.in_flight.fetch_add(1, Ordering::Relaxed); + self.spill + .queue + .lock() + .expect("spill queue poisoned") + .push_back(meta); + self.spill.cv.notify_one(); + } + + /// Spill-thread main loop. The thread owns an `Arc`, so the + /// pool (a process-wide singleton in production) lives as long as its + /// threads. Queued (budget-driven) evictions take priority; with eager + /// backing enabled, idle threads compress unbacked chunks to + /// `BackedResident` instead of parking, and park with a timeout once + /// everything reachable is backed. + fn spill_worker(self: Arc) { + loop { + // Tier-2 pageouts ride the spill threads: every pass through the + // loop (job completion, condvar wakeup, park timeout) trims the + // compressed tier if needed. A single atomic load when under cap. 
+ self.enforce_rss_target(); + let meta = { + let mut queue = self.spill.queue.lock().expect("spill queue poisoned"); + if let Some(meta) = queue.pop_front() { + Some(meta) + } else if self.spill.eager.load(Ordering::Relaxed) { + None + } else { + // Park until a hand-off or an enforcement wakeup; the + // timeout backstops a lost notify. Loop back through the + // tier check before re-examining the queue. + let _ = self + .spill + .cv + .wait_timeout(queue, std::time::Duration::from_millis(100)) + .expect("spill queue poisoned"); + continue; + } + }; + match meta { + Some(meta) => { + self.spill_process(&meta, SpillKind::Evict); + self.spill.in_flight.fetch_sub(1, Ordering::Relaxed); + } + None => { + if !self.back_one() { + // Nothing left to back: park briefly. Eviction + // hand-offs and enforcement wakeups notify the + // condvar; fresh inserts are picked up by the + // timeout. + let queue = self.spill.queue.lock().expect("spill queue poisoned"); + if queue.is_empty() { + let _ = self + .spill + .cv + .wait_timeout(queue, std::time::Duration::from_millis(100)) + .expect("spill queue poisoned"); + } + } + } + } + } + } + + /// Eagerly compresses one unbacked chunk from the eviction queue into + /// `BackedResident`, returning whether any progress was made (work done + /// or candidates remaining). Bounded scan; non-actionable entries are + /// requeued or dropped per the same rules budget enforcement uses, + /// except that the second-chance `touched` bit is left alone — backing + /// is not an eviction and must not consume a chunk's reprieve. 
+ fn back_one(&self) -> bool { + for _ in 0..16 { + let popped = self.queue.lock().expect("pool queue poisoned").pop_front(); + let Some(weak) = popped else { + return false; + }; + let Some(meta) = weak.upgrade() else { + continue; + }; + { + let Ok(mut state) = meta.state.try_lock() else { + self.queue + .lock() + .expect("pool queue poisoned") + .push_back(weak); + continue; + }; + if state.freed { + state.queued = false; + continue; + } + match state.residency { + Residency::Evicted | Residency::Oversize => { + state.queued = false; + continue; + } + Residency::UnbackedResident => { + state.residency = Residency::WriteInFlight; + } + Residency::BackedResident | Residency::WriteInFlight => { + self.queue + .lock() + .expect("pool queue poisoned") + .push_back(weak); + continue; + } + } + } + self.spill.in_flight.fetch_add(1, Ordering::Relaxed); + self.spill_process(&meta, SpillKind::Back); + self.spill.in_flight.fetch_sub(1, Ordering::Relaxed); + // The chunk remains an eviction candidate (now a cheap one). + self.queue + .lock() + .expect("pool queue poisoned") + .push_back(weak); + return true; + } + true + } + + /// Performs (or cancels) one scheduled compression. Lock discipline: the + /// chunk lock is held only to validate and to commit — never across the + /// compression or the `pageout` reclaim, which are the multi-millisecond + /// costs this path exists to keep off budget-enforcing threads. + fn spill_process(&self, meta: &Arc, kind: SpillKind) { + // Validate under the lock, then release it for the I/O. The slot is + // captured under the lock and remains owned by this chunk for the + // unlocked compression: in `WriteInFlight`, eviction skips the chunk + // and `ChunkHandle::drop` defers slot release to this thread. 
+ let slot; + { + let mut state = meta.state.lock().expect("chunk state poisoned"); + if state.freed { + // Freed while queued: the deferred cleanup is ours, and the + // chunk dies without ever compressing — the write-behind + // cancellation window. `ChunkHandle::drop` already counted + // the free and the live-chunks decrement. + self.counters + .spill_cancelled + .fetch_add(1, Ordering::Relaxed); + self.counters.writes_elided.fetch_add(1, Ordering::Relaxed); + self.release_slot(meta, &mut state); + return; + } + if state.residency != Residency::WriteInFlight { + return; + } + if kind == SpillKind::Evict && state.pins > 0 { + // Being read: cancel rather than compress data that is + // demonstrably hot. The chunk stays in the second-chance + // queue and a later pass reconsiders it. (Backing proceeds + // pinned: reads of the immutable slot coexist with + // compression, and the slot is staying put anyway.) + state.residency = Residency::UnbackedResident; + self.counters + .spill_cancelled + .fetch_add(1, Ordering::Relaxed); + return; + } + slot = state.slot.expect("write-in-flight chunk has a slot"); + } + let region = &self.regions[slot.class]; + // SAFETY: the chunk is live (the queue holds an `Arc`) and in + // `WriteInFlight`, so the slot is not recycled (`ChunkHandle::drop` + // defers slot release to this thread in that state) and its contents + // are initialized and immutable; concurrent pins may read the slot + // but nothing writes it. `len` fits the class. + let data = unsafe { + std::slice::from_raw_parts( + region.slot_ptr(slot.index).cast_const().cast::(), + meta.len, + ) + }; + let extent = SwapExtent::write(data); + // Commit under the lock. + let mut state = meta.state.lock().expect("chunk state poisoned"); + if state.freed { + // Freed during compression: the extent is garbage; cleanup is + // ours as above. Compression ran, so this is not an elided free. 
+ self.counters + .spill_cancelled + .fetch_add(1, Ordering::Relaxed); + self.release_slot(meta, &mut state); + return; + } + self.counters + .extent_bytes_written + .fetch_add(u64::cast_from(extent.comp_len()), Ordering::Relaxed); + self.note_extent_resident(meta, extent.alloc_size()); + state.extent = Some(extent); + if kind == SpillKind::Back { + // Write-behind: the chunk stays readable in its slot; the extent + // makes any later budget-driven eviction a pure page release. + self.counters.eager_backs.fetch_add(1, Ordering::Relaxed); + state.residency = Residency::BackedResident; + drop(state); + self.enforce_rss_target(); + return; + } + self.counters + .evictions_compress + .fetch_add(1, Ordering::Relaxed); + if state.pins > 0 { + // Pinned during compression: keep the slot resident; the extent + // makes any later eviction cheap. + state.residency = Residency::BackedResident; + drop(state); + self.enforce_rss_target(); + return; + } + // `release_slot`'s precondition holds: `pins == 0` and `!freed`, + // both observed under the held state lock. + self.release_slot(meta, &mut state); + state.residency = Residency::Evicted; + drop(state); + self.enforce_rss_target(); + } + + /// Releases `state`'s slot — slot returned to the region free list, + /// physical pages discarded unless the slot joins the bounded warm pool — + /// and decrements resident bytes. Releasing pages beyond the warm pool is + /// what keeps RSS aligned with the `resident_bytes` gauge the budget + /// enforcer trusts; the warm pool relaxes that alignment by an explicit, + /// bounded amount (`warm_bytes`, capped at a fraction of the budget) so + /// slot reuse faults no pages and skips the kernel's page zeroing. Slots + /// are scoped to residency: fault-in allocates a fresh slot, so a chunk's + /// address is stable only between a fault-in and the next eviction. 
    ///
    /// Precondition, established by every caller under the held state lock:
    /// no reference into the slot exists — either `pins == 0`, or the handle
    /// is gone (`freed` set) so no `PinGuard` can be created and none
    /// survives. This is what makes the `dontneed` below sound, and what
    /// makes keeping a warm slot's stale contents safe: the slot's next
    /// occupant fully overwrites every byte it reads, satisfying the
    /// contents-undefined contract either way.
    ///
    /// Panics if `state` holds no slot.
    fn release_slot(&self, meta: &ChunkMeta, state: &mut ChunkState) {
        // Detach the slot from the chunk first; `state.slot` is `None` from
        // here on.
        let slot = state.slot.take().expect("slotted chunk");
        let region = &self.regions[slot.class];
        // Warm-pool capacity is claimed for the whole class-sized slot, not
        // just for the chunk's payload length.
        let warm = self.try_keep_warm(region.class_size());
        if !warm {
            // SAFETY: no reference into the slot exists (the function-level
            // precondition, established under the held state lock).
            unsafe {
                region::dontneed(region.slot_ptr(slot.index), region.class_size());
            }
        }
        // The warm flag travels with the freed slot; `alloc` hands a flag
        // back alongside the index (presumably this one — confirm in
        // `region`).
        region.free(slot.index, warm);
        // The resident gauge tracks payload bytes (`len_bytes`), unlike the
        // class-sized warm accounting above.
        self.counters
            .resident_bytes
            .fetch_sub(u64::cast_from(meta.len_bytes()), Ordering::Relaxed);
    }

    /// The warm pool's byte ceiling: an eighth of the budget, clamped at an
    /// absolute maximum. The fraction sizes fault amortization at small
    /// budgets; the clamp keeps large budgets from parking gigabytes of idle
    /// warm slots no fault rate could justify.
    fn warm_cap(&self) -> u64 {
        // `1 << 30` bytes: the absolute ceiling is 1 GiB.
        (self.budget_bytes.load(Ordering::Relaxed) / 8).min(1 << 30)
    }

    /// Claims warm-pool capacity for a slot of `class_size` bytes, returning
    /// whether the slot may keep its pages. The RSS overshoot the warm pool
    /// introduces is bounded by [`PoolInner::warm_cap`] and visible as the
    /// `warm_bytes` stat.
+ fn try_keep_warm(&self, class_size: usize) -> bool { + let cap = self.warm_cap(); + let class_bytes = u64::cast_from(class_size); + self.counters + .warm_bytes + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| { + (cur + class_bytes <= cap).then_some(cur + class_bytes) + }) + .is_ok() + } + + /// Records a warm slot reuse: the allocation faulted no pages, and its + /// bytes leave the warm pool. + fn note_warm_reuse(&self, class_size: usize) { + self.counters + .warm_bytes + .fetch_sub(u64::cast_from(class_size), Ordering::Relaxed); + self.counters.warm_reuses.fetch_add(1, Ordering::Relaxed); + } + + /// Capacity of the compressed-resident tier: the RSS target's headroom + /// above the slot budget and the warm cap. Zero when no target is set — + /// extents then page out as soon as written, today's pre-tier behavior. + fn compressed_cap(&self) -> u64 { + let target = self.rss_target_bytes.load(Ordering::Relaxed); + let floor = self + .budget_bytes + .load(Ordering::Relaxed) + .saturating_add(self.warm_cap()); + target.saturating_sub(floor) + } + + /// Counts a newly resident extent (written or faulted back in) against + /// the compressed tier and enqueues its chunk for RSS-target + /// enforcement. Callers hold the chunk's state lock with the extent + /// present and resident, and follow up with + /// [`PoolInner::enforce_rss_target`] once the lock is released. + fn note_extent_resident(&self, meta: &Arc, extent_alloc: usize) { + self.counters + .extent_resident_bytes + .fetch_add(u64::cast_from(extent_alloc), Ordering::Relaxed); + self.extent_queue + .lock() + .expect("extent queue poisoned") + .push_back(Arc::downgrade(meta)); + } + + /// Uncounts a resident extent that is being dropped (chunk freed or + /// degraded). Its queue entry goes stale and is dropped on visit. 
+ fn note_extent_released(&self, extent: &SwapExtent) { + if extent.is_resident() { + self.counters + .extent_resident_bytes + .fetch_sub(u64::cast_from(extent.alloc_size()), Ordering::Relaxed); + } + } + + /// Routes RSS-target enforcement off latency-sensitive threads: with + /// spill threads running, wakes one to perform the pageouts + /// (`MADV_PAGEOUT` is synchronous reclaim — page-table walks, TLB + /// shootdowns, writeback submission — bounded per compressed extent but + /// not free at chunk rates); without them, enforces inline as the only + /// option. + /// + /// Deferral makes the target eventually-enforced with bounded lag (a + /// notify with every spill thread mid-job is absorbed; the next loop + /// pass catches up). The backstop below turns that into a bound by + /// construction: a caller finding the tier at double its capacity + /// enforces inline regardless, so sustained creation can never outrun + /// trimming by more than one capacity's worth. + fn enforce_or_defer_rss_target(&self) { + if self.spill.enabled.load(Ordering::Relaxed) + && self.spill.threads.load(Ordering::Relaxed) > 0 + { + let resident = self.counters.extent_resident_bytes.load(Ordering::Relaxed); + if resident > self.compressed_cap().saturating_mul(2) { + self.enforce_rss_target(); + } else { + self.spill.cv.notify_one(); + } + } else { + self.enforce_rss_target(); + } + } + + /// Pages out the oldest resident extents until the compressed tier falls + /// to its capacity. The compression is already paid and the device write + /// is the kernel's async writeback, so each pageout is one bounded + /// madvise; spill threads run this between jobs, and other threads only + /// when no spill threads exist (see + /// [`PoolInner::enforce_or_defer_rss_target`]). Not single-flighted: + /// concurrent passes pop disjoint victims. Visits are bounded by the + /// queue's length at entry; stale entries (extent paged out, dropped, or + /// chunk dead) are dropped. 
+ fn enforce_rss_target(&self) { + let cap = self.compressed_cap(); + let resident = |c: &Counters| c.extent_resident_bytes.load(Ordering::Relaxed); + // Under-cap is the common case: answer it with one atomic load and + // no queue lock, so frequent callers (the spill loop) stay cheap. + if resident(&self.counters) <= cap { + return; + } + let mut remaining = self + .extent_queue + .lock() + .expect("extent queue poisoned") + .len(); + while remaining > 0 && resident(&self.counters) > cap { + remaining -= 1; + let popped = self + .extent_queue + .lock() + .expect("extent queue poisoned") + .pop_front(); + let Some(weak) = popped else { + break; + }; + let Some(meta) = weak.upgrade() else { + continue; + }; + // `try_lock`: a chunk mid-fault or mid-compression holds its lock + // for milliseconds; requeue rather than convoy behind it. + let Ok(mut state) = meta.state.try_lock() else { + self.extent_queue + .lock() + .expect("extent queue poisoned") + .push_back(weak); + continue; + }; + match &mut state.extent { + Some(extent) if extent.is_resident() => { + extent.pageout(); + self.counters + .extent_resident_bytes + .fetch_sub(u64::cast_from(extent.alloc_size()), Ordering::Relaxed); + self.counters + .extent_pageouts + .fetch_add(1, Ordering::Relaxed); + } + // Paged out already, dropped, or the chunk degraded: the + // entry is stale. A later resident event re-enqueues. + _ => {} + } + } + } + + /// If the chunk is an unpinned, live `UnbackedResident` and the spill + /// threads have capacity, transitions it to `WriteInFlight` and hands it + /// to them, returning `true`. The hand-off happens under the held state + /// lock; the spill thread blocks on that lock only after this call + /// returns and the caller releases it. 
+ fn spill_handoff(&self, meta: &Arc, state: &mut ChunkState) -> bool { + if state.residency != Residency::UnbackedResident + || state.pins > 0 + || state.freed + || !self.spill_eligible() + { + return false; + } + state.residency = Residency::WriteInFlight; + self.spill_schedule(Arc::clone(meta)); + true + } +} + +impl ChunkHandle { + /// Length of the chunk in `u64` words. + pub fn len(&self) -> usize { + self.meta.len + } + + /// Returns `true` if the chunk holds no data. + pub fn is_empty(&self) -> bool { + self.meta.len == 0 + } + + /// Length of the chunk in bytes. + pub fn len_bytes(&self) -> usize { + self.meta.len_bytes() + } + + /// The chunk's current residency state. + pub fn residency(&self) -> Residency { + self.meta + .state + .lock() + .expect("chunk state poisoned") + .residency + } + + /// Pins the chunk resident, faulting it in from its extent if evicted, + /// and returns a guard dereferencing to its contents. Concurrent pinners + /// of an evicted chunk serialize on the chunk's state lock; the second + /// observes `BackedResident` and skips the fault. + /// + /// A fault-in raises resident bytes and so enforces the budget, keeping + /// read-only traffic (a seek-heavy phase performs no inserts) bounded by + /// the budget; the just-pinned chunk is protected by its pin count. + pub fn pin(&self) -> PinGuard<'_> { + let meta = &*self.meta; + // The empty chunk holds no slot and nothing to protect: hand out a + // dangling-but-aligned pointer (valid for a zero-length slice) + // without touching the lock or the pin count. `PinGuard::drop` + // mirrors the skip. 
+ if meta.len == 0 { + return PinGuard { + meta, + ptr: std::ptr::NonNull::::dangling().as_ptr().cast_const(), + len: 0, + }; + } + let mut state = meta.state.lock().expect("chunk state poisoned"); + let mut faulted = false; + let ptr = match state.residency { + Residency::Oversize => { + let payload = state.oversize.as_ref().expect("oversize chunk has payload"); + payload.as_ptr() + } + Residency::Evicted => { + // Fault-in allocates a fresh slot: slots are scoped to + // residency, so the chunk's address may differ from its last + // residence. If the class is exhausted, decompress to the + // heap instead and let the chunk live out its days as an + // oversize-style resident — degraded, never dead. + let class = meta.class.expect("evicted chunk has a size class"); + faulted = true; + meta.pool.counters.faults.fetch_add(1, Ordering::Relaxed); + meta.pool + .counters + .resident_bytes + .fetch_add(u64::cast_from(meta.len_bytes()), Ordering::Relaxed); + match meta.pool.regions[class].alloc() { + Some((index, warm)) => { + let slot = Slot { class, index }; + let region = &meta.pool.regions[class]; + if warm { + meta.pool.note_warm_reuse(region.class_size()); + } + let slot_ptr = region.slot_ptr(slot.index); + // SAFETY: the freshly allocated slot is exclusively + // owned by this chunk under the held state lock, so + // no other reference into it exists; `len_bytes` fits + // the class. + let dst = + unsafe { std::slice::from_raw_parts_mut(slot_ptr, meta.len_bytes()) }; + let extent = state.extent.as_mut().expect("evicted chunk has an extent"); + // Reading faults the extent's pages back in; re-count + // it against the compressed tier and re-enqueue. 
+ let was_resident = extent.is_resident(); + extent.read_into(dst); + if !was_resident { + let alloc = extent.alloc_size(); + meta.pool.note_extent_resident(&self.meta, alloc); + } + state.slot = Some(slot); + state.residency = Residency::BackedResident; + slot_ptr.cast_const().cast::() + } + None => { + meta.pool.note_slot_exhausted(meta.len_bytes()); + let mut payload = vec![0u64; meta.len]; + let dst: &mut [u8] = bytemuck::cast_slice_mut(payload.as_mut_slice()); + let extent = state.extent.as_mut().expect("evicted chunk has an extent"); + // The extent is dropped below; only uncount it if it + // was counted (resident) before this read revived it. + let was_resident = extent.is_resident(); + extent.read_into(dst); + if was_resident { + let alloc = u64::cast_from(extent.alloc_size()); + meta.pool + .counters + .extent_resident_bytes + .fetch_sub(alloc, Ordering::Relaxed); + } + state.extent = None; + let ptr = payload.as_ptr(); + state.oversize = Some(payload); + state.residency = Residency::Oversize; + meta.pool + .counters + .oversize_bytes + .fetch_add(u64::cast_from(meta.len_bytes()), Ordering::Relaxed); + ptr + } + } + } + Residency::UnbackedResident | Residency::BackedResident | Residency::WriteInFlight => { + let slot = state.slot.expect("resident non-empty chunk has a slot"); + meta.pool.regions[slot.class] + .slot_ptr(slot.index) + .cast_const() + .cast::() + } + }; + state.touched = true; + state.pins += 1; + // A fault-in made the chunk resident again: re-enqueue it as an + // eviction candidate (its entry was dropped when a queue visit found + // it evicted). The flag dedups against entries still circulating. 
+ let enqueue = faulted && !state.queued && state.residency != Residency::Oversize; + if enqueue { + state.queued = true; + } + drop(state); + if enqueue { + meta.pool + .queue + .lock() + .expect("pool queue poisoned") + .push_back(Arc::downgrade(&self.meta)); + } + // Enforce after releasing the state lock: the enforcer locks chunk + // states itself, and the pin count already shields this chunk from + // being chosen as a victim. The fault also revived the extent's + // pages, so the compressed tier may need trimming too. + if faulted { + meta.pool.enforce_budget(); + meta.pool.enforce_or_defer_rss_target(); + } + PinGuard { + meta, + ptr, + len: meta.len, + } + } + + /// If the chunk is evicted, hints the kernel to swap its extent back in + /// ahead of need; otherwise a no-op. + pub fn prefetch(&self) { + let state = self.meta.state.lock().expect("chunk state poisoned"); + if state.residency == Residency::Evicted { + if let Some(extent) = &state.extent { + extent.prefetch(); + } + } + } + + /// Copies the whole contents into `dst` (cleared first), then frees the + /// chunk. + pub fn take(self, dst: &mut Vec) { + dst.clear(); + if self.meta.len > 0 { + let pin = self.pin(); + dst.extend_from_slice(&pin); + } + } + + /// Test-only: the byte size of the chunk's size class, or `None` for + /// empty and oversize chunks. 
+ #[cfg(test)] + pub(crate) fn size_class_bytes(&self) -> Option { + self.meta.class.map(|class| SIZE_CLASSES[class]) + } +} + +impl Drop for ChunkHandle { + fn drop(&mut self) { + let pool = &self.meta.pool; + let mut state = self.meta.state.lock().expect("chunk state poisoned"); + debug_assert_eq!(state.pins, 0, "chunk freed while pinned"); + pool.counters.frees.fetch_add(1, Ordering::Relaxed); + state.freed = true; + if self.meta.class.is_some() { + pool.live_chunks.fetch_sub(1, Ordering::Relaxed); + } + let len_bytes = u64::cast_from(self.meta.len_bytes()); + // `release_slot`'s precondition holds in every arm below: the handle + // is being dropped, so no `PinGuard` (which borrows the handle) + // exists, and `freed` was set under the state lock held here, so + // concurrent queue visitors skip the chunk. + match state.residency { + Residency::UnbackedResident => { + if state.slot.is_some() { + pool.counters.writes_elided.fetch_add(1, Ordering::Relaxed); + pool.release_slot(&self.meta, &mut state); + } + } + Residency::BackedResident => { + pool.release_slot(&self.meta, &mut state); + if let Some(extent) = &state.extent { + pool.note_extent_released(extent); + } + state.extent = None; + } + Residency::Evicted => { + // Eviction already released the slot. + debug_assert!(state.slot.is_none(), "evicted chunk holds no slot"); + if let Some(extent) = &state.extent { + pool.note_extent_released(extent); + } + state.extent = None; + } + Residency::WriteInFlight => { + // A spill thread may be reading the slot to compress it. + // `freed` (set above) tells it the chunk died; it owns the + // slot release, the `resident_bytes` decrement, and the + // cancellation accounting from here. 
+ } + Residency::Oversize => { + pool.counters + .resident_bytes + .fetch_sub(len_bytes, Ordering::Relaxed); + pool.counters + .oversize_bytes + .fetch_sub(len_bytes, Ordering::Relaxed); + state.oversize = None; + } + } + } +} + +impl Deref for PinGuard<'_> { + type Target = [u64]; + + fn deref(&self) -> &[u64] { + // SAFETY: `ptr`/`len` were captured under the chunk's state lock with + // the pin count incremented. Eviction, poisoning, and freeing all + // check the pin count under that lock and skip pinned chunks, so the + // chunk's slot (or heap payload) cannot be released or relocated + // while this guard lives, and chunks are immutable after insert, so + // the pointee is initialized, valid, and unaliased by writers for the + // guard's lifetime. For the empty chunk `ptr` is a dangling + // well-aligned pointer, which is valid for a zero-length slice. + unsafe { std::slice::from_raw_parts(self.ptr, self.len) } + } +} + +impl Drop for PinGuard<'_> { + fn drop(&mut self) { + // Empty-chunk pins never took the lock or incremented the count. + if self.len == 0 { + return; + } + let mut state = self.meta.state.lock().expect("chunk state poisoned"); + state.pins -= 1; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Keep test pools small: 64 MiB of virtual reservation per class. + fn test_pool(budget_bytes: usize) -> Pool { + Pool::new(PoolConfig { + budget_bytes, + class_capacity_bytes: 64 << 20, + }) + .expect("pool creation") + } + + fn payload(words: usize, seed: u64) -> Vec { + (0..u64::cast_from(words)) + .map(|i| seed.wrapping_mul(0x9E3779B97F4A7C15).wrapping_add(i)) + .collect() + } + + /// Words that fill a 64 KiB class exactly. 
    // 64 KiB divided by 8 bytes per `u64` word: 8192 words.
    const SMALL: usize = (64 << 10) / 8;

    // Compile-time check only; never called, hence the `dead_code` allowance.
    // NOTE(review): the generic parameter of `check` and the type arguments
    // of its two calls appear to have been stripped from this copy of the
    // file (presumably `fn check<T: Send + Sync>()` applied to the handle
    // types) — restore them from the original before compiling.
    #[allow(dead_code)]
    fn assert_handle_send_sync() {
        fn check() {}
        check::();
        check::();
    }

    /// With an RSS target set, evicted chunks keep their extents resident
    /// (the compressed tier); shrinking the target pages the oldest extents
    /// out; reads revive them and re-count them.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn compressed_tier_round_trip() {
        // Budget 256 MiB, target 1 GiB: the target leaves headroom above
        // budget plus warm cap, so the compressed tier has capacity.
        let pool = test_pool(256 << 20);
        pool.set_rss_target(1 << 30);
        let orig = payload(SMALL, 21);
        let handle = pool.insert(&mut orig.clone());
        pool.evict(&handle);
        assert_eq!(handle.residency(), Residency::Evicted);
        let stats = pool.stats();
        assert!(
            stats.extent_resident_bytes > 0,
            "under the target, the extent stays resident",
        );
        assert_eq!(stats.extent_pageouts, 0);

        // Shrinking the target to zero pages the extent out.
        pool.set_rss_target(0);
        let stats = pool.stats();
        assert_eq!(stats.extent_resident_bytes, 0, "tier collapsed");
        assert_eq!(stats.extent_pageouts, 1);

        // Reading revives the extent: contents round-trip, and with the
        // target restored the revived extent is counted again.
        pool.set_rss_target(1 << 30);
        pool.poison_free_slots();
        {
            let pin = handle.pin();
            assert_eq!(&*pin, orig.as_slice());
        }
        assert!(
            pool.stats().extent_resident_bytes > 0,
            "revived and counted"
        );

        // Dropping the handle uncounts the resident extent.
        drop(handle);
        assert_eq!(pool.stats().extent_resident_bytes, 0);
    }

    /// With no RSS target (the default), extents page out as soon as they
    /// are written — the pre-tier behavior.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn default_target_pages_extents_immediately() {
        let pool = test_pool(256 << 20);
        let handle = pool.insert(&mut payload(SMALL, 22));
        pool.evict(&handle);
        // No `set_rss_target` call: the single extent written by the
        // eviction must already be paged out and uncounted.
        let stats = pool.stats();
        assert_eq!(stats.extent_resident_bytes, 0);
        assert_eq!(stats.extent_pageouts, 1);
    }

    /// Eager backing compresses a chunk to `BackedResident` while it stays
    /// readable in its slot; the later budget-driven eviction is a pure page
    /// release, and the contents round-trip through the extent.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn eager_backing_round_trip() {
        let pool = test_pool(256 << 20);
        let orig = payload(SMALL, 11);
        let handle = pool.insert(&mut orig.clone());
        assert_eq!(handle.residency(), Residency::UnbackedResident);

        assert!(pool.back_step(), "one chunk is backable");
        assert_eq!(handle.residency(), Residency::BackedResident);
        let stats = pool.stats();
        assert_eq!(stats.eager_backs, 1);
        assert_eq!(
            stats.evictions_compress, 0,
            "backing is not an eviction and compresses off the eviction counter",
        );
        assert!(stats.extent_bytes_written > 0);

        // Still readable without a fault path: the slot is resident.
        {
            let pin = handle.pin();
            assert_eq!(&*pin, orig.as_slice());
        }

        // The pre-paid eviction is cheap, and the extent round-trips.
        pool.evict(&handle);
        assert_eq!(handle.residency(), Residency::Evicted);
        assert_eq!(pool.stats().evictions_cheap, 1);
        // Poison free slots so the pin below can only pass by reading the
        // extent, not stale slot memory.
        pool.poison_free_slots();
        let pin = handle.pin();
        assert_eq!(&*pin, orig.as_slice());
    }

    /// Backing proceeds while the chunk is pinned: reads of the immutable
    /// slot coexist with compression, and the slot stays put.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn eager_backing_proceeds_pinned() {
        let pool = test_pool(256 << 20);
        let orig = payload(SMALL, 12);
        let handle = pool.insert(&mut orig.clone());
        // Hold the pin across the backing step.
        let pin = handle.pin();
        assert!(pool.back_step());
        assert_eq!(handle.residency(), Residency::BackedResident);
        assert_eq!(&*pin, orig.as_slice());
        drop(pin);
        assert_eq!(pool.stats().eager_backs, 1);
    }

    /// Freeing under the warm cap parks the slot warm; the next insert of the
    /// same class reuses it fault-free and the accounting balances.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn warm_slot_reuse() {
        // Budget 8 MiB: warm cap = 1 MiB, so a 64 KiB slot fits warm.
        let pool = test_pool(8 << 20);
        let orig = payload(SMALL, 7);
        let handle = pool.insert(&mut orig.clone());
        drop(handle);
        let after_free = pool.stats();
        assert_eq!(after_free.warm_bytes, 64 << 10, "freed slot parks warm");
        assert_eq!(after_free.warm_reuses, 0);

        let handle = pool.insert(&mut orig.clone());
        let after_reuse = pool.stats();
        assert_eq!(after_reuse.warm_reuses, 1, "second insert reuses warm slot");
        assert_eq!(after_reuse.warm_bytes, 0, "reuse drains the warm pool");
        // Contents are correct despite the skipped page release.
        let pin = handle.pin();
        assert_eq!(&*pin, orig.as_slice());
    }

    /// The warm pool is capped at an eighth of the budget; frees beyond the
    /// cap release their pages and park cold.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn warm_pool_respects_cap() {
        // Budget 1 MiB: warm cap = 128 KiB = two 64 KiB slots.
        let pool = test_pool(1 << 20);
        let handles: Vec<_> = (0..4)
            .map(|seed| pool.insert(&mut payload(SMALL, seed)))
            .collect();
        // Four slots' worth of chunks are dropped; only two slots fit warm.
        drop(handles);
        let stats = pool.stats();
        assert_eq!(
            stats.warm_bytes,
            128 << 10,
            "warm pool stops at the budget/8 cap",
        );
    }

    /// Insert drains the caller's vector but keeps its capacity; a resident
    /// chunk reads back through `pin` and `take`, and the counters balance
    /// after the free.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn round_trip_resident() {
        let pool = test_pool(256 << 20);
        let orig = payload(1000, 1);
        let mut data = orig.clone();
        let capacity = data.capacity();
        let handle = pool.insert(&mut data);
        assert!(data.is_empty());
        assert_eq!(data.capacity(), capacity, "insert preserves capacity");
        assert_eq!(handle.len(), orig.len());
        assert_eq!(handle.len_bytes(), orig.len() * 8);
        assert_eq!(handle.residency(), Residency::UnbackedResident);
        {
            let pin = handle.pin();
            assert_eq!(&*pin, orig.as_slice());
        }
        let mut out = Vec::new();
        handle.take(&mut out);
        assert_eq!(out, orig);
        let stats = pool.stats();
        assert_eq!(stats.inserts, 1);
        assert_eq!(stats.frees, 1);
        assert_eq!(stats.resident_bytes, 0);
    }

    /// An explicit eviction compresses the chunk and releases its resident
    /// bytes; the next pin faults it back with contents intact and leaves it
    /// `BackedResident`.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn evict_then_fault_preserves_contents() {
        let pool = test_pool(256 << 20);
        let orig = payload(SMALL, 2);
        let handle = pool.insert(&mut orig.clone());
        pool.evict(&handle);
        assert_eq!(handle.residency(), Residency::Evicted);
        let stats = pool.stats();
        assert_eq!(stats.evictions_compress, 1);
        assert_eq!(stats.resident_bytes, 0);
        assert!(stats.extent_bytes_written > 0);
        {
            let pin = handle.pin();
            assert_eq!(&*pin, orig.as_slice());
        }
        assert_eq!(handle.residency(), Residency::BackedResident);
        assert_eq!(pool.stats().faults, 1);
    }

    /// Fault-in must restore the contents from the extent, not inherit
    /// whatever bytes the recycled slot happens to hold.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn poison_proves_fault_in_reads_extent() {
        let pool = test_pool(256 << 20);
        let orig = payload(SMALL, 3);
        let handle = pool.insert(&mut orig.clone());
        pool.evict(&handle);
        // The free list can hand fault-in the chunk's previous slot, and on
        // macOS `MADV_DONTNEED` may have left the old bytes in it; poison all
        // free slots so a fault-in passing stale memory through would fail.
        pool.poison_free_slots();
        let pin = handle.pin();
        assert_eq!(&*pin, orig.as_slice());
    }

    /// A chunk that was evicted once keeps its extent across the fault-in,
    /// so evicting it again writes nothing new.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn re_eviction_of_backed_chunk_is_cheap() {
        let pool = test_pool(256 << 20);
        let orig = payload(SMALL, 4);
        let handle = pool.insert(&mut orig.clone());
        pool.evict(&handle);
        let written_after_first = pool.stats().extent_bytes_written;
        assert!(written_after_first > 0);
        {
            let pin = handle.pin();
            assert_eq!(&*pin, orig.as_slice());
        }
        pool.evict(&handle);
        let stats = pool.stats();
        // One compressing eviction, one cheap one, and no additional bytes
        // written by the second.
        assert_eq!(stats.evictions_compress, 1);
        assert_eq!(stats.evictions_cheap, 1);
        assert_eq!(stats.extent_bytes_written, written_after_first);
        let pin = handle.pin();
        assert_eq!(&*pin, orig.as_slice());
    }

    /// Neither budget enforcement nor an explicit `evict` removes a chunk
    /// while a pin is held, even under a zero budget.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn pinned_chunks_are_never_evicted() {
        let pool = test_pool(0);
        let orig = payload(SMALL, 5);
        // Insert enforces the (zero) budget, evicting the chunk immediately.
        let handle = pool.insert(&mut orig.clone());
        assert_eq!(handle.residency(), Residency::Evicted);
        let pin = handle.pin();
        pool.enforce_budget();
        assert_eq!(handle.residency(), Residency::BackedResident);
        assert_eq!(&*pin, orig.as_slice());
        pool.evict(&handle);
        assert_eq!(handle.residency(), Residency::BackedResident);
        drop(pin);
        // The pin set the second-chance bit: enforcement clears it on the
        // first visit and evicts on the second.
        pool.enforce_budget();
        assert_eq!(handle.residency(), Residency::Evicted);
    }

    /// Slots are scoped to residency: eviction releases the slot, so a
    /// capacity holding exactly one chunk can serve any number of chunks one
    /// at a time. (Addresses are deliberately NOT stable across evictions —
    /// pointers are valid only under a pin.)
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn eviction_releases_the_slot() {
        // One 64 KiB slot per class.
        let pool = Pool::new(PoolConfig {
            budget_bytes: usize::MAX,
            class_capacity_bytes: 64 << 10,
        })
        .expect("pool creation");
        let a = pool.insert(&mut payload(SMALL, 6));
        pool.evict(&a);
        // The class's only slot is free again: a second chunk fits without
        // falling back to the heap.
        let b = pool.insert(&mut payload(SMALL, 7));
        assert_eq!(b.residency(), Residency::UnbackedResident);
        assert_eq!(pool.stats().slot_exhausted_fallbacks, 0);
        // Faulting `a` back in needs the slot `b` now holds: evict `b` first,
        // then both round-trip through their extents.
        pool.evict(&b);
        assert_eq!(&*a.pin(), &payload(SMALL, 6)[..]);
        drop(a);
        assert_eq!(&*b.pin(), &payload(SMALL, 7)[..]);
    }

    /// The eviction queue holds resident chunks only: an enforcement pass
    /// drops entries for evicted chunks, and fault-in re-enqueues, so the
    /// scan each insert pays stays proportional to the resident set rather
    /// than every chunk ever evicted.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn queue_holds_resident_chunks_only() {
        let pool = test_pool(128 << 10);
        let mut handles = Vec::new();
        for seed in 0..8 {
            handles.push(pool.insert(&mut payload(SMALL, 800 + seed)));
        }
        // Budget pressure evicted ~6 of 8; one more pass visits the evicted
        // entries and drops them (their first visit performed the eviction
        // and dropped them already, but second-chance survivors may linger).
        pool.enforce_budget();
        let resident = handles
            .iter()
            .filter(|h| h.residency() != Residency::Evicted)
            .count();
        assert!(
            pool.queue_len() <= resident + 1,
            "queue ({}) tracks the resident set ({resident}), not all 8 live chunks",
            pool.queue_len(),
        );
        // Fault one back in: it must become an eviction candidate again.
        let evicted = handles
            .iter()
            .find(|h| h.residency() == Residency::Evicted)
            .expect("something was evicted");
        drop(evicted.pin());
        pool.evict(evicted);
        assert_eq!(
            evicted.residency(),
            Residency::Evicted,
            "fault-in re-enqueued the chunk, so it could be evicted again",
        );
    }

    /// A chunk freed while still `UnbackedResident` is never compressed: the
    /// free counts as an elided write and no extent bytes are written.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn dead_data_is_never_written() {
        let pool = test_pool(256 << 20);
        let handle = pool.insert(&mut payload(SMALL, 7));
        drop(handle);
        let stats = pool.stats();
        assert_eq!(stats.frees, 1);
        assert_eq!(stats.writes_elided, 1);
        assert_eq!(stats.extent_bytes_written, 0);
        assert_eq!(stats.resident_bytes, 0);
    }

    /// Inserting past the budget evicts: with room for two small chunks,
    /// eight inserts leave exactly two resident.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn budget_is_enforced_on_insert() {
        let budget = 128 << 10;
        let pool = test_pool(budget);
        let mut handles = Vec::new();
        for seed in 0..8 {
            handles.push(pool.insert(&mut payload(SMALL, 100 + seed)));
        }
        let stats = pool.stats();
        assert!(
            stats.resident_bytes <= u64::cast_from(budget),
            "resident {} exceeds budget {}",
            stats.resident_bytes,
            budget,
        );
        assert!(stats.evictions_compress >= 6);
        let resident = handles
            .iter()
            .filter(|h| {
                matches!(
                    h.residency(),
                    Residency::UnbackedResident | Residency::BackedResident
                )
            })
            .count();
        assert_eq!(resident, 2, "budget holds exactly two small chunks");
    }

    /// `set_budget` takes effect on a live pool: shrinking evicts at once,
    /// growing restores headroom, and every chunk stays readable.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn set_budget_retunes_in_place() {
        let pool = test_pool(usize::MAX);
        let mut handles = Vec::new();
        for seed in 0..8 {
            handles.push(pool.insert(&mut payload(SMALL, 200 + seed)));
        }
        assert_eq!(pool.stats().evictions_compress, 0);

        // Shrinking the budget evicts immediately.
        pool.set_budget(128 << 10);
        let stats = pool.stats();
        assert!(stats.resident_bytes <= 128 << 10);
        assert!(stats.evictions_compress >= 6);

        // Growing it leaves headroom: a fresh insert stays resident.
        pool.set_budget(usize::MAX);
        let h = pool.insert(&mut payload(SMALL, 300));
        assert_eq!(h.residency(), Residency::UnbackedResident);
        for h in &handles {
            let pin = h.pin();
            assert_eq!(pin.len(), SMALL);
        }
    }

    /// A chunk that has been pinned since its last queue visit is spared in
    /// favour of an untouched one.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn second_chance_prefers_untouched_victims() {
        // Budget holds one and a half small chunks.
        let pool = test_pool((64 << 10) + (32 << 10));
        let orig_a = payload(SMALL, 8);
        let handle_a = pool.insert(&mut orig_a.clone());
        {
            let pin = handle_a.pin();
            assert_eq!(&*pin, orig_a.as_slice());
        }
        // Inserting B overflows the budget; A is older but touched, so the
        // enforcer gives it a second chance and evicts untouched B instead.
        let handle_b = pool.insert(&mut payload(SMALL, 9));
        assert_eq!(handle_a.residency(), Residency::UnbackedResident);
        assert_eq!(handle_b.residency(), Residency::Evicted);
    }

    /// The empty chunk has no size class, pins to an empty slice, clears the
    /// destination on `take`, and leaves the resident and elided-write
    /// counters at zero.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn empty_insert_consumes_no_slot() {
        let pool = test_pool(256 << 20);
        let mut data = Vec::new();
        let handle = pool.insert(&mut data);
        assert_eq!(handle.len(), 0);
        assert!(handle.is_empty());
        assert_eq!(handle.size_class_bytes(), None);
        {
            let pin = handle.pin();
            assert!(pin.is_empty());
        }
        // `take` clears the destination even when there is nothing to copy.
        let mut out = vec![1u64, 2, 3];
        handle.take(&mut out);
        assert!(out.is_empty());
        let stats = pool.stats();
        assert_eq!(stats.resident_bytes, 0);
        assert_eq!(stats.writes_elided, 0);
    }

    /// A payload one word larger than the largest size class is held as
    /// `Oversize`: never evicted, still readable, and fully uncounted once
    /// freed.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn oversize_round_trips() {
        let pool = test_pool(256 << 20);
        let words = SIZE_CLASSES[SIZE_CLASSES.len() - 1] / 8 + 1;
        let orig = payload(words, 10);
        let handle = pool.insert(&mut orig.clone());
        assert_eq!(handle.residency(), Residency::Oversize);
        assert_eq!(handle.size_class_bytes(), None);
        let stats = pool.stats();
        assert_eq!(stats.oversize_bytes, u64::cast_from(words * 8));
        // Explicit eviction and budget enforcement leave oversize chunks
        // resident.
        pool.evict(&handle);
        pool.enforce_budget();
        assert_eq!(handle.residency(), Residency::Oversize);
        {
            let pin = handle.pin();
            assert_eq!(&*pin, orig.as_slice());
        }
        let mut out = Vec::new();
        handle.take(&mut out);
        assert_eq!(out, orig);
        let stats = pool.stats();
        assert_eq!(stats.oversize_bytes, 0);
        assert_eq!(stats.resident_bytes, 0);
    }

    /// Size-class selection rounds up to the smallest class that fits; an
    /// exact fit stays in its own class.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn payload_lands_in_smallest_fitting_class() {
        let pool = test_pool(256 << 20);
        // 100 KiB of payload lands in the 128 KiB class.
        let handle = pool.insert(&mut payload((100 << 10) / 8, 11));
        assert_eq!(handle.size_class_bytes(), Some(128 << 10));
        let exact = pool.insert(&mut payload(SMALL, 12));
        assert_eq!(exact.size_class_bytes(), Some(64 << 10));
    }

    /// `prefetch` may be called on a resident and on an evicted chunk, and
    /// the contents still read back intact afterwards.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn prefetch_is_safe_in_all_states() {
        let pool = test_pool(256 << 20);
        let orig = payload(SMALL, 13);
        let handle = pool.insert(&mut orig.clone());
        handle.prefetch();
        pool.evict(&handle);
        handle.prefetch();
        let pin = handle.pin();
        assert_eq!(&*pin, orig.as_slice());
    }

    /// Four threads insert, evict, pin, enforce, and free their own chunks
    /// against a one-chunk budget; contents and counters must come out
    /// consistent.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn multithreaded_smoke() {
        // Budget of one small chunk: four threads each holding one resident
        // chunk keep the pool over budget, so every insert's and fault-in's
        // enforcement pass selects victims owned by other threads, racing
        // cross-thread eviction against pin, fault-in, and free.
        let pool = test_pool(64 << 10);
        let threads: Vec<_> = (0..4u64)
            .map(|t| {
                let pool = pool.clone();
                std::thread::spawn(move || {
                    for round in 0..50u64 {
                        let seed = t * 1000 + round;
                        let orig = payload(SMALL, seed);
                        let handle = pool.insert(&mut orig.clone());
                        pool.evict(&handle);
                        {
                            let pin = handle.pin();
                            assert_eq!(&*pin, orig.as_slice());
                            // Enforcement under a held pin must spare the
                            // pinned chunk and may evict everyone else's.
                            pool.enforce_budget();
                            assert_eq!(&*pin, orig.as_slice());
                        }
                        let mut out = Vec::new();
                        handle.take(&mut out);
                        assert_eq!(out, orig);
                    }
                })
            })
            .collect();
        for thread in threads {
            thread.join().expect("worker thread panicked");
        }
        // 4 threads x 50 rounds, one insert and one free per round.
        let stats = pool.stats();
        assert_eq!(stats.inserts, 200);
        assert_eq!(stats.frees, 200);
        assert_eq!(stats.resident_bytes, 0);
    }

    /// Readers, an enforcer, and an insert/free churner run concurrently
    /// against a zero-budget pool; every pin verifies contents.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls
    fn concurrent_pin_enforce_churn() {
        // Races the three actors that can touch one chunk's slot: readers
        // pinning and verifying shared chunks, an enforcer evicting them
        // (the zero budget makes every unpinned chunk a victim), and a
        // churner whose insert/free traffic turns the queue over. Contents
        // are asserted on every pin, so an eviction or slot recycle racing a
        // fault-in shows up as corruption.
        let pool = test_pool(0);
        // NOTE(review): the type annotation below lost its generic arguments
        // in this copy of the file (presumably
        // `Arc<Vec<(Vec<u64>, ChunkHandle)>>`, given how `shared` is built
        // and indexed) — restore it from the original before compiling.
        let shared: Arc, ChunkHandle)>> = Arc::new(
            (0..4u64)
                .map(|seed| {
                    let orig = payload(SMALL, 600 + seed);
                    let handle = pool.insert(&mut orig.clone());
                    (orig, handle)
                })
                .collect(),
        );
        let mut threads = Vec::new();
        // Two readers walk the four shared chunks at different offsets.
        for t in 0..2u64 {
            let shared = Arc::clone(&shared);
            threads.push(std::thread::spawn(move || {
                for round in 0..300u64 {
                    let (orig, handle) = &shared[usize::cast_from((t + round) % 4)];
                    let pin = handle.pin();
                    assert_eq!(&*pin, orig.as_slice());
                }
            }));
        }
        // The enforcer.
        {
            let pool = pool.clone();
            threads.push(std::thread::spawn(move || {
                for _ in 0..600 {
                    pool.enforce_budget();
                }
            }));
        }
        // The churner: each round's handle is freed at the end of the round.
        {
            let pool = pool.clone();
            threads.push(std::thread::spawn(move || {
                for round in 0..300u64 {
                    let orig = payload(SMALL, 700 + round);
                    let handle = pool.insert(&mut orig.clone());
                    let pin = handle.pin();
                    assert_eq!(&*pin, orig.as_slice());
                }
            }));
        }
        for thread in threads {
            thread.join().expect("worker thread panicked");
        }
        drop(shared);
        assert_eq!(pool.stats().resident_bytes, 0);
    }

#[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn budget_is_enforced_on_fault_in() { + // Read-only traffic: every chunk starts evicted and is then pinned + // once, with no inserts in between. Fault-in itself must enforce the + // budget, or the working set would grow to the whole run. + let budget = 128 << 10; + let pool = test_pool(budget); + let origs: Vec<_> = (0..8u64).map(|seed| payload(SMALL, 300 + seed)).collect(); + let handles: Vec<_> = origs.iter().map(|o| pool.insert(&mut o.clone())).collect(); + for handle in &handles { + pool.evict(handle); + } + assert_eq!(pool.stats().resident_bytes, 0); + for (index, handle) in handles.iter().enumerate() { + { + let pin = handle.pin(); + assert_eq!(&*pin, origs[index].as_slice()); + } + let resident = pool.stats().resident_bytes; + assert!( + resident <= u64::cast_from(budget), + "resident {resident} exceeds budget {budget} on the fault-in path", + ); + } + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn queue_stays_bounded_under_budget() { + // Chunk churn that never exceeds the budget: the enforcer's eviction + // loop never runs, so stale queue entries must be reclaimed by + // pruning alone. + let pool = test_pool(256 << 20); + for seed in 0..1000u64 { + let handle = pool.insert(&mut payload(SMALL, seed)); + drop(handle); + } + let len = pool.queue_len(); + assert!(len <= 32, "queue holds {len} entries for zero live chunks"); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn spill_async_evict_round_trip() { + let pool = test_pool(usize::MAX); + pool.enable_spill_without_threads(); + let h = pool.insert(&mut payload(SMALL, 400)); + pool.evict(&h); + assert_eq!(h.residency(), Residency::WriteInFlight); + // Readable while in flight. + { + let pin = h.pin(); + assert_eq!(&pin[..3], &payload(SMALL, 400)[..3]); + } + // The guard dropped before processing, so the eviction commits. 
+ assert!(pool.spill_step()); + assert_eq!(h.residency(), Residency::Evicted); + let stats = pool.stats(); + assert_eq!(stats.spill_scheduled, 1); + assert_eq!(stats.evictions_compress, 1); + pool.poison_free_slots(); + let pin = h.pin(); + assert_eq!(&*pin, &payload(SMALL, 400)[..]); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn spill_freed_while_queued_is_elided() { + let pool = test_pool(usize::MAX); + pool.enable_spill_without_threads(); + let h = pool.insert(&mut payload(SMALL, 401)); + pool.evict(&h); + assert_eq!(h.residency(), Residency::WriteInFlight); + drop(h); + assert!(pool.spill_step()); + let stats = pool.stats(); + assert_eq!(stats.spill_cancelled, 1); + assert_eq!(stats.writes_elided, 1, "freed before compression: elided"); + assert_eq!(stats.extent_bytes_written, 0, "no extent was written"); + assert_eq!(stats.resident_bytes, 0, "slot accounting settled"); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn spill_pinned_at_processing_cancels() { + let pool = test_pool(usize::MAX); + pool.enable_spill_without_threads(); + let h = pool.insert(&mut payload(SMALL, 402)); + pool.evict(&h); + let pin = h.pin(); + assert!(pool.spill_step()); + // Pinned at processing time: cancelled back to resident, no extent. 
+ assert_eq!(h.residency(), Residency::UnbackedResident); + assert_eq!(pool.stats().spill_cancelled, 1); + assert_eq!(pool.stats().extent_bytes_written, 0); + assert_eq!(&*pin, &payload(SMALL, 402)[..]); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap, madvise, and threads + fn spill_threads_end_to_end() { + let pool = test_pool(128 << 10); + pool.set_spill_threads(2); + let mut handles = Vec::new(); + for seed in 0..16 { + handles.push(pool.insert(&mut payload(SMALL, 500 + seed))); + } + pool.quiesce_spill(); + let stats = pool.stats(); + assert!( + stats.spill_scheduled > 0, + "budget pressure should have scheduled spills", + ); + for (i, h) in handles.iter().enumerate() { + let pin = h.pin(); + assert_eq!(&*pin, &payload(SMALL, 500 + u64::cast_from(i))[..]); + } + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn insert_with_fills_in_place() { + let pool = test_pool(usize::MAX); + let want = payload(SMALL, 600); + let h = pool.insert_with(SMALL, |dst| { + assert_eq!(dst.len(), SMALL, "fill sees exactly the chunk length"); + dst.copy_from_slice(&want); + }); + assert_eq!(h.residency(), Residency::UnbackedResident); + assert_eq!(&*h.pin(), &want[..]); + pool.evict(&h); + let pin = h.pin(); + assert_eq!(&*pin, &want[..], "round-trips through the extent"); + + // Empty and oversize fall back like `insert`. + let empty = pool.insert_with(0, |dst| assert!(dst.is_empty())); + assert!(empty.is_empty()); + let big_len = (SIZE_CLASSES[SIZE_CLASSES.len() - 1] / 8) + 1; + let big = pool.insert_with(big_len, |dst| dst.fill(7)); + assert_eq!(big.residency(), Residency::Oversize); + assert_eq!(big.pin().len(), big_len); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn slot_exhaustion_degrades_to_heap() { + // Two 64 KiB slots per class at this capacity; the third insert finds + // no slot and must fall back to the heap rather than panic. 
+ let pool = Pool::new(PoolConfig { + budget_bytes: usize::MAX, + class_capacity_bytes: 128 << 10, + }) + .expect("pool creation"); + let a = pool.insert(&mut payload(SMALL, 700)); + let b = pool.insert(&mut payload(SMALL, 701)); + let c = pool.insert(&mut payload(SMALL, 702)); + assert_eq!(a.residency(), Residency::UnbackedResident); + assert_eq!(b.residency(), Residency::UnbackedResident); + assert_eq!( + c.residency(), + Residency::Oversize, + "fallback is heap-backed" + ); + assert_eq!(pool.stats().slot_exhausted_fallbacks, 1); + assert_eq!(&*c.pin(), &payload(SMALL, 702)[..]); + // Freeing a slotted chunk lets the next insert use the region again. + drop(a); + let d = pool.insert(&mut payload(SMALL, 703)); + assert_eq!(d.residency(), Residency::UnbackedResident); + assert_eq!(&*d.pin(), &payload(SMALL, 703)[..]); + } +} diff --git a/src/ore/src/pool/extent.rs b/src/ore/src/pool/extent.rs new file mode 100644 index 0000000000000..995ab480188da --- /dev/null +++ b/src/ore/src/pool/extent.rs @@ -0,0 +1,232 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Swap-backed extents: the backing store for the buffer pool on nodes whose +//! whole disk is provisioned as swap. +//! +//! An extent is a page-aligned anonymous allocation holding the lz4-compressed +//! bytes of one chunk. 
"Write" compresses into the allocation, which stays +//! resident — the compressed-but-resident middle tier of the pool's ladder — +//! until the pool's RSS target forces [`SwapExtent::pageout`], which pushes +//! the pages to the swap device with `MADV_PAGEOUT`. "Read" issues +//! `MADV_WILLNEED` ahead of the decompress (and makes the pages resident +//! again); "free" is a plain deallocation, with any swapped copy discarded +//! for free. Nothing is ever both uncompressed and on the device. + +use std::alloc::Layout; + +use crate::cast::CastFrom; +use crate::pool::region; + +/// Alignment and size granule of extent allocations. +const EXTENT_ALIGN: usize = 4096; + +/// Length in bytes of the little-endian `u32` uncompressed-size prefix that +/// precedes the compressed bytes, matching the +/// `lz4_flex::block::compress_prepend_size` framing. +const SIZE_PREFIX: usize = 4; + +/// One chunk's compressed backing copy. +#[derive(Debug)] +pub(crate) struct SwapExtent { + ptr: *mut u8, + layout: Layout, + comp_len: usize, + /// Whether the extent's pages are (engine-)resident: set at write and by + /// [`SwapExtent::read_into`], cleared by [`SwapExtent::pageout`]. Drives + /// the pool's `extent_resident_bytes` accounting; mutated only under the + /// owning chunk's state mutex. + resident: bool, +} + +// SAFETY: the extent exclusively owns its allocation; nothing else holds a +// pointer into it, so moving the owner across threads is sound. All access +// goes through the owning chunk's state mutex. +unsafe impl Send for SwapExtent {} + +impl SwapExtent { + /// Compresses `data` into a fresh extent. The pages stay resident; the + /// pool's RSS-target enforcement decides when [`SwapExtent::pageout`] + /// pushes them to the device. + /// + /// Compression goes through a reused thread-local scratch buffer so the + /// extent allocation can be sized to the *actual* compressed payload + /// (rounded to the page granule) rather than lz4's worst case. 
Worst-case + /// sizing costs ~5.6× on compressible data — in swap capacity, in swap + /// write bandwidth per eviction (the whole allocation is paged out), and + /// in `alloc_zeroed` memset traffic on recycled allocations — which at + /// hydration eviction rates backs up device writeback and bloats the + /// working set with swap-cache pages. + pub(crate) fn write(data: &[u64]) -> SwapExtent { + use std::cell::RefCell; + thread_local! { + static SCRATCH: RefCell> = const { RefCell::new(Vec::new()) }; + } + let bytes: &[u8] = bytemuck::cast_slice(data); + SCRATCH.with(|scratch| { + let mut scratch = scratch.borrow_mut(); + let max_out = lz4_flex::block::get_maximum_output_size(bytes.len()); + scratch.resize(SIZE_PREFIX + max_out, 0); + let uncompressed_len = + u32::try_from(bytes.len()).expect("chunk payloads are bounded by the size classes"); + scratch[..SIZE_PREFIX].copy_from_slice(&uncompressed_len.to_le_bytes()); + let compressed = lz4_flex::block::compress_into(bytes, &mut scratch[SIZE_PREFIX..]) + .expect("output sized by get_maximum_output_size"); + let comp_len = SIZE_PREFIX + compressed; + + let size = comp_len.next_multiple_of(EXTENT_ALIGN); + let layout = Layout::from_size_align(size, EXTENT_ALIGN).expect("valid extent layout"); + // SAFETY: `layout` has nonzero size (`size` is at least one granule). + let ptr = unsafe { std::alloc::alloc(layout) }; + if ptr.is_null() { + std::alloc::handle_alloc_error(layout); + } + // SAFETY: `ptr` is a fresh allocation of `size >= comp_len` bytes + // exclusively owned here; the copy plus the tail zeroing below + // initialize every byte, so later borrows of the allocation (in + // `read_into`) see initialized memory. The source is the scratch + // buffer, which cannot alias the fresh allocation. 
+ unsafe { + std::ptr::copy_nonoverlapping(scratch.as_ptr(), ptr, comp_len); + std::ptr::write_bytes(ptr.add(comp_len), 0, size - comp_len); + } + SwapExtent { + ptr, + layout, + comp_len, + resident: true, + } + }) + } + + /// The byte size of the extent's allocation: the granule the resident + /// accounting and the pageout operate on. + pub(crate) fn alloc_size(&self) -> usize { + self.layout.size() + } + + /// Whether the extent's pages are engine-resident (not pushed to the + /// device since the last write or read). + pub(crate) fn is_resident(&self) -> bool { + self.resident + } + + /// Hints the kernel to push the extent's pages to the swap device and + /// marks the extent non-resident. Cheap: the compression is already + /// paid, the madvise is microseconds, and the device write happens on + /// the kernel's asynchronous writeback path. + pub(crate) fn pageout(&mut self) { + region::pageout(self.ptr, self.layout.size()); + self.resident = false; + } + + /// Compressed size in bytes, including the size prefix. + pub(crate) fn comp_len(&self) -> usize { + self.comp_len + } + + /// Hints the kernel to swap the extent's pages back in ahead of a read. + pub(crate) fn prefetch(&self) { + region::willneed(self.ptr, self.layout.size()); + } + + /// Decompresses the extent into `dst`, which must be exactly the chunk's + /// uncompressed length. Reading faults the pages back in, so the extent + /// is resident again afterwards; the caller owns the accounting for that + /// transition (the pool re-counts and re-enqueues it for the RSS target). + pub(crate) fn read_into(&mut self, dst: &mut [u8]) { + self.resident = true; + self.prefetch(); + // SAFETY: the extent exclusively owns `[ptr, ptr + layout.size())` + // and `comp_len <= layout.size()` by construction in `write`. 
+ let buf = unsafe { std::slice::from_raw_parts(self.ptr, self.comp_len) }; + let prefix: [u8; SIZE_PREFIX] = buf[..SIZE_PREFIX].try_into().expect("prefix length"); + let uncompressed_len = usize::cast_from(u32::from_le_bytes(prefix)); + assert_eq!( + uncompressed_len, + dst.len(), + "destination must match the extent's uncompressed length" + ); + let written = lz4_flex::block::decompress_into(&buf[SIZE_PREFIX..], dst) + .expect("extent holds a valid lz4 block"); + assert_eq!(written, dst.len(), "decompressed length mismatch"); + } +} + +impl Drop for SwapExtent { + fn drop(&mut self) { + // SAFETY: `ptr` was returned by `alloc` with exactly this `layout` + // in `write` and is deallocated exactly once, here. + unsafe { + std::alloc::dealloc(self.ptr, self.layout); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // madvise is a foreign call + fn round_trip() { + let data: Vec = (0..10_000).map(|i| i * 37).collect(); + let mut extent = SwapExtent::write(&data); + assert!(extent.comp_len() > SIZE_PREFIX); + extent.prefetch(); + let mut out = vec![0u64; data.len()]; + extent.read_into(bytemuck::cast_slice_mut(&mut out)); + assert_eq!(out, data); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // madvise is a foreign call + fn compressible_data_shrinks() { + let data = vec![42u64; 100_000]; + let mut extent = SwapExtent::write(&data); + assert!(extent.comp_len() < data.len() * 8 / 4); + let mut out = vec![0u64; data.len()]; + extent.read_into(bytemuck::cast_slice_mut(&mut out)); + assert_eq!(out, data); + } + + /// The allocation is sized to the compressed payload, not lz4's worst + /// case: extents must cost swap capacity and write bandwidth in + /// proportion to what they store. 
+ #[mz_ore::test] + #[cfg_attr(miri, ignore)] // madvise is a foreign call + fn allocation_is_sized_to_payload() { + let data = vec![7u64; 100_000]; + let extent = SwapExtent::write(&data); + assert_eq!( + extent.alloc_size(), + extent.comp_len().next_multiple_of(EXTENT_ALIGN), + ); + assert!( + extent.alloc_size() < data.len() * 8 / 8, + "compressible data must not be stored at worst-case size", + ); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // madvise is a foreign call + #[should_panic(expected = "destination must match")] + fn wrong_destination_length_panics() { + let data = vec![1u64; 16]; + let mut extent = SwapExtent::write(&data); + let mut out = vec![0u64; 8]; + extent.read_into(bytemuck::cast_slice_mut(&mut out)); + } +} diff --git a/src/ore/src/pool/region.rs b/src/ore/src/pool/region.rs new file mode 100644 index 0000000000000..7ae78d70dca95 --- /dev/null +++ b/src/ore/src/pool/region.rs @@ -0,0 +1,444 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Size-class virtual-memory regions for the buffer pool. +//! +//! One [`Region`] per size class, each a single anonymous `mmap` reservation. +//! The reservation is virtual; physical memory materializes on first write to +//! a slot. Slots are scoped to residency: eviction releases a slot's physical +//! 
pages with [`dontneed`] and returns the slot index to the free list, and a +//! fault-in allocates a fresh slot — possibly a different one — so a chunk's +//! address is stable only between a fault-in and the next eviction, and +//! pointers into a slot are valid only under the owning chunk's pin. +//! +//! Two fault-amortization mechanisms soften the cost of cycling slots: +//! +//! * Regions whose class is at least one huge page are aligned to the huge +//! page and advised `MADV_HUGEPAGE`, so populating a large slot costs one +//! fault instead of one per 4 KiB. +//! * The free list is split into a *warm* side (pages kept resident; reuse +//! faults nothing and skips the kernel's page zeroing) and a *cold* side +//! (pages released). The pool decides which side a freed slot joins, +//! bounding total warm bytes as a fraction of its budget. + +use std::io; +use std::sync::Mutex; + +use crate::cast::CastFrom; + +/// Chunk size classes in bytes, smallest first. The pool places each chunk in +/// the smallest class that fits its payload. +/// +/// The top classes deliberately overshoot the batchers' nominal ~2 MiB chunk +/// target: the ship heuristic re-targets the next 2 MiB boundary whenever a +/// single push crosses one, so real chunk sizes are multimodal with bands +/// just under each boundary. A class that fits only the nominal target sends +/// the higher bands to the unpageable heap fallback. Slot internal +/// fragmentation is virtual-only — slots populate lazily, so a chunk costs +/// physical memory for its payload, not its class size. +pub(crate) const SIZE_CLASSES: [usize; 8] = [ + 64 << 10, + 128 << 10, + 256 << 10, + 512 << 10, + 1 << 20, + 2 << 20, + 4 << 20, + 8 << 20, +]; + +/// One anonymous virtual-memory reservation serving fixed-size slots of a +/// single size class. +#[derive(Debug)] +pub(crate) struct Region { + base: *mut u8, + capacity: usize, + class_size: usize, + slots: Mutex, +} + +/// Free-list-plus-bump slot allocator. 
A slot index returns to a free list +/// whenever its chunk stops being resident — eviction and free alike. Warm +/// slots keep their physical pages (reuse is fault-free); cold slots had +/// theirs released. Never-allocated slots beyond the high-water mark are +/// untouched virtual space and fault on first write like cold ones. +#[derive(Debug)] +struct SlotAllocator { + free_warm: Vec, + free_cold: Vec, + high_water: u32, + max_slots: u32, +} + +// SAFETY: `base` points at an anonymous mapping owned exclusively by this +// `Region` for its whole lifetime. Slot allocation is serialized by the +// `slots` mutex, and access to a slot's contents is serialized by the +// owning chunk's state mutex; the raw pointer itself carries no thread +// affinity. +unsafe impl Send for Region {} +// SAFETY: see the `Send` justification; all interior mutability is behind +// the `slots` mutex, and disjoint slots are written only by their owning +// chunks. +unsafe impl Sync for Region {} + +impl Region { + /// Reserves a region of `capacity_bytes` (rounded down to a whole number + /// of slots) for slots of `class_size` bytes. + /// + /// On Linux, regions whose class is at least [`HUGE_PAGE`] are aligned to + /// the huge page and advised `MADV_HUGEPAGE`: their slots tile huge-page + /// boundaries exactly, so populating a slot is one huge-page fault rather + /// than one fault per 4 KiB, and a whole-slot [`dontneed`] frees whole + /// huge pages without splitting any. + pub(crate) fn new(class_size: usize, capacity_bytes: usize) -> io::Result { + assert!(class_size > 0 && class_size % page_size() == 0); + let capacity = capacity_bytes - capacity_bytes % class_size; + if capacity == 0 { + // A capacity below one slot yields an empty region: `alloc` + // always answers `None` and the caller's exhaustion fallback + // carries the class. No mapping exists; drop has nothing to do. 
+ return Ok(Region { + base: std::ptr::NonNull::::dangling().as_ptr(), + capacity: 0, + class_size, + slots: Mutex::new(SlotAllocator { + free_warm: Vec::new(), + free_cold: Vec::new(), + high_water: 0, + max_slots: 0, + }), + }); + } + let max_slots = u32::try_from(capacity / class_size).expect("slot count fits u32"); + let base = map_region(class_size, capacity)?; + Ok(Region { + base, + capacity, + class_size, + slots: Mutex::new(SlotAllocator { + free_warm: Vec::new(), + free_cold: Vec::new(), + high_water: 0, + max_slots, + }), + }) + } + + /// Size in bytes of every slot in this region. + pub(crate) fn class_size(&self) -> usize { + self.class_size + } + + /// Allocates a slot index, or `None` if every slot of the class is in + /// use; the flag reports whether the slot came from the warm list (its + /// pages are resident; writing it faults nothing). Warm slots are + /// preferred, then cold, then never-touched bump slots. + /// + /// Slots are scoped to residency (eviction frees them), so demand scales + /// with the *resident* set — bounded by the pool budget plus pinned and + /// in-flight slack — and exhaustion means residency outgrew + /// `class_capacity_bytes`; callers degrade rather than fail. + pub(crate) fn alloc(&self) -> Option<(u32, bool)> { + let mut slots = self.slots.lock().expect("region allocator poisoned"); + if let Some(slot) = slots.free_warm.pop() { + return Some((slot, true)); + } + if let Some(slot) = slots.free_cold.pop() { + return Some((slot, false)); + } + if slots.high_water == slots.max_slots { + return None; + } + let slot = slots.high_water; + slots.high_water += 1; + Some((slot, false)) + } + + /// Returns a slot to the warm or cold free list. The caller must be + /// freeing the chunk that owned the slot, must have released the slot's + /// physical pages iff `warm` is false, and owns the warm-bytes accounting + /// that bounds the warm side. 
+ pub(crate) fn free(&self, slot: u32, warm: bool) { + let mut slots = self.slots.lock().expect("region allocator poisoned"); + debug_assert!(slot < slots.high_water); + if warm { + slots.free_warm.push(slot); + } else { + slots.free_cold.push(slot); + } + } + + /// Test hook: overwrites every free slot with `0xDE` so stale contents + /// cannot masquerade as correct data when a slot is reused. + pub(crate) fn poison_free_slots(&self) { + let slots = self.slots.lock().expect("region allocator poisoned"); + for &slot in slots.free_warm.iter().chain(slots.free_cold.iter()) { + let offset = usize::cast_from(slot) * self.class_size; + // SAFETY: the slot is on a free list and the allocator mutex is + // held, so no chunk owns it and no allocation can race; the write + // stays within the region's mapping. + unsafe { + std::ptr::write_bytes(self.base.add(offset), 0xDE, self.class_size); + } + } + } + + /// The base address of a slot, fixed while its owning chunk is resident. + pub(crate) fn slot_ptr(&self, slot: u32) -> *mut u8 { + let offset = usize::cast_from(slot) * self.class_size; + debug_assert!(offset + self.class_size <= self.capacity); + // SAFETY: `slot` was handed out by `alloc`, so `offset + class_size` + // lies within the single `capacity`-byte mapping that `base` points + // to; the add stays in bounds of one allocated object. + unsafe { self.base.add(offset) } + } +} + +/// The transparent-huge-page size assumed for region alignment. Linux x86-64 +/// and aarch64 (4 KiB base pages) both use 2 MiB; if a platform differs, the +/// alignment is merely unhelpful, never wrong. +const HUGE_PAGE: usize = 2 << 20; + +/// Maps the anonymous reservation backing a region. Classes of at least +/// [`HUGE_PAGE`] get a huge-page-aligned base (over-map and trim) and +/// `MADV_HUGEPAGE`, so slot population faults whole huge pages; the advice is +/// best-effort and the kernel falls back to base pages under fragmentation. 
+fn map_region(class_size: usize, capacity: usize) -> io::Result<*mut u8> { + #[cfg(target_os = "linux")] + let flags = libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE; + #[cfg(not(target_os = "linux"))] + let flags = libc::MAP_PRIVATE | libc::MAP_ANONYMOUS; + + let huge = cfg!(target_os = "linux") && class_size >= HUGE_PAGE; + let map_len = if huge { capacity + HUGE_PAGE } else { capacity }; + + // SAFETY: anonymous mapping with a null hint; the kernel picks a fresh + // range that aliases no existing Rust object. `map_len` is positive and + // page-aligned by construction. + let raw = unsafe { + libc::mmap( + std::ptr::null_mut(), + map_len, + libc::PROT_READ | libc::PROT_WRITE, + flags, + -1, + 0, + ) + }; + if raw == libc::MAP_FAILED { + return Err(io::Error::last_os_error()); + } + let raw = raw.cast::(); + if !huge { + return Ok(raw); + } + + // Trim the over-mapped head and tail so `base` is huge-page aligned and + // the region owns exactly `capacity` bytes; `Drop` unmaps that range. + let addr = raw.addr(); + let aligned = addr.next_multiple_of(HUGE_PAGE); + let head = aligned - addr; + let tail = map_len - head - capacity; + // SAFETY: head and tail are page-aligned subranges of the mapping just + // created, disjoint from the `[aligned, aligned + capacity)` range the + // region keeps; nothing references them. + unsafe { + if head > 0 { + libc::munmap(raw.cast::(), head); + } + if tail > 0 { + libc::munmap(raw.add(head + capacity).cast::(), tail); + } + } + // SAFETY: `head` stays within the original mapping. + let base = unsafe { raw.add(head) }; + + #[cfg(target_os = "linux")] + { + // SAFETY: `base`/`capacity` describe the live aligned mapping; the + // advice is a non-destructive hint and failure is ignorable. + unsafe { + libc::madvise(base.cast::(), capacity, libc::MADV_HUGEPAGE); + } + } + Ok(base) +} + +impl Drop for Region { + fn drop(&mut self) { + // Empty regions never created a mapping. 
+ if self.capacity == 0 { + return; + } + // SAFETY: `base`/`capacity` describe exactly the mapping created in + // `new`, and dropping the region means no chunk (and hence no + // outstanding borrow) refers into it any longer. + unsafe { + libc::munmap(self.base.cast::(), self.capacity); + } + } +} + +/// Releases the physical pages of the page-aligned subrange of +/// `[ptr, ptr + len)`, keeping the virtual range mapped. +/// +/// # Safety +/// +/// The range must lie within a live mapping exclusively owned by the caller, +/// with no outstanding references into it. After the call the range's contents +/// are undefined: Linux replaces them with zero pages, but other systems +/// (macOS in particular) may keep the old bytes resident, so callers must +/// fully overwrite the range before reading it again. +pub(crate) unsafe fn dontneed(ptr: *mut u8, len: usize) { + madvise_aligned(ptr, len, libc::MADV_DONTNEED); +} + +/// Hints the kernel to reclaim the page-aligned subrange of `[ptr, ptr + len)` +/// immediately, writing it to the swap device. Contents are preserved; this is +/// a non-destructive hint. No-op outside Linux. +#[cfg(target_os = "linux")] +pub(crate) fn pageout(ptr: *mut u8, len: usize) { + madvise_aligned(ptr, len, libc::MADV_PAGEOUT); +} + +/// See the Linux definition; reclaim hints have no portable equivalent. +#[cfg(not(target_os = "linux"))] +pub(crate) fn pageout(_ptr: *mut u8, _len: usize) {} + +/// Hints the kernel to fault the page-aligned subrange of `[ptr, ptr + len)` +/// back in ahead of need: asynchronous swap-in, the swap-backed extent store's +/// readahead mechanism. Contents are preserved. No-op outside Linux. +#[cfg(target_os = "linux")] +pub(crate) fn willneed(ptr: *mut u8, len: usize) { + madvise_aligned(ptr, len, libc::MADV_WILLNEED); +} + +/// See the Linux definition; prefetch hints have no portable equivalent. 
+#[cfg(not(target_os = "linux"))] +pub(crate) fn willneed(_ptr: *mut u8, _len: usize) {} + +/// Applies `advice` to the largest page-aligned subrange of `[ptr, ptr + len)`, +/// rounding the start up and the end down so the advice never spills onto +/// pages the range only partially covers. +fn madvise_aligned(ptr: *mut u8, len: usize, advice: libc::c_int) { + if len == 0 { + return; + } + let page = page_size(); + let base_addr = ptr.addr(); + let Some(start_unaligned) = base_addr.checked_add(page - 1) else { + return; + }; + let Some(end_unaligned) = base_addr.checked_add(len) else { + return; + }; + let aligned_start_addr = start_unaligned & !(page - 1); + let aligned_end_addr = end_unaligned & !(page - 1); + if aligned_end_addr <= aligned_start_addr { + return; + } + let aligned_len = aligned_end_addr - aligned_start_addr; + // SAFETY: `aligned_start_addr` lies in `[base_addr, base_addr + len]` by + // construction (rounding the start up cannot exceed `end_unaligned`, and + // the early return guarantees start < end), so `byte_add` stays within + // the caller's range and preserves provenance. + let aligned_ptr = + unsafe { ptr.byte_add(aligned_start_addr - base_addr) }.cast::(); + // SAFETY: pointer and length describe a fully page-aligned subrange of the + // caller's live mapping (justified above). Callers passing destructive + // advice (`MADV_DONTNEED`) uphold the exclusivity contract documented on + // `dontneed`; the remaining advice values are non-mutating hints. + unsafe { + libc::madvise(aligned_ptr, aligned_len, advice); + } +} + +pub(crate) fn page_size() -> usize { + // SAFETY: `sysconf` with a valid argument is safe. 
+ let raw = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; + usize::try_from(raw).expect("page size is positive and fits usize") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn alloc_free_reuses_slots() { + let region = Region::new(64 << 10, 1 << 20).expect("mmap"); + let (a, warm_a) = region.alloc().expect("slot"); + let (b, _) = region.alloc().expect("slot"); + assert!(!warm_a, "bump slots are not warm"); + assert_ne!(a, b); + assert_ne!(region.slot_ptr(a), region.slot_ptr(b)); + let ptr_a = region.slot_ptr(a); + // A warm free is preferred by the next alloc and reported warm. + region.free(a, true); + let (c, warm_c) = region.alloc().expect("slot"); + assert_eq!(c, a); + assert!(warm_c); + assert_eq!(region.slot_ptr(c), ptr_a); + // A cold free comes back, but not warm. + region.free(c, false); + let (d, warm_d) = region.alloc().expect("slot"); + assert_eq!(d, a); + assert!(!warm_d); + } + + /// Hugepage-class regions get a huge-page-aligned base, so slots tile + /// huge-page boundaries exactly. 
+ #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn hugepage_class_base_is_aligned() { + let region = Region::new(2 << 20, 16 << 20).expect("mmap"); + let (slot, _) = region.alloc().expect("slot"); + if cfg!(target_os = "linux") { + assert_eq!( + region.slot_ptr(slot).addr() % HUGE_PAGE, + 0, + "hugepage-class slots must be huge-page aligned", + ); + } + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn exhaustion_returns_none() { + let region = Region::new(64 << 10, 128 << 10).expect("mmap"); + assert!(region.alloc().is_some()); + assert!(region.alloc().is_some()); + assert!(region.alloc().is_none(), "third slot exceeds capacity"); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn slots_are_writable_and_advice_is_accepted() { + let region = Region::new(64 << 10, 1 << 20).expect("mmap"); + let (slot, _) = region.alloc().expect("slot"); + let ptr = region.slot_ptr(slot); + // SAFETY: freshly allocated slot, exclusively owned by this test. + unsafe { + std::ptr::write_bytes(ptr, 0xAB, region.class_size()); + } + pageout(ptr, region.class_size()); + willneed(ptr, region.class_size()); + // SAFETY: the slot is exclusively owned and is not read again before + // being overwritten (it is not read again at all). 
+ unsafe { + dontneed(ptr, region.class_size()); + } + } +} diff --git a/src/storage/src/storage_state.rs b/src/storage/src/storage_state.rs index 9526994e90be6..53e8723f000a1 100644 --- a/src/storage/src/storage_state.rs +++ b/src/storage/src/storage_state.rs @@ -854,6 +854,10 @@ impl<'w> Worker<'w> { let enabled = ENABLE_UPSERT_PAGED_SPILL .get(self.storage_state.storage_configuration.config_set()); + info!( + worker = self.timely_worker.index(), + enabled, "upsert stash pager: applying gate", + ); crate::upsert::upsert_stash_pager::set_enabled(enabled); } } diff --git a/src/storage/src/upsert.rs b/src/storage/src/upsert.rs index 16269fd30fb82..e8e2e6bb6b573 100644 --- a/src/storage/src/upsert.rs +++ b/src/storage/src/upsert.rs @@ -298,35 +298,42 @@ upsert_source_time_unit!(GtidPartition, Lsn); /// Pager for the upsert-v2 source stash. /// -/// This draws from the same process-wide [`TieredPolicy`] budget pool as the -/// compute column-paged batcher — there is one budget and one underlying -/// `mz_ore::pager` — but whether the stash *uses* it is gated by storage's own +/// This draws from the process-wide shared spill mechanism — the buffer pool +/// when compute's config installed pool mode, else the [`TieredPolicy`] +/// budget — but whether the stash *uses* it is gated by storage's own /// `enable_upsert_paged_spill` flag, independently of compute's -/// `enable_column_paged_batcher_spill`. The shared pool's budget / backend / -/// codec are configured by compute's `apply_tiered_config` (storage and compute -/// run in the same `clusterd` process). +/// `enable_column_paged_batcher_spill`. The shared mechanism's budget, +/// backend, and codec are configured by compute's config handler (storage +/// and compute run in the same `clusterd` process); each `pager` call +/// resolves against whichever mechanism the last config apply installed. 
/// /// [`TieredPolicy`]: mz_timely_util::column_pager::policy::TieredPolicy pub mod upsert_stash_pager { - use std::sync::{LazyLock, RwLock}; + use std::sync::atomic::{AtomicBool, Ordering}; use mz_timely_util::column_pager::{ColumnPager, shared_pager}; - /// Active pager handed to upsert source-stash batchers. Defaults to - /// disabled (every chunk resident) until [`set_enabled`] turns it on. - static PAGER: LazyLock> = - LazyLock::new(|| RwLock::new(ColumnPager::disabled())); + /// Whether the stash participates in the shared spill mechanism, applied + /// from storage configuration. + static ENABLED: AtomicBool = AtomicBool::new(false); - /// Enable or disable the stash's use of the shared column pager. When - /// enabled, the stash spills through the shared budget pool; when disabled - /// it keeps every chunk resident. + /// Enable or disable the stash's use of the shared spill mechanism. When + /// enabled, the stash spills through the shared budget; when disabled it + /// keeps every chunk resident. Takes effect for dataflows rendered after + /// the change; running dataflows keep the pager they captured. pub fn set_enabled(enabled: bool) { - *PAGER.write().expect("upsert stash pager poisoned") = shared_pager(enabled); + ENABLED.store(enabled, Ordering::Relaxed); } - /// The current upsert-stash pager. Cheap: clones the inner `Arc`. + /// The upsert-stash pager, resolved against the enable flag and the + /// process-wide shared mechanism (buffer pool vs tiered) at the moment of + /// the call. Callers capture the result at dataflow render and keep it + /// for the dataflow's lifetime, so resolution happens as late as + /// possible: a source rendered (or restarted) after a mechanism change + /// picks up the new mechanism, rather than the one in effect when + /// storage configuration last arrived. 
pub fn pager() -> ColumnPager { - PAGER.read().expect("upsert stash pager poisoned").clone() + shared_pager(ENABLED.load(Ordering::Relaxed)) } } diff --git a/src/storage/src/upsert_continual_feedback_v2.rs b/src/storage/src/upsert_continual_feedback_v2.rs index c3782be0b42f4..320f26b55fd9e 100644 --- a/src/storage/src/upsert_continual_feedback_v2.rs +++ b/src/storage/src/upsert_continual_feedback_v2.rs @@ -42,9 +42,10 @@ //! to learn which times have been committed. When the persist frontier //! reaches the resume upper, rehydration is complete. //! -//! 3. **Seal & drain.** Call `batcher.seal(input_upper)` to extract all -//! source-finalized entries as sorted, consolidated `Column` chunks. Each -//! entry is classified: +//! 3. **Seal & drain.** Call `batcher.seal_paged(input_upper)` to extract all +//! source-finalized entries as sorted, consolidated `Column` chunks, kept +//! paged and rehydrated one chunk at a time by the drain. Each entry is +//! classified: //! - **Eligible** (at the persist frontier): the persist trace has the //! correct "before" state for this time. Look up the old value via a //! cursor, emit a retraction if present, and emit the new value. @@ -638,7 +639,12 @@ where // Step 1 already consolidated `push_buffer` through the chunker // (which readies a complete chunk per `push_into`), so the // chunker holds nothing pending here and we can seal directly. - let (sealed, _description) = batcher.seal(input_upper.clone()); + // + // `seal_paged` keeps the sealed chunks paged; the drain below + // rehydrates them one at a time, so a large drain (a frontier + // advance releasing a snapshot's worth of stash at once) holds + // at most one chunk resident rather than the whole backlog. + let (sealed, _description) = batcher.seal_paged(input_upper.clone()); // Frontier of data remaining in the batcher (ts >= input_upper). 
let remaining_frontier = batcher.frontier().to_owned(); @@ -653,8 +659,8 @@ where &*cap, &persist_upper, &mut persist_trace, - &source_config.worker_id, - &source_config.id, + source_config.worker_id, + source_config.id, ) .await; @@ -681,22 +687,19 @@ where let min_ineligible_ts = ineligible.iter().map(|(_, ts, _)| ts).min().cloned(); flush_to_batcher(&mut ineligible, &mut chunker, &mut batcher); - let has_remaining = !remaining_frontier.is_empty() || min_ineligible_ts.is_some(); - if has_remaining { - let min_ts = match ( - remaining_frontier.elements().first(), - min_ineligible_ts.as_ref(), - ) { - (Some(a), Some(b)) => std::cmp::min(a, b).clone(), - (Some(a), None) => a.clone(), - (None, Some(b)) => b.clone(), - (None, None) => unreachable!(), - }; - cap.downgrade(&min_ts); - } else { + // `Option::min` alone would be wrong here — `None` sorts low — + // so chain the candidates and take the min over present ones. + let min_ts = remaining_frontier + .elements() + .first() + .into_iter() + .chain(min_ineligible_ts.as_ref()) + .min(); + match min_ts { + Some(min_ts) => cap.downgrade(min_ts), // Batcher is completely empty — drop the capability so // downstream operators can make progress. - stash_cap = None; + None => stash_cap = None, } } @@ -736,48 +739,43 @@ struct DrainStats { } /// Process sealed chunks from the batcher, classifying each entry by its -/// timestamp relative to `persist_upper`: entries at the frontier are eligible -/// for processing now (cursor lookup + output), entries above it are returned -/// in `ineligible` for re-stashing, and entries below it are already persisted -/// and dropped (see the body for why). +/// timestamp relative to `persist_upper`: +/// +/// * `ts == persist_upper`: eligible for processing now (cursor lookup + +/// output). +/// * `ts > persist_upper`: not yet processable; returned in `ineligible` +/// for re-stashing until the feedback frontier catches up to it. 
+/// * `ts < persist_upper`: already persisted by some writer and not +/// relevant anymore; DROPPED. The downstream persist_sink would filter +/// such updates out anyway since the shard upper is further ahead, and +/// our state is already up-to-date to `persist_upper` so we could not +/// emit correct retractions for it. Re-stashing it would strand the data +/// forever (`persist_upper` only advances, so `ts == persist_upper` can +/// never again hold) and pin the operator's output frontier below the +/// shard upper. This mirrors v1's `relevant = persist_upper.less_equal(ts)`. /// /// The sealed chunks are already sorted and consolidated by the MergeBatcher, /// so the trace cursor walks forward through keys in order — seeks amortize. +/// Chunks are pulled from the iterator one at a time and dropped before the +/// next is requested, so at most one rehydrated chunk is resident regardless +/// of drain size; eligible values are emitted straight from the column's +/// `RowRef` with no owned `UpsertDiff` copy, and only the re-stashed +/// ineligible set is materialized. async fn drain_sealed_input( - sealed: Vec>>, + sealed: impl Iterator>>, ineligible: &mut Vec>, output_handle: &UpsertOutputHandle, output_cap: &Capability, persist_upper: &Antichain, trace: &mut TraceAgent>, - worker_id: &usize, - source_id: &GlobalId, + worker_id: usize, + source_id: GlobalId, ) -> DrainStats where T: TotalOrder + Lattice + timely::ExchangeData + Timestamp + Clone + Debug + Ord + Sync, T: columnation::Columnation + columnar::Columnar, O: columnar::Columnar, { - // Classify each entry by its timestamp relative to `persist_upper`: - // - // * `ts == persist_upper`: eligible for processing now. - // * `ts > persist_upper`: not yet processable; re-stashed (ineligible) - // until the feedback frontier catches up to it. - // * `ts < persist_upper`: already persisted by some writer and not - // relevant anymore. We DROP it. 
The downstream persist_sink would - // filter such updates out anyway since the shard upper is further - // ahead, and our state is already up-to-date to `persist_upper` so we - // could not emit correct retractions for it. Re-stashing it would - // strand the data forever (`persist_upper` only advances, so - // `ts == persist_upper` can never again hold) and pin the operator's - // output frontier below the shard upper. This mirrors v1's - // `relevant = persist_upper.less_equal(ts)`. - // Walk the sealed chunks by reference rather than collecting the eligible - // set into an owned Vec. The chunks are globally sorted (the seal merges - // all chains into one run), so the cursor seeks still walk forward and - // amortize, and eligible values are emitted straight from the column's - // `RowRef` with no owned `UpsertDiff` copy. Only the re-stashed ineligible - // set is materialized. let mut eligible_count: u64 = 0; let mut result_count: u64 = 0; let mut output_count: u64 = 0; @@ -787,7 +785,7 @@ where let (mut cursor, storage) = trace.cursor(); - for chunk in &sealed { + for chunk in sealed { for (key, ts, diff) in chunk.borrow().into_index_iter() { let ts = ::into_owned(ts); if !persist_upper.less_equal(&ts) { diff --git a/src/timely-util/examples/column_paged_spill.rs b/src/timely-util/examples/column_paged_spill.rs index 7aa0dc7fcc7e8..4ab47889ead05 100644 --- a/src/timely-util/examples/column_paged_spill.rs +++ b/src/timely-util/examples/column_paged_spill.rs @@ -41,8 +41,7 @@ use differential_dataflow::trace::implementations::spine_fueled::Spine; use differential_dataflow::trace::rc_blanket_impls::RcBuilder; use mz_ore::cast::{CastFrom, CastLossy, ReinterpretCast}; use mz_ore::pager::{self, Backend}; -use mz_timely_util::column_pager::policy::TieredPolicy; -use mz_timely_util::column_pager::{ColumnPager, set_global_pager}; +use mz_timely_util::column_pager::apply_tiered_config; use mz_timely_util::columnar::Col2ValPagedBatcher; use 
mz_timely_util::columnar::Column; use mz_timely_util::columnar::batcher::ColumnChunker; @@ -79,13 +78,10 @@ struct Config { fn install_pager(spill: bool, budget: usize) { if spill { // Each process keeps a single `mz-pager-{pid}-{nonce}` subdir under - // this root; reused across `set_global_pager` reinstalls. + // this root; reused across config reapplies. pager::set_scratch_dir(std::env::temp_dir()); - let policy = Arc::new(TieredPolicy::new(budget, Backend::File, None)); - set_global_pager(ColumnPager::new(policy)); - } else { - set_global_pager(ColumnPager::disabled()); } + apply_tiered_config(spill, budget, Backend::File, None, false); } fn run_dataflow(cfg: &Config, label: &str) -> Duration { diff --git a/src/timely-util/src/column_pager.rs b/src/timely-util/src/column_pager.rs index 6882c28f4eed8..43cb1a1627651 100644 --- a/src/timely-util/src/column_pager.rs +++ b/src/timely-util/src/column_pager.rs @@ -25,6 +25,11 @@ //! 2. A [`ColumnPager`] that drains a `Column` into a [`PagedColumn`] and //! rehydrates it on demand. //! 3. Lz4 frame-format compression as an optional codec. +//! 4. A pooled path ([`PageDecision::Pool`]) that hands the body to an +//! [`mz_ore::pool::Pool`] instead of a pager backend. Residency becomes a +//! state of the pool's chunk handle rather than a property baked in at +//! pageout time — the prototype seam for +//! `doc/developer/design/20260610_buffer_managed_state.md`. //! //! The serialization uses the existing [`ContainerBytes`] protocol on //! `Column`, so we get a single byte layout that both raw and compressed @@ -37,7 +42,7 @@ pub mod policy; use std::io::{self, Read}; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, LazyLock, RwLock}; +use std::sync::{Arc, LazyLock}; use columnar::Columnar; use lz4_flex::frame::{FrameDecoder, FrameEncoder}; @@ -63,7 +68,7 @@ pub struct PageHint { } /// Outcome of a policy decision. 
-#[derive(Copy, Clone, Debug)] +#[derive(Clone, Debug)] pub enum PageDecision { /// Keep the column resident; no I/O, no compression. Skip, @@ -74,6 +79,11 @@ pub enum PageDecision { /// Compression codec, or `None` for raw bytes. codec: Option, }, + /// Hand the body to the given buffer pool. The pool owns residency from + /// here on: it enforces its own resident-bytes budget and compresses + /// into swap-backed extents at eviction time, so neither a backend nor a + /// codec choice applies. + Pool(mz_ore::pool::Pool), } /// Notifications the column-pager sends back to the policy. Implementations @@ -172,6 +182,16 @@ pub enum PagedColumn { /// Sizing metadata. meta: Meta, }, + /// Body held as a buffer-pool chunk. Residency is a state of the handle, + /// not of this variant: the pool keeps the chunk resident or evicts it + /// to a swap-backed extent under its own budget, and + /// [`ColumnPager::take`] reads it back from wherever it currently lives. + Pooled { + /// Pool handle owning the chunk. + handle: mz_ore::pool::ChunkHandle, + /// Sizing metadata. + meta: Meta, + }, } /// Drop guard that returns budget to a [`PagingPolicy`] when a @@ -227,6 +247,16 @@ impl ColumnPager { pub fn disabled() -> Self { Self::new(Arc::new(AlwaysResidentPolicy)) } + + /// Constructs a pager backed by `pool`: every non-empty [`page`] routes + /// the body into the pool, which enforces its own resident-bytes budget + /// (see [`policy::PoolPolicy`]). A prototype seam, opt-in via callers' + /// pager injection points rather than the global pager plumbing. + /// + /// [`page`]: ColumnPager::page + pub fn pooled(pool: mz_ore::pool::Pool) -> Self { + Self::new(Arc::new(policy::PoolPolicy::new(pool))) + } } /// Policy that keeps every column resident and discards events. Backs @@ -247,16 +277,19 @@ impl PagingPolicy for AlwaysResidentPolicy { // either duplicate the global flag at the struct level or invite confusion // about which configuration wins." 
// -// The lower-level `mz_ore::pager` already uses a global atomic for backend -// selection. This module's policy/budget layer mirrors that shape: one -// `ColumnPager` per process, swapped atomically when the controller changes -// the configuration. Merge batchers clone the `Arc` inside on use; live -// reinstalls take effect on the next call without per-thread coordination. +// The configuration state is exactly two bits — which shared mechanism is +// installed ([`POOL_MODE`]) and whether compute's own batchers are enabled +// ([`COMPUTE_ENABLED`]) — plus the two mechanism singletons and the +// [`SWAP_PAGEOUT`] toggle. Every pager a consumer sees is *derived* from +// those bits at the moment it asks ([`global_pager`] for compute, +// [`shared_pager`] for per-consumer opt-ins), so there is one resolution +// path and nothing cached to fall out of sync. Consumers that capture a +// pager (at render, say) keep it until they next ask; live reconfiguration +// takes effect on the next call. -/// Process-global active pager. Defaults to [`ColumnPager::disabled`] -/// until worker init calls [`set_global_pager`]. -static GLOBAL_PAGER: LazyLock> = - LazyLock::new(|| RwLock::new(ColumnPager::disabled())); +/// Whether compute's own batchers page through the shared mechanism. +/// [`global_pager`] derives from this; set by the `apply_*_config` calls. +static COMPUTE_ENABLED: AtomicBool = AtomicBool::new(false); /// Process-global toggle for `MADV_PAGEOUT` on the lz4 + swap spill path. /// @@ -269,17 +302,6 @@ static GLOBAL_PAGER: LazyLock> = /// to off; the eager-reclaim syscall stays gated until proven. static SWAP_PAGEOUT: AtomicBool = AtomicBool::new(false); -/// Install `pager` as the process-wide active pager. Subsequent -/// [`global_pager`] calls return a clone of this value across all threads. -/// -/// Prefer [`apply_tiered_config`] for the production path so the -/// `TieredPolicy` budget atomic stays stable across reconfigures. 
Direct -/// `set_global_pager` use is appropriate for tests, the disabled pager, or -/// callers that intentionally want a fresh policy. -pub fn set_global_pager(pager: ColumnPager) { - *GLOBAL_PAGER.write().expect("global pager poisoned") = pager; -} - /// Process-wide [`policy::TieredPolicy`] singleton. /// /// Why a singleton: every `ResidentTicket` keeps an `Arc` @@ -307,11 +329,11 @@ pub fn tiered_policy() -> &'static policy::TieredPolicy { /// [`policy::TieredPolicy`] so in-flight `ResidentTicket`s remain coherent /// with the running budget after the operator tunes any of the inputs. /// -/// When `enabled` is true, installs a [`ColumnPager`] backed by the -/// singleton policy. When false, installs [`ColumnPager::disabled`] — -/// in-flight tickets still credit the singleton, which is harmless: the -/// budget grows above the configured total until the next enable reconciles -/// it via `reconfigure`. +/// Makes the tiered policy the shared mechanism; [`global_pager`] resolves +/// to it when `enabled` and to the disabled pager otherwise. With paging +/// disabled, in-flight tickets still credit the singleton, which is +/// harmless: the budget grows above the configured total until the next +/// enable reconciles it via `reconfigure`. 
/// /// `swap_pageout` toggles `MADV_PAGEOUT` on the lz4 + swap spill path (see /// `SWAP_PAGEOUT`); it is stored unconditionally so the next `page` call @@ -324,40 +346,145 @@ pub fn apply_tiered_config( swap_pageout: bool, ) { SWAP_PAGEOUT.store(swap_pageout, Ordering::Relaxed); - let p: &Arc = &TIERED_POLICY; - p.reconfigure(total_budget, backend, codec); - if enabled { - #[allow(clippy::clone_on_ref_ptr)] - let dyn_policy: Arc = p.clone(); - set_global_pager(ColumnPager::new(dyn_policy)); - } else { - set_global_pager(ColumnPager::disabled()); - } + TIERED_POLICY.reconfigure(total_budget, backend, codec); + POOL_MODE.store(false, std::sync::atomic::Ordering::Relaxed); + COMPUTE_ENABLED.store(enabled, std::sync::atomic::Ordering::Relaxed); } -/// Returns the current global pager. Cheap: clones the inner `Arc`. +/// Process-wide buffer pool shared by every pooled pager in the process. +/// +/// A singleton for the same reason [`TIERED_POLICY`] is one: live +/// [`PagedColumn::Pooled`] handles keep their `Arc` into the pool, so +/// replacing it on reconfigure would split residency accounting across two +/// budgets. Operator-driven tunes go through +/// [`mz_ore::pool::Pool::set_budget`] on the one instance instead. +/// +/// Construction reserves virtual address space only (a few GiB per size +/// class); physical memory is paid per resident chunk. On the rare platforms +/// or configurations where the reservation fails, the pool is permanently +/// unavailable for this process and [`apply_pool_config`] reports that by +/// returning `false` so callers can fall back to the tiered path. +static GLOBAL_POOL: std::sync::OnceLock> = std::sync::OnceLock::new(); + +/// Whether the pool is the active shared spill mechanism (set by +/// [`apply_pool_config`], cleared by [`apply_tiered_config`]). Read by +/// [`shared_pager`] so per-consumer opt-ins follow whichever mechanism the +/// last config apply installed. 
+static POOL_MODE: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false); + +/// Returns the process-wide buffer pool, initializing it on first call. +/// `None` if the virtual reservation failed at first use. +pub fn global_pool() -> Option { + GLOBAL_POOL + .get_or_init( + || match mz_ore::pool::Pool::new(mz_ore::pool::PoolConfig::default()) { + Ok(pool) => Some(pool), + Err(err) => { + tracing::warn!( + %err, + "column pager: buffer pool reservation failed; pool mode unavailable", + ); + None + } + }, + ) + .clone() +} + +/// Returns the process-wide buffer pool only if something already +/// initialized it; never triggers the virtual reservation itself. Metrics +/// scrapes read through this so that observing a process (which may never +/// enable pool mode) does not mmap the pool's address space as a side +/// effect. +pub fn global_pool_peek() -> Option { + GLOBAL_POOL.get().cloned().flatten() +} + +/// Apply a pool-backed pager configuration. Returns `false` (and changes +/// nothing) if the pool is unavailable, so the caller can fall back to +/// [`apply_tiered_config`]. +/// +/// On success the pool becomes the active shared mechanism — [`global_pager`] +/// resolves to it when `enabled`, and per-consumer opt-ins via +/// [`shared_pager`] reach it either way — and the pool's resident budget is +/// retuned in place so live handles stay coherent. +pub fn apply_pool_config(cfg: PoolPagerConfig) -> bool { + let Some(pool) = global_pool() else { + return false; + }; + pool.set_budget(cfg.budget_bytes); + pool.set_rss_target(cfg.rss_target_bytes); + pool.set_spill_threads(cfg.spill_threads); + pool.set_eager_backing(cfg.eager_backing); + POOL_MODE.store(true, std::sync::atomic::Ordering::Relaxed); + COMPUTE_ENABLED.store(cfg.enabled, std::sync::atomic::Ordering::Relaxed); + true +} + +/// Inputs to [`apply_pool_config`]. 
All sizes are absolute bytes; fractions +/// are resolved by the caller against *physical RAM* (see +/// `mz_ore::memory::physical_memory_bytes`), never against an announced +/// limit that may include swap. +#[derive(Clone, Copy, Debug)] +pub struct PoolPagerConfig { + /// Whether compute's own batchers page through the pool. + pub enabled: bool, + /// Resident-bytes budget for uncompressed slots. + pub budget_bytes: usize, + /// Spill threads for off-worker eviction I/O (spawn-once). + pub spill_threads: usize, + /// Whether idle spill threads eagerly compress chunks to + /// `BackedResident` ahead of pressure. + pub eager_backing: bool, + /// Ceiling on the pool's total RSS; the compressed-resident tier is the + /// headroom above the budget and warm cap. Zero collapses the tier. + pub rss_target_bytes: usize, +} + +/// The pager for compute's own batchers: [`shared_pager`] resolved against +/// the compute enable bit the last `apply_*_config` call stored. Cheap (one +/// `Arc` clone); called per chunk, so unlike [`shared_pager`] it does not +/// log its resolution. pub fn global_pager() -> ColumnPager { - GLOBAL_PAGER.read().expect("global pager poisoned").clone() + resolve_shared(COMPUTE_ENABLED.load(std::sync::atomic::Ordering::Relaxed)).0 } -/// A pager that, when `enabled`, draws from the shared [`tiered_policy`] budget -/// pool — the same pool `apply_tiered_config` sizes for the process-global -/// pager — and otherwise is a disabled (always-resident) pager. +/// A pager that, when `enabled`, draws from the process-wide shared spill +/// mechanism — the buffer pool when [`apply_pool_config`] installed it, else +/// the [`tiered_policy`] budget `apply_tiered_config` sizes — and otherwise is +/// a disabled (always-resident) pager. /// /// This lets a second consumer (e.g. 
the storage upsert source stash) opt into -/// the one shared budget independently of whether `apply_tiered_config` enabled +/// the one shared budget independently of whether the config apply enabled /// the process-global pager for its own (compute) batchers. There is still a -/// single budget pool and a single underlying `mz_ore::pager`; only the -/// enable decision is per-consumer. +/// single budget; only the enable decision is per-consumer. Which mechanism +/// is shared follows the most recent config apply ([`apply_pool_config`] vs +/// [`apply_tiered_config`]), so a consumer that captured a pager before a +/// mechanism flip keeps its old one until it next calls here. pub fn shared_pager(enabled: bool) -> ColumnPager { - if enabled { - #[allow(clippy::clone_on_ref_ptr)] - let dyn_policy: Arc = TIERED_POLICY.clone(); - ColumnPager::new(dyn_policy) - } else { - ColumnPager::disabled() + let (pager, resolved) = resolve_shared(enabled); + tracing::info!( + enabled, + pool_mode = POOL_MODE.load(std::sync::atomic::Ordering::Relaxed), + "shared column pager resolved: {resolved}", + ); + pager +} + +/// The one resolution path from the two configuration bits to a pager. +/// Returns the pager and a label naming the resolution for logs. +fn resolve_shared(enabled: bool) -> (ColumnPager, &'static str) { + if !enabled { + return (ColumnPager::disabled(), "disabled"); } + if POOL_MODE.load(std::sync::atomic::Ordering::Relaxed) { + if let Some(pool) = global_pool() { + return (ColumnPager::pooled(pool), "pool"); + } + } + #[allow(clippy::clone_on_ref_ptr)] + let dyn_policy: Arc = TIERED_POLICY.clone(); + (ColumnPager::new(dyn_policy), "tiered") } impl ColumnPager { @@ -389,6 +516,53 @@ impl ColumnPager { }; return PagedColumn::Resident(std::mem::take(col), ticket); } + PageDecision::Pool(pool) => { + debug_assert_eq!(len_bytes % 8, 0); + // Serialize straight into the pool slot: one page population, + // no staging buffers. 
The `Align` variant is already the + // serialized form and copies in directly; other variants + // write their `ContainerBytes` encoding through a cursor over + // the slot memory. Sizing is exact, so a short or overlong + // write is a `ContainerBytes` contract violation and panics + // via the cursor's bounds. + let handle = match std::mem::take(col) { + Column::Align(v) => { + pool.insert_with(v.len(), |dst| dst.copy_from_slice(v.as_slice())) + } + mut other => { + let handle = pool.insert_with(len_bytes / 8, |dst| { + let bytes: &mut [u8] = bytemuck::cast_slice_mut(dst); + let mut cursor = std::io::Cursor::new(bytes); + other.into_bytes(&mut cursor); + assert_eq!( + usize::try_from(cursor.position()).expect("usize position"), + len_bytes, + "serialized body must fill the chunk exactly", + ); + }); + // `into_bytes` only borrowed `other`; clear it in + // place and hand it back so the caller keeps the + // `Typed` allocation for its next refill. + other.clear(); + *col = other; + handle + } + }; + // The pool compresses internally at eviction time, so the + // policy-visible size is the uncompressed body on both sides. + // The pool's extent store is swap-backed. + metrics::observe_pageout(len_bytes, len_bytes); + self.policy.record(PageEvent::PagedOut { + bytes_in: len_bytes, + bytes_out: len_bytes, + backend: Backend::Swap, + codec: None, + }); + return PagedColumn::Pooled { + handle, + meta: Meta { len_bytes }, + }; + } PageDecision::Page { backend, codec } => (backend, codec), }; let meta = Meta { len_bytes }; @@ -399,23 +573,7 @@ impl ColumnPager { // pager. `Column::Align` already is; other variants are // serialized and copied. debug_assert_eq!(len_bytes % 8, 0); - let body: Vec = match std::mem::take(col) { - // Move the aligned buffer straight into the pager: the - // allocation transfers with no copy. `take` already left - // `col` as a refill-ready `Typed` default. 
- Column::Align(v) => v, - mut other => { - let mut buf = Vec::with_capacity(len_bytes); - other.into_bytes(&mut buf); - debug_assert_eq!(buf.len() % 8, 0); - // `into_bytes` only borrowed `other`; clear it in place - // and hand it back so the caller keeps the `Typed` - // allocation instead of us dropping a reusable buffer. - other.clear(); - *col = other; - bytemuck::allocation::pod_collect_to_vec::(&buf) - } - }; + let body = drain_to_aligned(col, len_bytes); let handle = pager::pageout_with(backend, &mut [body]); let bytes_out = handle.len_bytes(); metrics::observe_pageout(len_bytes, bytes_out); @@ -523,6 +681,38 @@ impl ColumnPager { // produces the refcounted `Bytes` that `ContainerBytes` expects. Column::from_bytes(BytesMut::from(decoded).freeze()) } + PagedColumn::Pooled { handle, meta } => { + let mut body: Vec = Vec::with_capacity(handle.len()); + handle.take(&mut body); + debug_assert_eq!(body.len() * 8, meta.len_bytes); + metrics::observe_pagein(meta.len_bytes); + self.policy.record(PageEvent::PagedIn { + bytes: meta.len_bytes, + }); + Column::Align(body) + } + } + } +} + +/// Drains `col` into the u64-aligned raw body shared by the uncompressed +/// pageout paths: a [`Column::Align`] moves its buffer out with no copy +/// (leaving `col` a refill-ready `Typed` default), while other variants +/// serialize via [`ContainerBytes::into_bytes`] and widen the bytes, handing +/// the cleared `Typed` allocation back to `col` for reuse. +fn drain_to_aligned(col: &mut Column, len_bytes: usize) -> Vec { + match std::mem::take(col) { + Column::Align(v) => v, + mut other => { + let mut buf = Vec::with_capacity(len_bytes); + other.into_bytes(&mut buf); + debug_assert_eq!(buf.len() % 8, 0); + // `into_bytes` only borrowed `other`; clear it in place and hand + // it back so the caller keeps the `Typed` allocation instead of + // us dropping a reusable buffer. 
+ other.clear(); + *col = other; + bytemuck::allocation::pod_collect_to_vec::(&buf) } } } @@ -593,7 +783,7 @@ mod tests { impl PagingPolicy for TestPolicy { fn decide(&self, _hint: PageHint) -> PageDecision { - self.decision + self.decision.clone() } fn record(&self, event: PageEvent) { match event { @@ -672,6 +862,16 @@ mod tests { assert_eq!(collect_i64(&rt), (0i64..1024).collect::>()); } + /// Serializes tests that mutate the process-global pager configuration + /// (`POOL_MODE` / `COMPUTE_ENABLED` / `SWAP_PAGEOUT` / the singletons); + /// concurrent mutation makes their assertions race. Poison is recovered: + /// a prior test's panic doesn't invalidate the globals contract here. + fn global_config_lock() -> std::sync::MutexGuard<'static, ()> { + static LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + LOCK.lock() + .unwrap_or_else(std::sync::PoisonError::into_inner) + } + /// With the swap-pageout flag on, the lz4 + swap path issues `MADV_PAGEOUT` /// over the compressed bytes; the round-trip must still reproduce the input /// (the advice is a non-destructive reclaim hint). Drives the global pager @@ -681,6 +881,7 @@ mod tests { #[mz_ore::test] #[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function `madvise` on OS `linux` fn round_trip_swap_lz4_pageout() { + let _guard = global_config_lock(); apply_tiered_config(true, 0, Backend::Swap, Some(Codec::Lz4), true); let cp = global_pager(); let mut col = sample_typed(); @@ -735,6 +936,57 @@ mod tests { assert_eq!(collect_i64(&rt), (0i64..1024).collect::>()); } + /// Builds a small pool with a modest virtual reservation per size class. 
+ fn test_pool(budget_bytes: usize) -> mz_ore::pool::Pool { + mz_ore::pool::Pool::new(mz_ore::pool::PoolConfig { + budget_bytes, + class_capacity_bytes: 64 << 20, + }) + .expect("pool creation") + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn round_trip_pooled() { + let pool = test_pool(256 << 20); + let cp = ColumnPager::pooled(pool.clone()); + let mut col = sample_typed(); + let paged = cp.page(&mut col); + let PagedColumn::Pooled { handle, meta } = &paged else { + panic!("expected Pooled"); + }; + assert_eq!(handle.len_bytes(), meta.len_bytes); + // Push the chunk out to its extent and poison the freed slots, so a + // `take` that read stale slot memory (the macOS `MADV_DONTNEED` + // hazard via free-list reuse) would fail the content check below. + pool.evict(handle); + pool.poison_free_slots(); + let rt = cp.take(paged); + assert_eq!(collect_i64(&rt), (0i64..1024).collect::>()); + let stats = pool.stats(); + assert_eq!(stats.inserts, 1); + assert_eq!(stats.faults, 1); + assert_eq!(stats.frees, 1); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn pooled_align_fast_path() { + let pool = test_pool(256 << 20); + let cp = ColumnPager::pooled(pool); + let body: Vec = (1u64..=512).collect(); + let mut col: Column = Column::Align(body.clone()); + let paged = cp.page(&mut col); + assert!(matches!(paged, PagedColumn::Pooled { .. })); + // After paging an Align variant, `col` is reset to the typed default. + assert!(matches!(col, Column::Typed(_))); + let rt = cp.take(paged); + match rt { + Column::Align(v) => assert_eq!(v, body), + other => panic!("expected Align, got {:?}", std::mem::discriminant(&other)), + } + } + #[mz_ore::test] fn align_variant_fast_path() { // Construct an Align column directly to exercise the move-only raw path. 
@@ -756,4 +1008,52 @@ mod tests { other => panic!("expected Align, got {:?}", std::mem::discriminant(&other)), } } + + /// Exercises the process-global mechanism switch end to end. Runs as one + /// test because it mutates the process-wide `POOL_MODE` / global pager; + /// serialized against the other global-mutating tests via + /// [`global_config_lock`]. + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // unsupported operation: foreign function calls (mmap, madvise) + fn pool_mode_routing() { + let _guard = global_config_lock(); + // Pool mode on: the global pager and shared pager both go pooled. + let ok = apply_pool_config(PoolPagerConfig { + enabled: true, + budget_bytes: 1 << 30, + spill_threads: 0, + eager_backing: false, + rss_target_bytes: 0, + }); + assert!(ok, "pool reservation expected to succeed in tests"); + let mut col = sample_typed(); + let paged = global_pager().page(&mut col); + assert!(matches!(paged, PagedColumn::Pooled { .. })); + let rt = global_pager().take(paged); + assert_eq!(collect_i64(&rt), (0i64..1024).collect::>()); + + let mut col = sample_typed(); + let paged = shared_pager(true).page(&mut col); + assert!(matches!(paged, PagedColumn::Pooled { .. })); + drop(shared_pager(true).take(paged)); + + // Disabled consumers stay resident regardless of mechanism. + let mut col = sample_typed(); + let paged = shared_pager(false).page(&mut col); + assert!(matches!(paged, PagedColumn::Resident(_, _))); + drop(paged); + + // Tiered config flips the mechanism back: shared pager no longer pools. + apply_tiered_config(true, usize::MAX, Backend::Swap, None, false); + let mut col = sample_typed(); + let paged = shared_pager(true).page(&mut col); + assert!( + !matches!(paged, PagedColumn::Pooled { .. }), + "tiered mode must not hand out pooled columns", + ); + drop(paged); + + // Leave the globals in the disabled state for any future test runs. 
+ apply_tiered_config(false, 0, Backend::Swap, None, false); + } } diff --git a/src/timely-util/src/column_pager/metrics.rs b/src/timely-util/src/column_pager/metrics.rs index 648b524e74cf7..d4c3058df9704 100644 --- a/src/timely-util/src/column_pager/metrics.rs +++ b/src/timely-util/src/column_pager/metrics.rs @@ -59,7 +59,8 @@ static METRICS: OnceLock = OnceLock::new(); /// after the first one are no-ops. Computed gauges read the singleton /// [`TieredPolicy`] atomics at scrape time; their values reflect the live /// policy whether or not the column-paged batcher is currently enabled. -pub fn register(registry: &MetricsRegistry, policy: &'static TieredPolicy) { +pub fn register(registry: &MetricsRegistry) { + let policy: &'static TieredPolicy = crate::column_pager::tiered_policy(); let _ = METRICS.get_or_init(|| { // Computed gauges: closures hold the &'static policy reference. let _budget_remaining: ComputedUIntGauge = registry.register_computed_gauge( @@ -79,6 +80,32 @@ pub fn register(registry: &MetricsRegistry, policy: &'static TieredPolicy) { move || u64::try_from(policy.configured_total()).unwrap_or(u64::MAX), ); + // Buffer-pool gauges peek at the process-wide pool's stats at scrape + // time, reporting zero until something else initializes the pool — + // a scrape must observe, not mmap an 8 TiB reservation into every + // process that happens to be monitored. The cumulative fields are + // exposed as computed gauges rather than counters because the pool + // owns the atomics; all are monotonic except resident/oversize bytes. 
+ register_pool_gauge(registry, "resident_bytes", "Uncompressed bytes resident in the buffer pool.", |s| s.resident_bytes); + register_pool_gauge(registry, "oversize_bytes", "Bytes held by oversize chunks that bypass pool paging.", |s| s.oversize_bytes); + register_pool_gauge(registry, "inserts_total", "Chunks inserted into the buffer pool.", |s| s.inserts); + register_pool_gauge(registry, "frees_total", "Chunks freed from the buffer pool.", |s| s.frees); + register_pool_gauge(registry, "writes_elided_total", "Backing writes elided: chunks freed while unbacked, dead before any compression or extent write happened.", |s| s.writes_elided); + register_pool_gauge(registry, "evictions_compress_total", "Evictions that compressed a chunk into a new swap-backed extent.", |s| s.evictions_compress); + register_pool_gauge(registry, "evictions_cheap_total", "Evictions of already-backed chunks: physical pages released with no compression or extent write.", |s| s.evictions_cheap); + register_pool_gauge(registry, "faults_total", "Fault-ins decompressing a chunk from its extent back into its pool slot.", |s| s.faults); + register_pool_gauge(registry, "extent_bytes_written_total", "Compressed bytes written into swap-backed extents.", |s| s.extent_bytes_written); + register_pool_gauge(registry, "spill_scheduled_total", "Evictions handed to buffer-pool spill threads.", |s| s.spill_scheduled); + register_pool_gauge(registry, "spill_cancelled_total", "Scheduled evictions cancelled before completing (chunk freed or pinned).", |s| s.spill_cancelled); + register_pool_gauge(registry, "spill_in_flight", "Spill entries queued or being processed.", |s| s.spill_in_flight); + register_pool_gauge(registry, "slot_exhausted_fallbacks_total", "Inserts that fell back to unpageable heap chunks because their size class had no free slot.", |s| s.slot_exhausted_fallbacks); + register_pool_gauge(registry, "live_chunks", "Live pool chunks, whatever their residency: for backlog-shaped consumers, the 
un-drained backlog in chunks.", |s| s.live_chunks); + register_pool_gauge(registry, "warm_bytes", "Class bytes of free slots kept warm for fault-free reuse; RSS exceeds resident bytes by up to this bounded amount.", |s| s.warm_bytes); + register_pool_gauge(registry, "warm_reuses_total", "Slot allocations served from the warm list: no page faults, no kernel page zeroing.", |s| s.warm_reuses); + register_pool_gauge(registry, "eager_backs_total", "Chunks eagerly compressed to compressed-but-resident by idle spill threads; their later eviction is a pure page release.", |s| s.eager_backs); + register_pool_gauge(registry, "extent_resident_bytes", "Allocation bytes of compressed extents currently resident (the compressed-but-resident tier), bounded by the pool RSS target.", |s| s.extent_resident_bytes); + register_pool_gauge(registry, "extent_pageouts_total", "Extents pushed to the swap device by RSS-target enforcement.", |s| s.extent_pageouts); + PagerMetrics { skip_decisions_total: registry.register(metric!( name: "mz_column_pager_skip_decisions_total", @@ -121,6 +148,29 @@ pub fn register(registry: &MetricsRegistry, policy: &'static TieredPolicy) { }); } +/// Registers one computed gauge over a [`mz_ore::pool::PoolStats`] field, +/// named `mz_column_pool_{suffix}`. Peeks at the process-wide pool at scrape +/// time; zero until something initializes the pool (or if its reservation +/// failed). 
+fn register_pool_gauge( + registry: &MetricsRegistry, + suffix: &str, + help: &str, + field: fn(&mz_ore::pool::PoolStats) -> u64, +) { + let _gauge: ComputedUIntGauge = registry.register_computed_gauge( + metric!( + name: format!("mz_column_pool_{suffix}"), + help: help, + ), + move || { + crate::column_pager::global_pool_peek() + .map(|pool| field(&pool.stats())) + .unwrap_or(0) + }, + ); +} + #[inline] fn metrics() -> Option<&'static PagerMetrics> { METRICS.get() diff --git a/src/timely-util/src/column_pager/policy.rs b/src/timely-util/src/column_pager/policy.rs index 502b05b99d3bb..dbf1e989fd914 100644 --- a/src/timely-util/src/column_pager/policy.rs +++ b/src/timely-util/src/column_pager/policy.rs @@ -15,15 +15,19 @@ //! Concrete [`PagingPolicy`] implementations. //! -//! Today: [`TieredPolicy`], a single process-wide byte budget for resident +//! [`TieredPolicy`] is a single process-wide byte budget for resident //! columns. Resident columns can move between Timely workers, so the //! accounting cannot be thread-local; budget is held in a single //! [`AtomicUsize`] and credited back from whichever thread happens to drop //! the column. +//! +//! [`PoolPolicy`] routes every column into an [`mz_ore::pool::Pool`], which +//! enforces its own resident-bytes budget by evicting cold chunks. use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use mz_ore::pager::Backend; +use mz_ore::pool::Pool; use crate::column_pager::{Codec, PageDecision, PageEvent, PageHint, PagingPolicy}; @@ -174,6 +178,39 @@ impl PagingPolicy for TieredPolicy { } } +/// Routes every non-empty column into a buffer [`Pool`]. +/// +/// The pool owns the budget: it evicts cold chunks into swap-backed extents +/// when its resident bytes exceed its configured bound, so the policy never +/// gates a decision on size. 
[`PagingPolicy::decide`] answers +/// [`PageDecision::Pool`] for every non-empty hint and keeps empty columns +/// resident; [`PagingPolicy::record`] is a no-op, since the column pager's +/// metrics observers already count page traffic and the pool tracks its own +/// residency in [`mz_ore::pool::PoolStats`]. +#[derive(Debug, Clone)] +pub struct PoolPolicy { + pool: Pool, +} + +impl PoolPolicy { + /// Constructs a policy backed by `pool`. + pub fn new(pool: Pool) -> Self { + Self { pool } + } +} + +impl PagingPolicy for PoolPolicy { + fn decide(&self, hint: PageHint) -> PageDecision { + if hint.len_bytes == 0 { + PageDecision::Skip + } else { + PageDecision::Pool(self.pool.clone()) + } + } + + fn record(&self, _event: PageEvent) {} +} + /// Atomically subtracts `want` from `atomic` if at least `want` is available. /// Returns `true` on success. fn try_consume(atomic: &AtomicUsize, want: usize) -> bool { @@ -182,17 +219,15 @@ fn try_consume(atomic: &AtomicUsize, want: usize) -> bool { if cur < want { return false; } - match atomic.compare_exchange_weak(cur, cur - want, Ordering::AcqRel, Ordering::Relaxed) { + // Relaxed: the budget is a pure counter; no memory is published or + // acquired through it. 
+ match atomic.compare_exchange_weak(cur, cur - want, Ordering::Relaxed, Ordering::Relaxed) { Ok(_) => return true, Err(actual) => cur = actual, } } } -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/src/timely-util/src/columnar.rs b/src/timely-util/src/columnar.rs index cbde6d24a3918..cab196409634b 100644 --- a/src/timely-util/src/columnar.rs +++ b/src/timely-util/src/columnar.rs @@ -22,6 +22,7 @@ pub mod builder; pub mod builder_input; pub mod consolidate; pub mod merge_batcher; +pub mod paged_run; use std::hash::Hash; diff --git a/src/timely-util/src/columnar/merge_batcher.rs b/src/timely-util/src/columnar/merge_batcher.rs index 042cd3aa8a16f..8c2a4681af61a 100644 --- a/src/timely-util/src/columnar/merge_batcher.rs +++ b/src/timely-util/src/columnar/merge_batcher.rs @@ -69,6 +69,27 @@ const STASH_CAP: usize = 2; /// ones. const MAX_RECYCLE_BYTES: usize = 1 << 22; +/// Chains shorter than this (in chunks) keep their entries resident instead +/// of routing them through the pager. +/// +/// The rebalancing cascade in [`ColumnMergeBatcher::insert_chain`] merges +/// short chains almost immediately after they form, so paging their chunks +/// schedules work the next merge cancels: the chunk pays an insert into the +/// pool, an eviction nomination, and (under pressure) a spill-queue slot, +/// then dies before compression starts. Measured under hydration load, this +/// churn ran the spill queue to its cap with cancellation rates of hundreds +/// per second. Chunks enter the pager only once they land in a chain long +/// enough to sit out a few rebalance rounds. 
+/// +/// The resident overhead is bounded by the chain-stack shape: rebalancing +/// keeps the youngest chain under half its predecessor, so chains below +/// this threshold hold fewer than `MIN_PAGED_CHAIN_LEN` chunks between +/// them — at the ~2-4 MiB chunk band, single-digit MiB per batcher, paid +/// per worker per consumer. The value balances cancellation coverage +/// (chains of n chunks live roughly n push intervals before the cascade +/// consumes them) against that unevictable floor. +const MIN_PAGED_CHAIN_LEN: usize = 4; + /// Recycle `chunk` only if the stash isn't already at [`STASH_CAP`] and the /// chunk isn't oversize per [`MAX_RECYCLE_BYTES`]. `length_in_bytes` is /// measured before clear, so it reflects the data the chunk was carrying @@ -207,7 +228,8 @@ where /// `BatcherEvent` feeds the `mz_arrangement_batcher_*_raw` introspection /// tables, which downstream surface as memory-resource dashboards. Bytes /// living on swap or in a pager file aren't part of RSS and shouldn't be -/// reported there. +/// reported there. Pooled chunks likewise contribute zero: the buffer pool +/// budgets and accounts its own resident bytes. fn account_chunk(entry: &PagedColumn) -> (usize, usize, usize, usize) { match entry { PagedColumn::Resident(col, _) => { @@ -215,7 +237,9 @@ fn account_chunk(entry: &PagedColumn) -> (usize, usize, usize, u let bytes = col.length_in_bytes(); (records, bytes, bytes, 1) } - PagedColumn::Paged { .. } | PagedColumn::Compressed { .. } => (0, 0, 0, 0), + PagedColumn::Paged { .. } | PagedColumn::Compressed { .. } | PagedColumn::Pooled { .. 
} => { + (0, 0, 0, 0) + } } } @@ -250,6 +274,35 @@ where &mut self, upper: Antichain, ) -> (Vec, Description) { + let (chunks, description) = self.seal_paged(upper); + (chunks.collect(), description) + } + + fn frontier(&mut self) -> AntichainRef<'_, Self::Time> { + self.frontier.borrow() + } +} + +impl ColumnMergeBatcher +where + D: Columnar, + for<'a> columnar::Ref<'a, D>: Copy + Ord, + T: Columnar + Default + Timestamp + PartialOrder, + for<'a> columnar::Ref<'a, T>: Copy + Ord, + R: Columnar + Default + Semigroup + for<'a> Semigroup>, + for<'a> columnar::Ref<'a, R>: Ord, +{ + /// Seals like [`Batcher::seal`], but returns the ship side as + /// [`SealedChunks`]: chunk handles that stay paged until iterated, + /// rehydrating one chunk per `next` call. + /// + /// [`Batcher::seal`] materializes every shipped chunk into one `Vec`, so a + /// seal's transient memory is the full uncompressed ship side — unbounded + /// when a frontier advance releases a large backlog at once. A consumer + /// that processes chunks sequentially and drops each before the next keeps + /// that footprint to a single chunk by sealing through this method + /// instead. + pub fn seal_paged(&mut self, upper: Antichain) -> (SealedChunks, Description) { let pager = self.pager(); // Merge all remaining chains into one. while self.chains.len() > 1 { @@ -260,10 +313,9 @@ where } let merged = self.chain_pop().unwrap_or_default(); - // Extract `merged` into `readied` (ship side, materialized for the - // builder) and `kept_chain` (keep side, stays paged for the next - // round). - let mut readied: Vec> = Vec::new(); + // Extract `merged` into `readied` (ship side, still paged) and + // `kept_chain` (keep side, stays paged for the next round). 
+ let mut readied: Vec> = Vec::new(); let mut kept_chain: VecDeque> = VecDeque::new(); self.frontier.clear(); { @@ -274,7 +326,7 @@ where FetchIter::new(merged, pager), upper.borrow(), frontier, - |paged| readied.push(pager.take(paged)), + |paged| readied.push(paged), |paged| kept_chain.push_back(paged), stash, ); @@ -299,14 +351,39 @@ where // may be a quiet stretch. self.stash.clear(); - (readied, description) + let chunks = SealedChunks { + chunks: readied.into_iter(), + pager, + }; + (chunks, description) } +} - fn frontier(&mut self) -> AntichainRef<'_, Self::Time> { - self.frontier.borrow() +/// The ship side of a [`ColumnMergeBatcher::seal_paged`] call: sorted, +/// consolidated chunks that stay paged until iterated. +/// +/// Each `next` call rehydrates exactly one chunk, so a consumer that drops +/// each chunk before requesting the next holds at most one chunk resident, +/// however large the sealed backlog. +pub struct SealedChunks { + chunks: std::vec::IntoIter>, + pager: ColumnPager, +} + +impl Iterator for SealedChunks { + type Item = Column<(D, T, R)>; + + fn next(&mut self) -> Option { + self.chunks.next().map(|paged| self.pager.take(paged)) + } + + fn size_hint(&self) -> (usize, Option) { + self.chunks.size_hint() } } +impl ExactSizeIterator for SealedChunks {} + impl PushInto> for ColumnMergeBatcher where D: Columnar, @@ -315,11 +392,14 @@ where for<'a> columnar::Ref<'a, T>: Copy + Ord, R: Columnar + Default + Semigroup + for<'a> Semigroup>, { - /// Accept an already-consolidated chunk from the upstream chunker, route - /// it through the pager, and insert it as a singleton chain. + /// Accept an already-consolidated chunk from the upstream chunker and + /// insert it as a singleton chain. 
The chunk stays resident — a + /// singleton is the shortest possible chain (see + /// [`MIN_PAGED_CHAIN_LEN`]), and the rebalance that follows consumes it + /// almost immediately; it reaches the pager once it lands in a chain + /// long enough to survive a few rounds. fn push_into(&mut self, mut chunk: Column<(D, T, R)>) { - let pager = self.pager(); - let paged = pager.page(&mut chunk); + let paged = ColumnPager::disabled().page(&mut chunk); self.insert_chain(VecDeque::from([paged])); } } @@ -359,7 +439,16 @@ where b: VecDeque>, ) -> VecDeque> { let mut output: VecDeque> = VecDeque::new(); - let pager = self.pager(); + // Short result chains keep their outputs resident: the cascade + // consumes them almost immediately, and paging chunks that die + // younger than the spill path's latency only schedules cancelled + // work (see MIN_PAGED_CHAIN_LEN). `take` is variant-driven, so the + // disabled pager rehydrates paged inputs just as well. + let pager = if a.len() + b.len() < MIN_PAGED_CHAIN_LEN { + ColumnPager::disabled() + } else { + self.pager() + }; let pager = &pager; let stash = &mut self.stash; merge_chains( @@ -963,6 +1052,89 @@ mod tests { assert_eq!(out_sorted, expected); } + #[mz_ore::test] + fn short_chains_stay_resident() { + let policy = ForcePagePolicy::new(); + let mut b: ColumnMergeBatcher<(u64, u64), u64, i64> = + differential_dataflow::trace::Batcher::new(None, 0); + b.set_pager(ColumnPager::new(policy.clone())); + + // Singleton pushes form chains far below MIN_PAGED_CHAIN_LEN; + // despite the force-page pager, every entry stays resident because + // short chains never route through it. 
+ for i in 0..3u64 { + b.push_into(col(&[((i, 0), 0, 1)])); + } + for entry in b.chains.iter().flatten() { + assert!( + matches!(entry, PagedColumn::Resident(..)), + "short chains stay resident", + ); + } + + let chain = |range: std::ops::Range| -> VecDeque> { + range + .map(|i| { + let mut c = col(&[((i, 0), 0, 1)]); + ColumnPager::disabled().page(&mut c) + }) + .collect() + }; + + // A merge whose combined input is below the threshold keeps its + // outputs resident; one at the threshold routes them through the + // (force-page) pager. + let k: u64 = mz_ore::cast::CastFrom::cast_from(MIN_PAGED_CHAIN_LEN); + let out = b.merge_by(chain(0..1), chain(1..k - 1)); + assert!( + out.iter().all(|e| matches!(e, PagedColumn::Resident(..))), + "below-threshold merges keep outputs resident", + ); + let out = b.merge_by(chain(0..k / 2), chain(k / 2..k)); + assert!( + out.iter().any(|e| !matches!(e, PagedColumn::Resident(..))), + "at-threshold merges page their outputs", + ); + } + + #[mz_ore::test] + fn seal_paged_ships_lazily() { + let policy = ForcePagePolicy::new(); + let pager = ColumnPager::new(policy.clone()); + + let mut b: ColumnMergeBatcher<(u64, u64), u64, i64> = + differential_dataflow::trace::Batcher::new(None, 0); + b.set_pager(pager); + + let n: u64 = 200; + for i in 0..n { + b.push_into(col(&[((i, 0), i % 10, 1)])); + } + + let upper = Antichain::from_elem(5u64); + let (chunks, _description) = b.seal_paged(upper); + + // The ship side must come out paged; rehydration happens per `next`. 
+ for paged in chunks.chunks.as_slice() { + assert!( + !matches!(paged, PagedColumn::Resident(..)), + "ship side must stay paged until iterated", + ); + } + + let mut out: Vec = Vec::new(); + for chunk in chunks { + out.extend(collect_column(&chunk)); + } + out.sort(); + let mut expected: Vec = (0..n) + .filter(|i| i % 10 < 5) + .map(|i| ((i, 0), i % 10, 1)) + .collect(); + expected.sort(); + assert_eq!(out, expected); + } + #[mz_ore::test] fn account_chunk_resident_vs_paged() { let policy = ForcePagePolicy::new(); @@ -1021,6 +1193,10 @@ mod tests { let _ = meta; 1 } + PagedColumn::Pooled { meta, .. } => { + let _ = meta; + 1 + } PagedColumn::Resident(_, _) => { panic!("kept chain entry was Resident under ForcePagePolicy"); } @@ -1031,4 +1207,43 @@ mod tests { assert!(policy.out.load(std::sync::atomic::Ordering::Relaxed) > 0); let _ = n; } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn batcher_seal_round_trip_pooled() { + // Zero-budget pool: every inserted chunk is evicted to its extent as + // soon as it lands, so the merge / seal path must fault everything + // back in from extents rather than reading pool slots. 
+ let pool = mz_ore::pool::Pool::new(mz_ore::pool::PoolConfig { + budget_bytes: 0, + class_capacity_bytes: 64 << 20, + }) + .expect("pool creation"); + + let mut b: ColumnMergeBatcher<(u64, u64), u64, i64> = + differential_dataflow::trace::Batcher::new(None, 0); + b.set_pager(ColumnPager::pooled(pool.clone())); + + let n: u64 = 200; + for i in 0..n { + b.push_into(col(&[((i, 0), 0, 1)])); + } + let upper = Antichain::from_elem(u64::MAX); + let (chain, _description) = differential_dataflow::trace::Batcher::seal(&mut b, upper); + let mut out: Vec = chain.iter().flat_map(collect_column).collect(); + out.sort(); + let expected: Vec = (0..n).map(|i| ((i, 0u64), 0u64, 1i64)).collect(); + assert_eq!(out, expected); + + // The data really round-tripped through extents: the zero budget + // forced compressing evictions, and reading the chains back faulted + // chunks in from those extents. + let stats = pool.stats(); + assert!(stats.inserts > 0, "expected pool inserts: {stats:?}"); + assert!( + stats.evictions_compress > 0, + "expected compressing evictions: {stats:?}" + ); + assert!(stats.faults > 0, "expected extent fault-ins: {stats:?}"); + } } diff --git a/src/timely-util/src/columnar/paged_run.rs b/src/timely-util/src/columnar/paged_run.rs new file mode 100644 index 0000000000000..5d3cb3fd07f8f --- /dev/null +++ b/src/timely-util/src/columnar/paged_run.rs @@ -0,0 +1,677 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//! Paged sealed runs: Layer 3 of the buffer-managed state design +//! (`doc/developer/design/20260610_buffer_managed_state.md`) prototyped over +//! [`Column`] and the [`mz_ore::pool`] buffer pool. +//! +//! A [`PagedRun`] stores a sealed sorted run of `(D, T, R)` updates as +//! pool-resident column pages plus a small always-resident header: per-page +//! fence keys (the first and last `D` of each page, owned) and per-page update +//! counts. Pages are sealed into the pool and evicted eagerly, +//! hydration-style, so a freshly built run holds no resident data beyond its +//! header. +//! +//! The read paths follow the design's access-pattern story: +//! +//! * [`PagedRun::seek`] binary-searches the resident fence keys with zero I/O +//! and faults exactly the pages whose key range can contain the key — one +//! page in the common case, more only when the key straddles page +//! boundaries. +//! * [`PagedRun::iter`] scans the run pinning one page at a time, prefetching +//! the next page before reading the current one, and re-evicting each page +//! after consuming it so the resident window stays bounded. +//! * [`PagedRun::merge`] streams two runs into a new one, holding at most one +//! pinned input page per side plus one output accumulation column. Consumed +//! input pages are re-evicted cheaply (they are `BackedResident`, so +//! eviction is a pure page release). +//! +//! Reads from pinned pool memory are zero-copy: the borrowed columnar view is +//! reconstructed from the pinned `&[u64]` exactly as [`Column::borrow`] does +//! for its serialized variants. This is sound because a pinned chunk is never +//! evicted or relocated for the life of the borrow; views are re-derived from +//! a fresh pin on every access and never cached across pins. 
+ +use std::marker::PhantomData; + +use columnar::bytes::indexed; +use columnar::{Borrow, BorrowedOf, Columnar, ContainerOf, FromBytes, Index, Len, Push}; +use differential_dataflow::difference::Semigroup; +use mz_ore::pool::{ChunkHandle, PinGuard, Pool, Residency}; +use timely::dataflow::channels::ContainerBytes; + +use crate::columnar::{Column, at_serialized_capacity}; + +/// Reconstructs the borrowed columnar view from serialized words, the same +/// zero-copy decode [`Column::borrow`] performs on its `Align` variant. The +/// words here come from pinned pool memory instead of an owned `Vec`. +fn borrow_words(words: &[u64]) -> BorrowedOf<'_, C> { + >::from_bytes(&mut indexed::decode(words)) +} + +/// Serializes a column into `u64` words for pool insertion. `Column::Align` +/// already is the wanted representation and moves with no copy; other +/// variants serialize through [`ContainerBytes::into_bytes`] and widen. +fn into_words(column: Column) -> Vec { + match column { + Column::Align(words) => words, + other => { + let len_bytes = other.length_in_bytes(); + let mut bytes = Vec::with_capacity(len_bytes); + other.into_bytes(&mut bytes); + debug_assert_eq!(bytes.len() % 8, 0); + bytemuck::allocation::pod_collect_to_vec(&bytes) + } + } +} + +/// A sealed sorted run of `(D, T, R)` updates, stored as eagerly evicted pool +/// pages plus an always-resident header of fence keys and update counts. +/// +/// Updates are sorted by `(D, T)` across the whole run. A single key's +/// updates may straddle consecutive pages, which is why the header keeps both +/// the first and the last key of each page. +pub struct PagedRun { + /// The pool holding this run's pages. + pool: Pool, + /// One serialized `Column<(D, T, R)>` per page. + chunks: Vec, + /// First key of each page, owned and always resident. + first_keys: Vec, + /// Last key of each page, owned and always resident. + last_keys: Vec, + /// Number of updates in each page. 
+ update_counts: Vec, + /// The pages hold `(D, T, R)` updates; only `D` appears in the resident + /// header fields. + _marker: PhantomData<(T, R)>, +} + +/// Accumulates sealed pages and their header entries while a run is built. +struct RunBuilder<'a, D: Columnar, T: Columnar, R: Columnar> { + pool: &'a Pool, + chunks: Vec, + first_keys: Vec, + last_keys: Vec, + update_counts: Vec, + /// Output accumulation container for update-at-a-time building (merge). + current: ContainerOf<(D, T, R)>, + current_len: usize, +} + +impl<'a, D: Columnar, T: Columnar, R: Columnar> RunBuilder<'a, D, T, R> { + fn new(pool: &'a Pool) -> Self { + RunBuilder { + pool, + chunks: Vec::new(), + first_keys: Vec::new(), + last_keys: Vec::new(), + update_counts: Vec::new(), + current: Default::default(), + current_len: 0, + } + } + + /// Pushes one update into the accumulation container, sealing it as a + /// page once its serialized size reaches the ship target. + fn push(&mut self, update: &(D, T, R)) { + self.current.push(update); + self.current_len += 1; + if at_serialized_capacity(&self.current.borrow()) { + self.flush(); + } + } + + /// Seals the accumulation container as a page, if it holds any updates. + fn flush(&mut self) { + if self.current_len == 0 { + return; + } + let len = self.current_len; + self.current_len = 0; + let column = Column::Typed(std::mem::take(&mut self.current)); + self.seal(column, len); + } + + /// Seals one non-empty column as a page: captures its fence keys as + /// owned values, serializes it into the pool, and evicts it eagerly. 
+ fn seal(&mut self, column: Column<(D, T, R)>, len: usize) { + debug_assert!(len > 0, "sealed pages are non-empty"); + let (first, last) = { + let borrow = column.borrow(); + let (first_d, _, _) = borrow.get(0); + let (last_d, _, _) = borrow.get(borrow.len() - 1); + (D::into_owned(first_d), D::into_owned(last_d)) + }; + let mut words = into_words(column); + let handle = self.pool.insert(&mut words); + self.pool.evict(&handle); + self.chunks.push(handle); + self.first_keys.push(first); + self.last_keys.push(last); + self.update_counts.push(len); + } + + fn finish(mut self) -> PagedRun { + self.flush(); + PagedRun { + pool: self.pool.clone(), + chunks: self.chunks, + first_keys: self.first_keys, + last_keys: self.last_keys, + update_counts: self.update_counts, + _marker: PhantomData, + } + } +} + +impl PagedRun { + /// Builds a run from columns that are globally sorted by `(D, T)` with + /// arbitrary breakpoints (the merge-batcher chain output contract). Each + /// non-empty column becomes one page, sealed into the pool and evicted + /// eagerly; empty columns are skipped. + /// + /// A column whose serialized size exceeds the pool's largest size class + /// becomes a [`Residency::Oversize`] page: always resident, exempt from + /// eviction and budget enforcement, and counted against the budget. The + /// prototype accepts this degradation rather than splitting pages; + /// `PoolStats::oversize_bytes` makes it observable. + pub fn build(pool: &Pool, columns: impl IntoIterator>) -> Self { + let mut builder = RunBuilder::new(pool); + for column in columns { + let len = column.borrow().len(); + if len == 0 { + continue; + } + builder.seal(column, len); + } + builder.finish() + } + + /// The number of pages in the run. + pub fn chunk_count(&self) -> usize { + self.chunks.len() + } + + /// The total number of updates in the run. + pub fn len(&self) -> usize { + self.update_counts.iter().sum() + } + + /// Returns `true` if the run holds no updates. 
+ pub fn is_empty(&self) -> bool { + self.chunks.is_empty() + } + + /// The residency state of each page, in page order. + pub fn residencies(&self) -> Vec { + self.chunks.iter().map(|c| c.residency()).collect() + } + + /// Evicts every page of the run. Pages faulted in by reads are cheap to + /// re-evict (`BackedResident`); the call performs no compression I/O for + /// them. + pub fn evict_all(&self) { + for chunk in &self.chunks { + self.pool.evict(chunk); + } + } + + /// Returns the `(T, R)` pairs recorded for `key`, in update order. + /// + /// Binary-searches the resident fence keys with zero I/O for the page + /// range that can contain `key`. The range covers more than one page only + /// when the key's updates straddle page boundaries + /// (`last_keys[i] == key == first_keys[i + 1]`); a key falling in the gap + /// between two pages yields an empty range and faults nothing. Each + /// visited page is pinned (faulting it from its extent if evicted) and + /// binary-searched within; faulted pages stay `BackedResident` afterwards, + /// leaving re-eviction to the pool's budget enforcement. + pub fn seek(&self, key: &D) -> Vec<(T, R)> + where + D: Ord, + { + let lo = self.last_keys.partition_point(|k| k < key); + let hi = self.first_keys.partition_point(|k| k <= key); + let mut out = Vec::new(); + for chunk in &self.chunks[lo..hi] { + let pin = chunk.pin(); + let view = borrow_words::<(D, T, R)>(&pin); + let len = view.len(); + let start = partition_point(len, |i| { + let (d, _, _) = view.get(i); + D::into_owned(d) < *key + }); + for index in start..len { + let (d, t, r) = view.get(index); + if D::into_owned(d) != *key { + break; + } + out.push((T::into_owned(t), R::into_owned(r))); + } + } + out + } + + /// Iterates over the whole run in `(D, T)` order, yielding owned updates. + /// + /// Pins one page at a time and prefetches the next page before reading + /// the current one. 
Consumed pages are re-evicted so a scan keeps a + /// bounded resident window regardless of run size. + pub fn iter(&self) -> Iter<'_, D, T, R> { + Iter { + cursor: ChunkCursor::new(self), + } + } + + /// Merges two runs into a new one, consolidating updates with equal + /// `(D, T)` by adding their `R`s and dropping updates whose sum is zero, + /// as [`differential_dataflow::consolidation::consolidate_updates`] does. + /// + /// The merge streams: at most one pinned input page per side plus one + /// output accumulation column are held at a time. Output pages are cut at + /// the crate's serialized ship target, sealed into `pool`, and evicted + /// eagerly; fully consumed input pages are re-evicted (cheaply, they are + /// `BackedResident`). The resident window is therefore bounded regardless + /// of the input run sizes. + pub fn merge(pool: &Pool, a: &Self, b: &Self) -> Self + where + D: Ord, + T: Ord, + R: Semigroup, + { + let mut builder = RunBuilder::new(pool); + let mut a_cur = ChunkCursor::new(a); + let mut b_cur = ChunkCursor::new(b); + let mut pending: Option<(D, T, R)> = None; + loop { + a_cur.fill(); + b_cur.fill(); + let take_a = match (&a_cur.head, &b_cur.head) { + (Some(x), Some(y)) => (&x.0, &x.1) <= (&y.0, &y.1), + (Some(_), None) => true, + (None, Some(_)) => false, + (None, None) => break, + }; + let head = if take_a { + &mut a_cur.head + } else { + &mut b_cur.head + }; + let update = head.take().expect("head filled"); + match &mut pending { + Some(p) if p.0 == update.0 && p.1 == update.1 => p.2.plus_equals(&update.2), + _ => { + if let Some(prev) = pending.take() { + if !prev.2.is_zero() { + builder.push(&prev); + } + } + pending = Some(update); + } + } + } + if let Some(prev) = pending.take() { + if !prev.2.is_zero() { + builder.push(&prev); + } + } + builder.finish() + } +} + +/// Index of the first element in `0..len` for which `pred` is false, assuming +/// `pred` is true for a prefix and false for the rest. 
`partition_point` over +/// indices rather than a slice, for searching borrowed columnar views. +fn partition_point(len: usize, mut pred: impl FnMut(usize) -> bool) -> usize { + let (mut lo, mut hi) = (0, len); + while lo < hi { + let mid = lo + (hi - lo) / 2; + if pred(mid) { + lo = mid + 1; + } else { + hi = mid; + } + } + lo +} + +/// Streams a run's updates page by page, holding at most one pin. +/// +/// Opening page `i` first prefetches page `i + 1`, the readahead of the +/// design's scan path. A fully consumed page is re-evicted, keeping the +/// resident window at one page. +/// +/// The borrowed columnar view cannot be stored next to the pin it borrows +/// from, so it is reconstructed per access; the decode is a handful of slice +/// splits over the pinned words. +struct ChunkCursor<'a, D: Columnar, T: Columnar, R: Columnar> { + run: &'a PagedRun, + chunk: usize, + pos: usize, + pin: Option>, + head: Option<(D, T, R)>, +} + +impl<'a, D: Columnar, T: Columnar, R: Columnar> ChunkCursor<'a, D, T, R> { + fn new(run: &'a PagedRun) -> Self { + ChunkCursor { + run, + chunk: 0, + pos: 0, + pin: None, + head: None, + } + } + + /// Ensures `head` holds the next update, if any remain. Advances across + /// page boundaries, re-evicting each consumed page. 
+ fn fill(&mut self) { + while self.head.is_none() && self.chunk < self.run.chunks.len() { + if self.pin.is_none() { + if let Some(next) = self.run.chunks.get(self.chunk + 1) { + next.prefetch(); + } + self.pin = Some(self.run.chunks[self.chunk].pin()); + self.pos = 0; + } + let pin = self.pin.as_ref().expect("pinned above"); + let view = borrow_words::<(D, T, R)>(pin); + if self.pos < view.len() { + let (d, t, r) = view.get(self.pos); + self.pos += 1; + self.head = Some((D::into_owned(d), T::into_owned(t), R::into_owned(r))); + } else { + self.pin = None; + self.run.pool.evict(&self.run.chunks[self.chunk]); + self.chunk += 1; + } + } + } +} + +/// Iterator over a [`PagedRun`]'s updates, returned by [`PagedRun::iter`]. +pub struct Iter<'a, D: Columnar, T: Columnar, R: Columnar> { + cursor: ChunkCursor<'a, D, T, R>, +} + +impl Iterator for Iter<'_, D, T, R> { + type Item = (D, T, R); + + fn next(&mut self) -> Option<(D, T, R)> { + self.cursor.fill(); + self.cursor.head.take() + } +} + +#[cfg(test)] +mod tests { + use mz_ore::pool::PoolConfig; + + use super::*; + + type Update = (u64, u64, i64); + + /// Pool with a small virtual reservation per class, suitable for tests. + fn test_pool(budget_bytes: usize) -> Pool { + Pool::new(PoolConfig { + budget_bytes, + class_capacity_bytes: 64 << 20, + }) + .expect("pool creation") + } + + fn column(updates: &[Update]) -> Column { + Column::Typed(Columnar::as_columns(updates.iter())) + } + + /// Three sorted pages with key 100 straddling the first page boundary and + /// a key gap between the second and third pages. 
+ fn straddle_input() -> Vec> { + let mut page0: Vec = (0..100).map(|d| (d, d, 1)).collect(); + page0.push((100, 0, 1)); + page0.push((100, 1, 2)); + let page1: Vec = [(100, 2, 3), (100, 3, 4)] + .into_iter() + .chain((101..200).map(|d| (d, d, 1))) + .collect(); + let page2: Vec = (250..300).map(|d| (d, d, 1)).collect(); + vec![page0, page1, page2] + } + + fn straddle_run(pool: &Pool) -> PagedRun { + PagedRun::build(pool, straddle_input().iter().map(|p| column(p))) + } + + /// Reference merge: sort by `(D, T)`, sum `R` for equal `(D, T)`, drop + /// zero sums. + fn reference_merge(mut updates: Vec) -> Vec { + updates.sort(); + let mut out: Vec = Vec::new(); + for (d, t, r) in updates { + match out.last_mut() { + Some(prev) if prev.0 == d && prev.1 == t => prev.2 += r, + _ => out.push((d, t, r)), + } + if out.last().map_or(false, |u| u.2 == 0) { + out.pop(); + } + } + out + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn build_then_seek() { + let pool = test_pool(256 << 20); + let run = straddle_run(&pool); + assert_eq!(run.chunk_count(), 3); + assert_eq!(run.len(), 100 + 2 + 2 + 99 + 50); + assert!(!run.is_empty()); + assert!( + run.residencies().iter().all(|r| *r == Residency::Evicted), + "all pages are evicted after build" + ); + + // First key of the run: one page faulted. + let before = pool.stats().faults; + assert_eq!(run.seek(&0), vec![(0, 1)]); + assert_eq!(pool.stats().faults - before, 1); + + // Straddling key: exactly the two straddled pages faulted. + run.evict_all(); + let before = pool.stats().faults; + assert_eq!(run.seek(&100), vec![(0, 1), (1, 2), (2, 3), (3, 4)]); + assert_eq!(pool.stats().faults - before, 2); + + // Interior and last keys: one page each. 
+ run.evict_all(); + let before = pool.stats().faults; + assert_eq!(run.seek(&150), vec![(150, 1)]); + assert_eq!(pool.stats().faults - before, 1); + run.evict_all(); + let before = pool.stats().faults; + assert_eq!(run.seek(&299), vec![(299, 1)]); + assert_eq!(pool.stats().faults - before, 1); + + // Absent keys: in the gap between pages and past the end. The fence + // search rejects both without faulting anything. + run.evict_all(); + let before = pool.stats().faults; + assert_eq!(run.seek(&225), Vec::new()); + assert_eq!(run.seek(&1000), Vec::new()); + assert_eq!(pool.stats().faults - before, 0); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn iter_matches_input() { + let pool = test_pool(256 << 20); + let input = straddle_input(); + let run = PagedRun::build(&pool, input.iter().map(|p| column(p))); + let expected: Vec = input.into_iter().flatten().collect(); + assert_eq!(run.iter().collect::>(), expected); + assert!( + run.residencies().iter().all(|r| *r == Residency::Evicted), + "a scan re-evicts every page it consumed" + ); + + // macOS `MADV_DONTNEED` may leave freed slot contents intact and the + // free list may hand a faulting chunk its previous slot, so a bug + // that skipped the extent read could pass by accident. Poison the + // free slots to prove the reads come from the extents. + pool.poison_free_slots(); + assert_eq!(run.iter().collect::>(), expected); + assert_eq!(run.seek(&100), vec![(0, 1), (1, 2), (2, 3), (3, 4)]); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn merge_matches_reference() { + // Budget of three 64 KiB-class pages: two pinned inputs plus slack. + let budget = 3 * (64 << 10); + let pool = test_pool(budget); + + // Two overlapping runs, split into 1000-update pages (24 KB each, + // landing in the 64 KiB class). Keys divisible by six cancel exactly. 
+ let a_updates: Vec = (0..6000).step_by(2).map(|d| (d, 0, 1)).collect(); + let b_updates: Vec = (0..6000) + .step_by(3) + .flat_map(|d| [(d, 0, -1), (d, 1, 1)]) + .collect(); + let a = PagedRun::build(&pool, a_updates.chunks(1000).map(column)); + let b = PagedRun::build(&pool, b_updates.chunks(1000).map(column)); + assert!(a.chunk_count() > 1 && b.chunk_count() > 1); + + let merged = PagedRun::merge(&pool, &a, &b); + let expected = reference_merge( + a_updates + .iter() + .chain(b_updates.iter()) + .copied() + .collect::>(), + ); + assert!(expected.iter().any(|u| u.1 == 1), "some updates survive"); + assert_eq!(merged.iter().collect::>(), expected); + + let stats = pool.stats(); + assert!( + stats.resident_bytes <= u64::try_from(budget).expect("fits"), + "resident {} exceeds budget {}", + stats.resident_bytes, + budget, + ); + assert!( + stats.evictions_cheap > 0, + "consumed input pages are re-evicted cheaply" + ); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn merge_of_empty_runs() { + let pool = test_pool(256 << 20); + let empty = PagedRun::::build(&pool, [column(&[])]); + assert!(empty.is_empty()); + assert_eq!(empty.chunk_count(), 0); + assert_eq!(empty.seek(&0), Vec::new()); + assert_eq!(empty.iter().count(), 0); + + let run = straddle_run(&pool); + let merged = PagedRun::merge(&pool, &empty, &run); + assert_eq!( + merged.iter().collect::>(), + reference_merge(run.iter().collect()), + ); + let both_empty = PagedRun::merge(&pool, &empty, &empty); + assert!(both_empty.is_empty()); + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn seek_traffic_respects_budget() { + // Read-only probing of a run much larger than the budget: seeks + // perform no inserts, so only fault-in-triggered budget enforcement + // keeps the faulted pages from accumulating without bound. 
+ let budget = 2 * (64 << 10); + let pool = test_pool(budget); + let updates: Vec = (0..20_000).map(|d| (d, d, 1)).collect(); + let run = PagedRun::build(&pool, updates.chunks(1000).map(column)); + assert!(run.chunk_count() >= 10, "run spans many pages"); + assert!( + run.len() * 24 > 2 * budget, + "run is larger than twice the budget" + ); + for d in (0..20_000).step_by(97) { + assert_eq!(run.seek(&d), vec![(d, 1)]); + let resident = pool.stats().resident_bytes; + assert!( + resident <= u64::try_from(budget).expect("fits"), + "resident {resident} exceeds budget {budget} under seek-only traffic", + ); + } + } + + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn oversize_pages_stay_resident() { + // A page whose serialized size exceeds the largest pool size class + // lands `Oversize`: always resident, exempt from eviction and budget + // enforcement. The prototype accepts this degradation; this test + // pins the behavior, that reads stay correct in it, and that the + // `oversize_bytes` gauge surfaces it. + let budget = 64 << 10; + let pool = test_pool(budget); + // Ten 1 MiB values serialize past the largest (8 MiB) size class. + let updates: Vec<(Vec, u64, i64)> = (0..10u8) + .map(|d| (vec![d; 1 << 20], u64::from(d), 1)) + .collect(); + let run = PagedRun::build(&pool, [Column::Typed(Columnar::as_columns(updates.iter()))]); + assert_eq!(run.chunk_count(), 1); + assert_eq!(run.residencies(), vec![Residency::Oversize]); + let stats = pool.stats(); + assert!( + stats.oversize_bytes > 8 << 20, + "oversize gauge reflects the page: {stats:?}" + ); + assert!( + stats.resident_bytes > u64::try_from(budget).expect("fits"), + "oversize pages escape the budget: {stats:?}" + ); + // Eviction and enforcement are no-ops for oversize pages. + run.evict_all(); + pool.enforce_budget(); + assert_eq!(run.residencies(), vec![Residency::Oversize]); + // Reads remain correct in the degraded mode. 
+ assert_eq!(run.seek(&vec![3u8; 1 << 20]), vec![(3u64, 1i64)]); + assert_eq!(run.iter().count(), updates.len()); + } + + /// Pages may relocate across evict/fault-in cycles (slots are scoped to + /// residency); correctness rests on pin-mediated access, so contents must + /// round-trip regardless of where a page lands. + #[mz_ore::test] + #[cfg_attr(miri, ignore)] // mmap and madvise are foreign calls + fn pages_round_trip_across_eviction() { + let pool = test_pool(256 << 20); + let run = straddle_run(&pool); + let before: Vec = run.chunks[0].pin().to_vec(); + pool.evict(&run.chunks[0]); + assert_eq!(run.chunks[0].residency(), Residency::Evicted); + pool.poison_free_slots(); + let pin = run.chunks[0].pin(); + assert_eq!(&*pin, &before[..], "contents survive relocation"); + } +}