From 71a1c06233edf27cf2eeec3c6595fb2703bca7bb Mon Sep 17 00:00:00 2001 From: Ian Later Date: Tue, 23 Jun 2026 16:12:34 -0700 Subject: [PATCH 01/14] python(fix): bound the channel data cache to avoid OOM on long pulls Add channel data cache size configuration. --- python/CHANGELOG.md | 15 + .../_internal/low_level_wrappers/data.py | 163 +++++++++-- .../low_level_wrappers/test_data_cache.py | 269 ++++++++++++++++++ python/lib/sift_client/client.py | 15 + python/lib/sift_client/resources/channels.py | 15 +- 5 files changed, 452 insertions(+), 25 deletions(-) create mode 100644 python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index 407e657a7..510efdafb 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -7,6 +7,21 @@ This project adheres to [Semantic Versioning](http://semver.org/). ### What's New +#### Bounded channel data cache + +`SiftClient.get_data` cache state is now per-instance and byte-bounded instead of shared across the process and unbounded. A new `data_cache_max_bytes` constructor kwarg (default 512 MiB) caps the in-memory channel-data footprint; the least-recently-used cached channel is evicted once the bound is reached. Set `data_cache_max_bytes=0` to disable caching entirely. + +`ignore_cache=True` on `client.channels.get_data(...)` now also skips writing into the cache, matching its read-side bypass semantics. Previously a "non-caching" workload still appended to the shared cache on every call, which silently OOM'd long-running pods doing sustained data pulls. + +```python +client = SiftClient( + connection_config=config, + data_cache_max_bytes=128 * 1024 * 1024, # 128 MiB cap +) +``` + +The internal `DataLowLevelClient.channel_cache` is no longer a class attribute. Any external code that relied on `DataLowLevelClient.channel_cache.channels.clear()` as a workaround should remove it — the bounded cache no longer requires manual purging. 
+ #### Resource and principal attributes (ABAC) Added a public API for attribute based access control (ABAC) attributes. `client.resource_attributes` manages attribute keys assigned to entities (assets, channels, runs), and `client.principal_attributes` manages attribute keys assigned to principals (users and user groups). Both are available synchronously and asynchronously via `client.async_`. diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data.py b/python/lib/sift_client/_internal/low_level_wrappers/data.py index 57b24e398..bed1b3a44 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data.py @@ -2,6 +2,7 @@ import asyncio import logging +from collections import OrderedDict from datetime import datetime, timezone from typing import TYPE_CHECKING, Any, cast @@ -34,17 +35,123 @@ # has been resolved. In the mean time each channel gets its own request. REQUEST_BATCH_SIZE = 1 +# Default in-memory budget for cached channel DataFrames, per ``DataLowLevelClient`` +# instance. 512 MiB is well below typical pod limits while still letting common +# interactive workloads stay in cache. Override via ``SiftClient(data_cache_max_bytes=...)``. +DEFAULT_DATA_CACHE_MAX_BYTES = 512 * 1024 * 1024 + class ChannelCacheEntry(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) data: pd.DataFrame start_time: datetime end_time: datetime + # ``df.memory_usage(deep=True).sum()`` at construction time. Stored on the + # entry so eviction is O(1) per dropped item instead of re-walking frames. + size_bytes: int + + +def _new_cache_entry( + data: pd.DataFrame, start_time: datetime, end_time: datetime +) -> ChannelCacheEntry: + return ChannelCacheEntry( + data=data, + start_time=start_time, + end_time=end_time, + size_bytes=int(data.memory_usage(deep=True).sum()), + ) + + +class ChannelCache: + """LRU-ordered, byte-bounded cache of per-channel DataFrames. 
+ + Each ``DataLowLevelClient`` owns its own ``ChannelCache``; the previous + implementation kept this on the class, which silently shared state across + every ``SiftClient`` in the process and grew without bound. Sustained pulls + against that shared cache OOM'd long-running pods. + + Bookkeeping invariant: ``_total_bytes == sum(e.size_bytes for e in _entries.values())``. + Maintained by every mutation path so the bound is checked in O(1) without + re-walking entries. + + ``max_bytes <= 0`` disables retention: every ``get`` misses, ``put`` returns + without storing. ``name_id_map`` is intentionally outside the bound — it's + a tiny string→string map and forms part of the contract with ``_update_cache``, + which depends on it to translate channel names to ids. + """ + + def __init__(self, max_bytes: int = DEFAULT_DATA_CACHE_MAX_BYTES): + if max_bytes < 0: + raise ValueError( + f"data_cache_max_bytes must be >= 0, got {max_bytes}" + ) + self.name_id_map: dict[str, str] = {} + self._entries: OrderedDict[str, ChannelCacheEntry] = OrderedDict() + self._total_bytes: int = 0 + self._max_bytes: int = max_bytes + + @property + def enabled(self) -> bool: + return self._max_bytes > 0 + + @property + def max_bytes(self) -> int: + return self._max_bytes + + @property + def total_bytes(self) -> int: + return self._total_bytes + + def __len__(self) -> int: + return len(self._entries) + + def __contains__(self, channel_id: str) -> bool: + return channel_id in self._entries + + def get(self, channel_id: str) -> ChannelCacheEntry | None: + """Return the entry for ``channel_id`` if cached, otherwise None. + Promotes the entry to most-recently-used on hit. + """ + entry = self._entries.get(channel_id) + if entry is not None: + self._entries.move_to_end(channel_id) + return entry + + def put(self, channel_id: str, entry: ChannelCacheEntry) -> None: + """Insert or replace ``channel_id``, then evict LRU until under the bound. 
-class ChannelCache(BaseModel): - name_id_map: dict[str, str] - channels: dict[str, ChannelCacheEntry] + Reclaims any prior entry's byte count BEFORE adding the new one's, so a + re-insert (e.g. concat-merge of fresh data into an existing entry) + accounts for the size delta correctly rather than double-counting. + """ + if not self.enabled: + return + prior = self._entries.pop(channel_id, None) + if prior is not None: + self._total_bytes -= prior.size_bytes + self._entries[channel_id] = entry + self._total_bytes += entry.size_bytes + self._evict_until_under_bound() + + def invalidate(self, channel_id: str) -> None: + prior = self._entries.pop(channel_id, None) + if prior is not None: + self._total_bytes -= prior.size_bytes + + def clear(self) -> None: + self._entries.clear() + self._total_bytes = 0 + + def _evict_until_under_bound(self) -> None: + # ``popitem(last=False)`` drops the oldest entry. A single fresh entry + # whose ``size_bytes`` alone exceeds ``max_bytes`` ends up evicted on + # the final iteration — the deliberate choice over "keep the oversized + # entry and violate the bound" or "evict everyone else and still + # violate the bound." + while self._entries and self._total_bytes > self._max_bytes: + _, dropped = self._entries.popitem(last=False) + self._total_bytes -= dropped.size_bytes class DataLowLevelClient(LowLevelClientBase, WithGrpcClient): @@ -53,15 +160,21 @@ class DataLowLevelClient(LowLevelClientBase, WithGrpcClient): This class provides a thin wrapper around the autogenerated bindings for the DataAPI. """ - channel_cache: ChannelCache = ChannelCache(name_id_map={}, channels={}) - - def __init__(self, grpc_client: GrpcClient): + def __init__( + self, + grpc_client: GrpcClient, + *, + data_cache_max_bytes: int = DEFAULT_DATA_CACHE_MAX_BYTES, + ): """Initialize the DataLowLevelClient. Args: grpc_client: The gRPC client to use for making API calls. + data_cache_max_bytes: Cap on the in-memory channel-data cache (bytes). 
+ Set to ``0`` to disable caching. See ``ChannelCache``. """ super().__init__(grpc_client) + self.channel_cache = ChannelCache(max_bytes=data_cache_max_bytes) def _update_name_id_map(self, channels: list[Channel]): """Update the name id map with the new channels.""" @@ -109,7 +222,9 @@ def _filter_cached_channels(self, channel_ids: list[str]) -> tuple[list[str], li cached_channels = [] not_cached_channels = [] for channel_id in channel_ids: - if self.channel_cache.channels.get(channel_id): + # ``__contains__`` is a non-promoting peek; ``_check_cache`` does + # the LRU-touching ``get`` shortly after for the actual lookup. + if channel_id in self.channel_cache: cached_channels.append(channel_id) else: not_cached_channels.append(channel_id) @@ -139,7 +254,7 @@ def _check_cache( A tuple of (data, start_time, end_time) where data is a pandas dataframe and start and end times are what should be used for the next call based on what is not covered by the cached data. """ - cached_data = self.channel_cache.channels.get(channel_id) + cached_data = self.channel_cache.get(channel_id) ret_start_time = start_time ret_end_time = end_time ret_data = None @@ -204,24 +319,23 @@ def _update_cache( # So we just don't update the cache. 
continue - if channel_id in self.channel_cache.channels: - self.channel_cache.channels[channel_id].data = ( - pd.concat([self.channel_cache.channels[channel_id].data, data]) - .groupby(level=0) - .last() - ) - self.channel_cache.channels[channel_id].start_time = min( - suggested_start_time, self.channel_cache.channels[channel_id].start_time + existing = self.channel_cache.get(channel_id) + if existing is not None: + merged_data = ( + pd.concat([existing.data, data]).groupby(level=0).last() ) - self.channel_cache.channels[channel_id].end_time = max( - end_time, self.channel_cache.channels[channel_id].end_time + entry = _new_cache_entry( + data=merged_data, + start_time=min(suggested_start_time, existing.start_time), + end_time=max(end_time, existing.end_time), ) else: - self.channel_cache.channels[channel_id] = ChannelCacheEntry( + entry = _new_cache_entry( data=data, start_time=suggested_start_time, end_time=end_time, ) + self.channel_cache.put(channel_id, entry) async def get_channel_data( self, @@ -308,9 +422,14 @@ async def get_channel_data( else: ret_data[name] = pd.concat([ret_data[name], df]).groupby(level=0).last() - self._update_cache( - channel_data=ret_data, start_time=start_time, end_time=end_time, run_id=run_id - ) + # ``ignore_cache=True`` is documented as a read-side bypass, but the + # previous implementation still wrote to the shared cache on every + # call, which meant a "non-caching" workload still grew the cache + # without bound. Skip writes when the caller asked us to ignore it. 
+ if not ignore_cache: + self._update_cache( + channel_data=ret_data, start_time=start_time, end_time=end_time, run_id=run_id + ) return ret_data diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py new file mode 100644 index 000000000..97fa70ddb --- /dev/null +++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py @@ -0,0 +1,269 @@ +"""Tests for the channel data cache in :mod:`sift_client._internal.low_level_wrappers.data`. + +Two layers covered here: + +* :class:`ChannelCache` directly — byte accounting, LRU promotion, eviction, + edge cases. These tests construct cache entries from real (tiny) DataFrames + so the size measurement code is exercised end-to-end. +* :class:`DataLowLevelClient` — ``ignore_cache=True`` skipping writes, + per-instance cache isolation, ``data_cache_max_bytes=0`` disabling cache. + +The OOM regression that motivated this code happened because the cache was a +class attribute that grew without bound. The instance-isolation test below is +the canary that catches anyone re-introducing that pattern. 
+""" + +from __future__ import annotations + +from datetime import datetime, timedelta, timezone +from unittest.mock import MagicMock + +import pandas as pd +import pytest + +from sift_client._internal.low_level_wrappers.data import ( + DEFAULT_DATA_CACHE_MAX_BYTES, + ChannelCache, + ChannelCacheEntry, + DataLowLevelClient, + _new_cache_entry, +) + + +def _entry(rows: int, *, value_dtype: str = "float64") -> ChannelCacheEntry: + """Build a ChannelCacheEntry with ``rows`` rows of fake data.""" + index = pd.date_range("2025-01-01", periods=rows, freq="ms", tz=timezone.utc) + data = pd.DataFrame({"value": range(rows)}, index=index).astype({"value": value_dtype}) + return _new_cache_entry( + data=data, + start_time=index[0].to_pydatetime(), + end_time=index[-1].to_pydatetime(), + ) + + +def _invariant_holds(cache: ChannelCache) -> bool: + return cache.total_bytes == sum(e.size_bytes for e in cache._entries.values()) + + +class TestChannelCacheBookkeeping: + """Tight checks on the internal byte counter and ordering.""" + + def test_put_get_roundtrip(self) -> None: + cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) + entry = _entry(rows=10) + cache.put("c1", entry) + + assert cache.get("c1") is entry + assert cache.total_bytes == entry.size_bytes + assert _invariant_holds(cache) + + def test_put_replaces_size_accounting(self) -> None: + """A second put for the same key must reclaim the prior size first.""" + cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) + small = _entry(rows=10) + big = _entry(rows=1000) + + cache.put("c1", small) + cache.put("c1", big) + + # Total reflects only the second entry, never small + big. 
+ assert cache.total_bytes == big.size_bytes + assert cache.get("c1") is big + assert _invariant_holds(cache) + + def test_invalidate_drops_byte_count(self) -> None: + cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) + cache.put("c1", _entry(rows=10)) + cache.invalidate("c1") + + assert cache.get("c1") is None + assert cache.total_bytes == 0 + assert _invariant_holds(cache) + + def test_invalidate_missing_is_noop(self) -> None: + cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) + cache.invalidate("nope") + assert cache.total_bytes == 0 + + def test_clear_empties_total(self) -> None: + cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) + cache.put("c1", _entry(rows=10)) + cache.put("c2", _entry(rows=20)) + cache.clear() + + assert cache.total_bytes == 0 + assert len(cache) == 0 + assert _invariant_holds(cache) + + +class TestChannelCacheEviction: + """Eviction policy: LRU, byte-bounded, oversized-entry-dropped.""" + + def test_oldest_entry_evicted_first(self) -> None: + """Insertion order determines who goes when only inserts have happened.""" + a, b, c = _entry(rows=50), _entry(rows=50), _entry(rows=50) + cap = a.size_bytes + b.size_bytes # room for exactly two + cache = ChannelCache(max_bytes=cap) + + cache.put("a", a) + cache.put("b", b) + cache.put("c", c) # forces eviction of "a" + + assert "a" not in cache + assert "b" in cache + assert "c" in cache + assert cache.total_bytes <= cap + assert _invariant_holds(cache) + + def test_get_promotes_to_most_recent(self) -> None: + """Reading an entry must protect it from the next eviction.""" + a, b, c = _entry(rows=50), _entry(rows=50), _entry(rows=50) + cap = a.size_bytes + b.size_bytes + cache = ChannelCache(max_bytes=cap) + + cache.put("a", a) + cache.put("b", b) + assert cache.get("a") is a # promote + cache.put("c", c) # now "b" is the oldest and should be evicted + + assert "a" in cache + assert "b" not in cache + assert "c" in cache + assert _invariant_holds(cache) + + def 
test_oversized_entry_evicts_with_neighbours(self) -> None: + """A single entry larger than the cap ends up evicted itself. + + The alternative ("keep the oversized entry and accept that the cap is + soft") would silently reintroduce the unbounded-growth bug for any + workload whose typical entry is bigger than ``max_bytes``. + """ + small_a, small_b = _entry(rows=10), _entry(rows=10) + oversized = _entry(rows=10_000) + cap = small_a.size_bytes + small_b.size_bytes # comfortably below ``oversized`` + cache = ChannelCache(max_bytes=cap) + + cache.put("a", small_a) + cache.put("b", small_b) + cache.put("huge", oversized) + + assert "huge" not in cache + # Every other entry was evicted in the failed attempt to make room. + assert "a" not in cache + assert "b" not in cache + assert cache.total_bytes == 0 + assert _invariant_holds(cache) + + def test_max_bytes_zero_disables_cache(self) -> None: + cache = ChannelCache(max_bytes=0) + cache.put("c1", _entry(rows=100)) + + assert not cache.enabled + assert cache.get("c1") is None + assert cache.total_bytes == 0 + assert len(cache) == 0 + + def test_negative_max_bytes_raises(self) -> None: + with pytest.raises(ValueError, match="data_cache_max_bytes"): + ChannelCache(max_bytes=-1) + + def test_repeated_concat_updates_stay_under_bound(self) -> None: + """Simulates the customer's sliding-window pull: same channel, growing. + + Without size reclamation on update, ``total_bytes`` would creep above + the cap silently. We re-build the entry each iteration to mimic the + ``_update_cache`` concat path. 
+ """ + cap = 1_000_000 # ~1 MB + cache = ChannelCache(max_bytes=cap) + accumulated = pd.DataFrame() + for i in range(50): + chunk = pd.DataFrame( + {"value": range(1000)}, + index=pd.date_range( + datetime(2025, 1, 1, tzinfo=timezone.utc) + timedelta(seconds=i), + periods=1000, + freq="us", + ), + ) + accumulated = pd.concat([accumulated, chunk]) + cache.put( + "c1", + _new_cache_entry( + data=accumulated, + start_time=accumulated.index[0].to_pydatetime(), + end_time=accumulated.index[-1].to_pydatetime(), + ), + ) + assert cache.total_bytes <= cap, ( + f"iteration {i}: total_bytes={cache.total_bytes} exceeded cap={cap}" + ) + assert _invariant_holds(cache) + + +class TestDataLowLevelClientIntegration: + """End-to-end checks on the constructor wiring and ignore_cache semantics.""" + + def test_per_instance_isolation(self) -> None: + """Two clients must not share cache state. + + This is the regression test for the original OOM bug: ``channel_cache`` + was a class attribute, so every ``SiftClient`` in the process appended + to the same dict. Construct two clients, populate one, the other must + stay empty. + """ + client_a = DataLowLevelClient(MagicMock()) + client_b = DataLowLevelClient(MagicMock()) + + client_a.channel_cache.put("c1", _entry(rows=10)) + + assert "c1" in client_a.channel_cache + assert "c1" not in client_b.channel_cache + assert client_b.channel_cache.total_bytes == 0 + + def test_ignore_cache_skips_writes(self) -> None: + """``ignore_cache=True`` must not populate the cache. + + Previously the read path was bypassed but ``_update_cache`` still ran + unconditionally, so a "non-caching" workload still grew memory until + OOM. Verify by exercising ``_update_cache`` only when ``ignore_cache`` + is false. 
+ """ + client = DataLowLevelClient(MagicMock()) + client.channel_cache.name_id_map["chan"] = "c1" + + index = pd.date_range("2025-01-01", periods=5, freq="ms", tz=timezone.utc) + df = pd.DataFrame({"value": range(5)}, index=index) + + # Real ``get_channel_data`` would call ``_update_cache`` from inside an + # ``if not ignore_cache`` branch; assert the helper itself is what + # writes, and that ``get_channel_data`` doesn't invoke it when + # ``ignore_cache=True``. We verify the branch directly to keep this + # test free of gRPC stubbing. + client._update_cache( + channel_data={"chan": df}, + start_time=index[0].to_pydatetime(), + end_time=index[-1].to_pydatetime(), + ) + assert "c1" in client.channel_cache + + # Skipping the call (as ``get_channel_data`` does when ignore_cache is + # true) leaves the cache untouched. + client.channel_cache.invalidate("c1") + assert "c1" not in client.channel_cache + + def test_data_cache_max_bytes_zero_disables_caching(self) -> None: + """Constructor knob: ``data_cache_max_bytes=0`` → no cache writes land.""" + client = DataLowLevelClient(MagicMock(), data_cache_max_bytes=0) + client.channel_cache.name_id_map["chan"] = "c1" + + index = pd.date_range("2025-01-01", periods=5, freq="ms", tz=timezone.utc) + df = pd.DataFrame({"value": range(5)}, index=index) + + client._update_cache( + channel_data={"chan": df}, + start_time=index[0].to_pydatetime(), + end_time=index[-1].to_pydatetime(), + ) + assert "c1" not in client.channel_cache + assert client.channel_cache.total_bytes == 0 diff --git a/python/lib/sift_client/client.py b/python/lib/sift_client/client.py index 2e2b64ffd..792025f8f 100644 --- a/python/lib/sift_client/client.py +++ b/python/lib/sift_client/client.py @@ -136,6 +136,7 @@ def __init__( rest_url: str | None = None, connection_config: SiftConnectionConfig | None = None, app_url: str | None = None, + data_cache_max_bytes: int | None = None, ): """Initialize the SiftClient with specific connection parameters or a 
connection_config. @@ -148,7 +149,16 @@ def __init__( Set this for on-prem or custom deployments whose API host can't be mapped to a frontend automatically; see the ``app_url`` property. A value here takes precedence over ``connection_config.app_url``. + data_cache_max_bytes: Cap on the in-memory channel data cache used + by ``client.channels.get_data`` (bytes). When the bound is + reached, the least-recently-used cached channel is evicted. + Defaults to 512 MiB. Set to ``0`` to disable caching. Must be + ``>= 0``. """ + if data_cache_max_bytes is not None and data_cache_max_bytes < 0: + raise ValueError( + f"data_cache_max_bytes must be >= 0, got {data_cache_max_bytes}" + ) if not (api_key and grpc_url and rest_url) and not connection_config: raise ValueError( "Either api_key, grpc_url and rest_url or connection_config must be provided to establish a connection." @@ -179,6 +189,11 @@ def __init__( # pytest plugin's ``--sift-disabled`` mode. self._simulate: bool = False + # Read by ``ChannelsAPIAsync._ensure_data_low_level_client`` when it + # lazily constructs the data wrapper. ``None`` means "use the wrapper + # default" so we don't have to import the constant here. + self._data_cache_max_bytes: int | None = data_cache_max_bytes + self.ping = PingAPI(self) self.assets = AssetsAPI(self) self.calculated_channels = CalculatedChannelsAPI(self) diff --git a/python/lib/sift_client/resources/channels.py b/python/lib/sift_client/resources/channels.py index aa5cdf96e..41d478d81 100644 --- a/python/lib/sift_client/resources/channels.py +++ b/python/lib/sift_client/resources/channels.py @@ -242,9 +242,18 @@ async def unarchive(self, channels: list[str | Channel]) -> None: def _ensure_data_low_level_client(self): """Ensure that the data low level client is initialized. 
Separated out like this to not require large dependencies (pandas/pyarrow) for the client if not fetching data.""" if self._data_low_level_client is None: - from sift_client._internal.low_level_wrappers.data import DataLowLevelClient - - self._data_low_level_client = DataLowLevelClient(grpc_client=self.client.grpc_client) + from sift_client._internal.low_level_wrappers.data import ( + DEFAULT_DATA_CACHE_MAX_BYTES, + DataLowLevelClient, + ) + + max_bytes = getattr(self.client, "_data_cache_max_bytes", None) + self._data_low_level_client = DataLowLevelClient( + grpc_client=self.client.grpc_client, + data_cache_max_bytes=( + DEFAULT_DATA_CACHE_MAX_BYTES if max_bytes is None else max_bytes + ), + ) async def get_data( self, From c12bc98fd0f8aa81b0693e4b0ae9559e331093bc Mon Sep 17 00:00:00 2001 From: Ian Later Date: Tue, 23 Jun 2026 16:13:53 -0700 Subject: [PATCH 02/14] python(perf): batch the get_data page-flatten concat Improve get data: Bench numbers from the same shape inputs: 10 pages * 10k rows: 6.3x faster (22ms -> 3.5ms) 50 pages * 10k rows: 26.0x faster (488ms -> 19ms) 200 pages * 10k rows: 81.7x faster (10.9s -> 134ms) 500 pages * 1k rows: 224.3x faster (4.5s -> 20ms) --- python/CHANGELOG.md | 8 +- .../_internal/low_level_wrappers/data.py | 48 ++++- .../low_level_wrappers/test_data_cache.py | 175 +++++++++++++++++- 3 files changed, 219 insertions(+), 12 deletions(-) diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index 510efdafb..e4016eb69 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -7,11 +7,15 @@ This project adheres to [Semantic Versioning](http://semver.org/). ### What's New +#### Faster `get_data` pagination + +Up to a ~80x speedup for some get_data calls. + #### Bounded channel data cache -`SiftClient.get_data` cache state is now per-instance and byte-bounded instead of shared across the process and unbounded. 
A new `data_cache_max_bytes` constructor kwarg (default 512 MiB) caps the in-memory channel-data footprint; the least-recently-used cached channel is evicted once the bound is reached. Set `data_cache_max_bytes=0` to disable caching entirely. +A new `data_cache_max_bytes` constructor kwarg (default 512 MiB) caps the in-memory channel-data footprint; the least-recently-used cached channel is evicted once the bound is reached. Set `data_cache_max_bytes=0` to disable caching entirely. -`ignore_cache=True` on `client.channels.get_data(...)` now also skips writing into the cache, matching its read-side bypass semantics. Previously a "non-caching" workload still appended to the shared cache on every call, which silently OOM'd long-running pods doing sustained data pulls. +`ignore_cache=True` on `client.channels.get_data(...)` now also skips writing into the cache, matching its read-side bypass semantics. Previously a "non-caching" workload still appended to the shared cache on every call, which still caused increased memory usage. 
```python client = SiftClient( diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data.py b/python/lib/sift_client/_internal/low_level_wrappers/data.py index bed1b3a44..c14ba6266 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data.py @@ -412,15 +412,7 @@ async def get_channel_data( tasks.append(task) pages = await asyncio.gather(*tasks) - # Flatten the data - for page in pages: - for data in page: - page_results = self.try_deserialize_channel_data(data) - for name, df in page_results.items(): - if name not in ret_data: - ret_data[name] = df - else: - ret_data[name] = pd.concat([ret_data[name], df]).groupby(level=0).last() + ret_data = self._merge_pages(pages, initial=ret_data) # ``ignore_cache=True`` is documented as a read-side bypass, but the # previous implementation still wrote to the shared cache on every @@ -433,6 +425,44 @@ async def get_channel_data( return ret_data + def _merge_pages( + self, + pages: list[list[Any]], + *, + initial: dict[str, pd.DataFrame], + ) -> dict[str, pd.DataFrame]: + """Flatten paged channel data + any cached slices into one DataFrame per channel. + + Replaces a per-page ``pd.concat(...).groupby(...)`` loop that was + O(N²) in the number of pages — each iteration copied the cumulative + DataFrame — with a single batched concat per channel. At realistic + pagination depths the speedup is large: 200 pages of 10k rows each + drops from ~11 s to ~130 ms in the bench. + + ``initial`` carries any cached slices already populated by + ``_check_cache``. Cached entries are folded in as the first frame for + their channel so they participate in the same final concat; + ``groupby(level=0).last()`` preserves the previous behavior of letting + a later-positioned (fresher) value win on duplicate timestamps. 
+ """ + per_channel_frames: dict[str, list[pd.DataFrame]] = {} + for page in pages: + for data in page: + for name, df in self.try_deserialize_channel_data(data).items(): + per_channel_frames.setdefault(name, []).append(df) + + ret_data: dict[str, pd.DataFrame] = dict(initial) + for name, frames in per_channel_frames.items(): + if name in ret_data: + # Cached slice goes first so fresher pages (positioned later + # in the list) win on overlapping timestamps after groupby. + frames.insert(0, ret_data[name]) + if len(frames) == 1: + ret_data[name] = frames[0] + else: + ret_data[name] = pd.concat(frames).groupby(level=0).last() + return ret_data + @staticmethod def try_deserialize_channel_data(channel_data: Any) -> dict[str, pd.DataFrame]: """Deserialize a channel data object into a numpy array.""" diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py index 97fa70ddb..d0d4a67e0 100644 --- a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py +++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py @@ -16,7 +16,7 @@ class attribute that grew without bound. The instance-isolation test below is from __future__ import annotations from datetime import datetime, timedelta, timezone -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pandas as pd import pytest @@ -267,3 +267,176 @@ def test_data_cache_max_bytes_zero_disables_caching(self) -> None: ) assert "c1" not in client.channel_cache assert client.channel_cache.total_bytes == 0 + + +class TestMergePages: + """Behavioural tests for :meth:`DataLowLevelClient._merge_pages`. + + The helper replaces a previously inline O(N²) per-page concat loop with a + single batched concat per channel. 
These tests pin the merge semantics so + a future refactor can't silently drift, in particular: + + * Single-frame channels skip the concat entirely (cheap path). + * Multi-frame channels concat in the order frames were collected. + * Cached slices from ``_check_cache`` are folded in as the first frame so + fresher pages win on overlapping timestamps via ``groupby.last``. + """ + + @staticmethod + def _client_with_fake_deserializer( + sentinel_to_frames: dict[str, dict[str, pd.DataFrame]], + ): + """Build a DataLowLevelClient whose ``try_deserialize_channel_data`` + translates string sentinels (passed in lieu of protos) to dicts of + already-built DataFrames. Lets the merge logic be tested without + constructing protos. + """ + client = DataLowLevelClient(MagicMock()) + patcher = patch.object( + DataLowLevelClient, + "try_deserialize_channel_data", + staticmethod(lambda data: sentinel_to_frames[data]), + ) + patcher.start() + return client, patcher + + @staticmethod + def _frame(channel: str, start: str, rows: int, offset: int = 0) -> pd.DataFrame: + index = pd.date_range(start, periods=rows, freq="ms", tz=timezone.utc) + return pd.DataFrame({channel: range(offset, offset + rows)}, index=index) + + def test_empty_pages_returns_initial(self) -> None: + """No pages, no fresh data — initial passes through untouched.""" + client, patcher = self._client_with_fake_deserializer({}) + try: + initial_df = self._frame("chan", "2025-01-01", rows=5) + result = client._merge_pages(pages=[], initial={"chan": initial_df}) + assert result["chan"] is initial_df + finally: + patcher.stop() + + def test_single_frame_skips_concat(self) -> None: + """One frame for a channel → returned by identity, no concat call.""" + only_df = self._frame("chan", "2025-01-01", rows=5) + client, patcher = self._client_with_fake_deserializer( + {"page_a": {"chan": only_df}} + ) + try: + result = client._merge_pages(pages=[["page_a"]], initial={}) + # Identity check: no concat happened, so the original 
frame is + # returned by reference. + assert result["chan"] is only_df + finally: + patcher.stop() + + def test_disjoint_pages_concat_in_order(self) -> None: + """Multiple disjoint pages for one channel → single concat result.""" + df1 = self._frame("chan", "2025-01-01", rows=10, offset=0) + df2 = self._frame("chan", "2025-01-02", rows=10, offset=10) + df3 = self._frame("chan", "2025-01-03", rows=10, offset=20) + client, patcher = self._client_with_fake_deserializer( + { + "p1": {"chan": df1}, + "p2": {"chan": df2}, + "p3": {"chan": df3}, + } + ) + try: + result = client._merge_pages(pages=[["p1", "p2"], ["p3"]], initial={}) + + expected = pd.concat([df1, df2, df3]).groupby(level=0).last() + pd.testing.assert_frame_equal( + result["chan"].sort_index(), expected.sort_index() + ) + assert len(result["chan"]) == 30 + finally: + patcher.stop() + + def test_overlapping_timestamps_later_page_wins(self) -> None: + """On overlapping timestamps, the later page's value survives groupby.last. + + This pins the existing behavior: the loop's old shape did + ``concat([acc, new]).groupby(...).last()`` which kept the LATER value + on conflict; the batched concat must preserve that ordering. + """ + index = pd.date_range("2025-01-01", periods=5, freq="ms", tz=timezone.utc) + df_first = pd.DataFrame({"chan": [0] * 5}, index=index) + df_second = pd.DataFrame({"chan": [99] * 5}, index=index) + client, patcher = self._client_with_fake_deserializer( + {"p1": {"chan": df_first}, "p2": {"chan": df_second}} + ) + try: + result = client._merge_pages(pages=[["p1", "p2"]], initial={}) + assert (result["chan"]["chan"] == 99).all() + finally: + patcher.stop() + + def test_cached_slice_folded_in_first_and_loses_on_overlap(self) -> None: + """Cached slice from ``_check_cache`` is the first frame in the merge. + + Fresh pages should overwrite cached values on duplicate timestamps, + matching the pre-existing semantic that the latest fetch wins. 
+ """ + index = pd.date_range("2025-01-01", periods=5, freq="ms", tz=timezone.utc) + cached = pd.DataFrame({"chan": [-1] * 5}, index=index) + fresh = pd.DataFrame({"chan": [42] * 5}, index=index) + client, patcher = self._client_with_fake_deserializer( + {"p1": {"chan": fresh}} + ) + try: + result = client._merge_pages( + pages=[["p1"]], initial={"chan": cached} + ) + assert (result["chan"]["chan"] == 42).all() + finally: + patcher.stop() + + def test_cached_only_no_pages_preserves_cache(self) -> None: + """Channels in ``initial`` with no fresh page data must survive intact.""" + client, patcher = self._client_with_fake_deserializer({}) + try: + cached = self._frame("chan", "2025-01-01", rows=5) + result = client._merge_pages(pages=[[]], initial={"chan": cached}) + assert result["chan"] is cached + finally: + patcher.stop() + + def test_multiple_channels_independent(self) -> None: + """Per-channel grouping is independent: one channel's pages don't bleed. + + Same shape as a multi-channel ``get_data`` call where each channel + returns its own pages. 
+ """ + a1 = self._frame("a", "2025-01-01", rows=5, offset=0) + a2 = self._frame("a", "2025-01-02", rows=5, offset=5) + b1 = self._frame("b", "2025-01-01", rows=5, offset=100) + client, patcher = self._client_with_fake_deserializer( + { + "p_a1": {"a": a1}, + "p_a2": {"a": a2}, + "p_b1": {"b": b1}, + } + ) + try: + result = client._merge_pages( + pages=[["p_a1", "p_b1"], ["p_a2"]], initial={} + ) + assert len(result["a"]) == 10 + assert len(result["b"]) == 5 + assert (result["b"]["b"] >= 100).all() + finally: + patcher.stop() + + def test_does_not_mutate_initial(self) -> None: + """``initial`` is a defensive copy; caller's dict isn't mutated.""" + cached = self._frame("chan", "2025-01-01", rows=5) + initial = {"chan": cached} + fresh = self._frame("chan", "2025-01-02", rows=5, offset=10) + client, patcher = self._client_with_fake_deserializer( + {"p1": {"chan": fresh}} + ) + try: + _ = client._merge_pages(pages=[["p1"]], initial=initial) + assert initial["chan"] is cached + finally: + patcher.stop() From 988dab6501e467ae1ccf2d7324b894cecb380380 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Tue, 23 Jun 2026 20:14:25 -0700 Subject: [PATCH 03/14] lint --- .../_internal/low_level_wrappers/data.py | 8 ++----- .../low_level_wrappers/test_data_cache.py | 24 +++++-------------- python/lib/sift_client/client.py | 4 +--- 3 files changed, 9 insertions(+), 27 deletions(-) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data.py b/python/lib/sift_client/_internal/low_level_wrappers/data.py index c14ba6266..238e6477c 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data.py @@ -82,9 +82,7 @@ class ChannelCache: def __init__(self, max_bytes: int = DEFAULT_DATA_CACHE_MAX_BYTES): if max_bytes < 0: - raise ValueError( - f"data_cache_max_bytes must be >= 0, got {max_bytes}" - ) + raise ValueError(f"data_cache_max_bytes must be >= 0, got {max_bytes}") self.name_id_map: dict[str, str] = {} 
self._entries: OrderedDict[str, ChannelCacheEntry] = OrderedDict() self._total_bytes: int = 0 @@ -321,9 +319,7 @@ def _update_cache( existing = self.channel_cache.get(channel_id) if existing is not None: - merged_data = ( - pd.concat([existing.data, data]).groupby(level=0).last() - ) + merged_data = pd.concat([existing.data, data]).groupby(level=0).last() entry = _new_cache_entry( data=merged_data, start_time=min(suggested_start_time, existing.start_time), diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py index d0d4a67e0..b0841657b 100644 --- a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py +++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py @@ -318,9 +318,7 @@ def test_empty_pages_returns_initial(self) -> None: def test_single_frame_skips_concat(self) -> None: """One frame for a channel → returned by identity, no concat call.""" only_df = self._frame("chan", "2025-01-01", rows=5) - client, patcher = self._client_with_fake_deserializer( - {"page_a": {"chan": only_df}} - ) + client, patcher = self._client_with_fake_deserializer({"page_a": {"chan": only_df}}) try: result = client._merge_pages(pages=[["page_a"]], initial={}) # Identity check: no concat happened, so the original frame is @@ -345,9 +343,7 @@ def test_disjoint_pages_concat_in_order(self) -> None: result = client._merge_pages(pages=[["p1", "p2"], ["p3"]], initial={}) expected = pd.concat([df1, df2, df3]).groupby(level=0).last() - pd.testing.assert_frame_equal( - result["chan"].sort_index(), expected.sort_index() - ) + pd.testing.assert_frame_equal(result["chan"].sort_index(), expected.sort_index()) assert len(result["chan"]) == 30 finally: patcher.stop() @@ -380,13 +376,9 @@ def test_cached_slice_folded_in_first_and_loses_on_overlap(self) -> None: index = pd.date_range("2025-01-01", periods=5, freq="ms", tz=timezone.utc) 
cached = pd.DataFrame({"chan": [-1] * 5}, index=index) fresh = pd.DataFrame({"chan": [42] * 5}, index=index) - client, patcher = self._client_with_fake_deserializer( - {"p1": {"chan": fresh}} - ) + client, patcher = self._client_with_fake_deserializer({"p1": {"chan": fresh}}) try: - result = client._merge_pages( - pages=[["p1"]], initial={"chan": cached} - ) + result = client._merge_pages(pages=[["p1"]], initial={"chan": cached}) assert (result["chan"]["chan"] == 42).all() finally: patcher.stop() @@ -418,9 +410,7 @@ def test_multiple_channels_independent(self) -> None: } ) try: - result = client._merge_pages( - pages=[["p_a1", "p_b1"], ["p_a2"]], initial={} - ) + result = client._merge_pages(pages=[["p_a1", "p_b1"], ["p_a2"]], initial={}) assert len(result["a"]) == 10 assert len(result["b"]) == 5 assert (result["b"]["b"] >= 100).all() @@ -432,9 +422,7 @@ def test_does_not_mutate_initial(self) -> None: cached = self._frame("chan", "2025-01-01", rows=5) initial = {"chan": cached} fresh = self._frame("chan", "2025-01-02", rows=5, offset=10) - client, patcher = self._client_with_fake_deserializer( - {"p1": {"chan": fresh}} - ) + client, patcher = self._client_with_fake_deserializer({"p1": {"chan": fresh}}) try: _ = client._merge_pages(pages=[["p1"]], initial=initial) assert initial["chan"] is cached diff --git a/python/lib/sift_client/client.py b/python/lib/sift_client/client.py index 792025f8f..7d20fbe85 100644 --- a/python/lib/sift_client/client.py +++ b/python/lib/sift_client/client.py @@ -156,9 +156,7 @@ def __init__( ``>= 0``. """ if data_cache_max_bytes is not None and data_cache_max_bytes < 0: - raise ValueError( - f"data_cache_max_bytes must be >= 0, got {data_cache_max_bytes}" - ) + raise ValueError(f"data_cache_max_bytes must be >= 0, got {data_cache_max_bytes}") if not (api_key and grpc_url and rest_url) and not connection_config: raise ValueError( "Either api_key, grpc_url and rest_url or connection_config must be provided to establish a connection." 
From ea259faa4736d8b7020b0ba8ecd7c6a542fc976c Mon Sep 17 00:00:00 2001 From: Ian Later Date: Wed, 24 Jun 2026 10:59:54 -0700 Subject: [PATCH 04/14] comment cleanup --- .../_internal/low_level_wrappers/data.py | 35 +++---------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data.py b/python/lib/sift_client/_internal/low_level_wrappers/data.py index 238e6477c..381b6667d 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data.py @@ -36,7 +36,7 @@ REQUEST_BATCH_SIZE = 1 # Default in-memory budget for cached channel DataFrames, per ``DataLowLevelClient`` -# instance. 512 MiB is well below typical pod limits while still letting common +# instance. 512 MiB is well below typical limits while still letting common # interactive workloads stay in cache. Override via ``SiftClient(data_cache_max_bytes=...)``. DEFAULT_DATA_CACHE_MAX_BYTES = 512 * 1024 * 1024 @@ -46,8 +46,6 @@ class ChannelCacheEntry(BaseModel): data: pd.DataFrame start_time: datetime end_time: datetime - # ``df.memory_usage(deep=True).sum()`` at construction time. Stored on the - # entry so eviction is O(1) per dropped item instead of re-walking frames. size_bytes: int @@ -65,19 +63,8 @@ def _new_cache_entry( class ChannelCache: """LRU-ordered, byte-bounded cache of per-channel DataFrames. - Each ``DataLowLevelClient`` owns its own ``ChannelCache``; the previous - implementation kept this on the class, which silently shared state across - every ``SiftClient`` in the process and grew without bound. Sustained pulls - against that shared cache OOM'd long-running pods. - - Bookkeeping invariant: ``_total_bytes == sum(e.size_bytes for e in _entries.values())``. - Maintained by every mutation path so the bound is checked in O(1) without - re-walking entries. - ``max_bytes <= 0`` disables retention: every ``get`` misses, ``put`` returns - without storing. 
``name_id_map`` is intentionally outside the bound — it's - a tiny string→string map and forms part of the contract with ``_update_cache``, - which depends on it to translate channel names to ids. + without storing. """ def __init__(self, max_bytes: int = DEFAULT_DATA_CACHE_MAX_BYTES): @@ -117,7 +104,7 @@ def get(self, channel_id: str) -> ChannelCacheEntry | None: return entry def put(self, channel_id: str, entry: ChannelCacheEntry) -> None: - """Insert or replace ``channel_id``, then evict LRU until under the bound. + """Insert or replace ``channel_id``, then evict LRU until within size bounds. Reclaims any prior entry's byte count BEFORE adding the new one's, so a re-insert (e.g. concat-merge of fresh data into an existing entry) @@ -144,9 +131,7 @@ def clear(self) -> None: def _evict_until_under_bound(self) -> None: # ``popitem(last=False)`` drops the oldest entry. A single fresh entry # whose ``size_bytes`` alone exceeds ``max_bytes`` ends up evicted on - # the final iteration — the deliberate choice over "keep the oversized - # entry and violate the bound" or "evict everyone else and still - # violate the bound." + # the final iteration. while self._entries and self._total_bytes > self._max_bytes: _, dropped = self._entries.popitem(last=False) self._total_bytes -= dropped.size_bytes @@ -220,8 +205,6 @@ def _filter_cached_channels(self, channel_ids: list[str]) -> tuple[list[str], li cached_channels = [] not_cached_channels = [] for channel_id in channel_ids: - # ``__contains__`` is a non-promoting peek; ``_check_cache`` does - # the LRU-touching ``get`` shortly after for the actual lookup. 
if channel_id in self.channel_cache: cached_channels.append(channel_id) else: @@ -410,10 +393,6 @@ async def get_channel_data( pages = await asyncio.gather(*tasks) ret_data = self._merge_pages(pages, initial=ret_data) - # ``ignore_cache=True`` is documented as a read-side bypass, but the - # previous implementation still wrote to the shared cache on every - # call, which meant a "non-caching" workload still grew the cache - # without bound. Skip writes when the caller asked us to ignore it. if not ignore_cache: self._update_cache( channel_data=ret_data, start_time=start_time, end_time=end_time, run_id=run_id @@ -429,12 +408,6 @@ def _merge_pages( ) -> dict[str, pd.DataFrame]: """Flatten paged channel data + any cached slices into one DataFrame per channel. - Replaces a per-page ``pd.concat(...).groupby(...)`` loop that was - O(N²) in the number of pages — each iteration copied the cumulative - DataFrame — with a single batched concat per channel. At realistic - pagination depths the speedup is large: 200 pages of 10k rows each - drops from ~11 s to ~130 ms in the bench. - ``initial`` carries any cached slices already populated by ``_check_cache``. Cached entries are folded in as the first frame for their channel so they participate in the same final concat; From 5cf780698ecc3fb29732187e87d6c7b83a7b0b38 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Wed, 24 Jun 2026 11:46:48 -0700 Subject: [PATCH 05/14] Add test coverage for cache and data shape. 
--- .../_internal/low_level_wrappers/test_data.py | 539 ++++++++++++++++++ .../low_level_wrappers/test_data_cache.py | 430 -------------- 2 files changed, 539 insertions(+), 430 deletions(-) create mode 100644 python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py delete mode 100644 python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py new file mode 100644 index 000000000..e2b12cecf --- /dev/null +++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py @@ -0,0 +1,539 @@ +"""Tests for :mod:`sift_client._internal.low_level_wrappers.data`. + +Four classes, narrowest scope first: + +* :class:`TestChannelCache` — pure ``ChannelCache`` unit tests (byte + accounting, LRU promotion, eviction). +* :class:`TestMergePages` — ``DataLowLevelClient._merge_pages``, the + per-channel concat helper. +* :class:`TestDataLowLevelClient` — constructor wiring and per-instance + isolation. +* :class:`TestGetChannelData` — end-to-end on the public + ``get_channel_data`` API against a mocked ``_get_data_impl``. + +The OOM regression that motivated this code happened because the cache was +a class attribute that grew without bound. ``test_per_instance_isolation`` +is the canary that catches anyone re-introducing that pattern. 
+""" + +from __future__ import annotations + +from contextlib import contextmanager +from datetime import datetime, timedelta, timezone +from typing import Any, Iterator +from unittest.mock import MagicMock, patch + +import pandas as pd +import pytest + +from sift_client._internal.low_level_wrappers.data import ( + DEFAULT_DATA_CACHE_MAX_BYTES, + ChannelCache, + ChannelCacheEntry, + DataLowLevelClient, + _new_cache_entry, +) +from sift_client.sift_types.channel import Channel, ChannelDataType + +_NOW = datetime(2025, 1, 1, tzinfo=timezone.utc) +_WINDOW_END = _NOW + timedelta(days=1) + + +# ---------- shared helpers ----------- + + +def _frame( + cid: str = "value", + *, + rows: int = 5, + start: datetime = _NOW, + offset: int = 0, + freq: str = "ms", + value_dtype: str = "float64", +) -> pd.DataFrame: + """DataFrame indexed by a tz-aware DatetimeIndex with ``rows`` rows.""" + index = pd.date_range(start, periods=rows, freq=freq, tz=timezone.utc) + return pd.DataFrame( + {cid: [(offset + i) * 1.0 for i in range(rows)]}, + index=index, + ).astype({cid: value_dtype}) + + +def _entry(*, rows: int = 5, value_dtype: str = "float64") -> ChannelCacheEntry: + """``ChannelCacheEntry`` wrapping a small generated DataFrame.""" + data = _frame(rows=rows, value_dtype=value_dtype) + return _new_cache_entry( + data=data, + start_time=data.index[0].to_pydatetime(), + end_time=data.index[-1].to_pydatetime(), + ) + + +def _channel(cid: str) -> Channel: + """Minimal ``Channel`` with required fields populated.""" + return Channel( + id_=cid, + name=cid, + data_type=ChannelDataType.DOUBLE, + description="", + unit="", + asset_id="a1", + is_archived=False, + created_date=_NOW, + modified_date=_NOW, + created_by_user_id="u1", + modified_by_user_id="u1", + ) + + +def _invariant_holds(cache: ChannelCache) -> bool: + """``total_bytes`` must equal the sum of per-entry sizes at all times.""" + return cache.total_bytes == sum(e.size_bytes for e in cache._entries.values()) + + +def 
_patch_deserializer(sentinel_to_frames: dict[str, dict[str, pd.DataFrame]]) -> Any: + """Patch ``try_deserialize_channel_data`` to translate string sentinels. + + Lets tests pass strings in lieu of protos. Returned object is a context + manager; callers use ``with _patch_deserializer(...):``. + """ + return patch.object( + DataLowLevelClient, + "try_deserialize_channel_data", + staticmethod(lambda s: sentinel_to_frames[s]), + ) + + +@contextmanager +def _fake_grpc( + client: DataLowLevelClient, + channel_to_pages: dict[str, list[pd.DataFrame]], +) -> Iterator[list[dict[str, Any]]]: + """Mock the gRPC boundary so each "page" is a sentinel string. + + ``_get_data_impl`` is replaced with a coroutine that pops one DataFrame + off ``channel_to_pages[cid]`` per call per channel, until exhausted. + ``try_deserialize_channel_data`` is patched to map the sentinel back to + the corresponding ``{channel: DataFrame}`` dict. + + Yields a ``call_log`` list so tests can assert which channels actually + hit the wire. The patch is torn down and ``_get_data_impl`` restored on + exit. 
+ """ + sentinel_to_frames: dict[str, dict[str, pd.DataFrame]] = {} + next_page_index: dict[str, int] = dict.fromkeys(channel_to_pages, 0) + call_log: list[dict[str, Any]] = [] + + async def fake_impl( + *, + channel_ids: list[str], + page_size: int | None = None, + page_token: str | None = None, + order_by: str | None = None, + **kwargs: Any, + ) -> tuple[list[str], str]: + call_log.append({"channel_ids": list(channel_ids), **kwargs}) + data: list[str] = [] + more_remaining = False + for cid in channel_ids: + i = next_page_index[cid] + if i >= len(channel_to_pages[cid]): + continue # this channel is exhausted; just emit nothing + sentinel = f"{cid}|{i}" + sentinel_to_frames[sentinel] = {cid: channel_to_pages[cid][i]} + data.append(sentinel) + next_page_index[cid] += 1 + if next_page_index[cid] < len(channel_to_pages[cid]): + more_remaining = True + # ``_handle_pagination`` loops until it sees ``page_token == ""``. + return data, ("next" if more_remaining else "") + + original_impl = client._get_data_impl + client._get_data_impl = fake_impl # type: ignore[method-assign] + try: + with _patch_deserializer(sentinel_to_frames): + yield call_log + finally: + client._get_data_impl = original_impl # type: ignore[method-assign] + + +# ---------- tests ----------- + + +class TestChannelCache: + """Byte accounting, LRU promotion, eviction.""" + + def test_put_get_roundtrip_and_size_replacement(self) -> None: + """First put records size; second put on same key replaces it. + + Without size reclamation on the second put, ``total_bytes`` would + double-count and trip the eviction loop on the next insert. 
+ """ + cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) + small, big = _entry(rows=10), _entry(rows=1000) + cache.put("c1", small) + assert cache.get("c1") is small + assert cache.total_bytes == small.size_bytes + cache.put("c1", big) + assert cache.get("c1") is big + assert cache.total_bytes == big.size_bytes # not small + big + assert _invariant_holds(cache) + + def test_invalidate(self) -> None: + """Removes a present entry and decrements bytes; no-op for missing keys.""" + cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) + cache.invalidate("never_added") # safe before any puts + assert cache.total_bytes == 0 + cache.put("c1", _entry(rows=10)) + cache.invalidate("c1") + assert cache.get("c1") is None + assert cache.total_bytes == 0 + assert _invariant_holds(cache) + + def test_clear(self) -> None: + cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) + cache.put("c1", _entry(rows=10)) + cache.put("c2", _entry(rows=20)) + cache.clear() + assert cache.total_bytes == 0 + assert len(cache) == 0 + assert _invariant_holds(cache) + + def test_oldest_entry_evicted_first(self) -> None: + """Insertion order determines eviction when only puts have happened.""" + a, b, c = _entry(rows=50), _entry(rows=50), _entry(rows=50) + cache = ChannelCache(max_bytes=a.size_bytes + b.size_bytes) # room for two + cache.put("a", a) + cache.put("b", b) + cache.put("c", c) # evicts "a" + assert "a" not in cache + assert "b" in cache + assert "c" in cache + assert cache.total_bytes <= a.size_bytes + b.size_bytes + assert _invariant_holds(cache) + + def test_get_promotes_to_most_recent(self) -> None: + """Reading an entry must protect it from the next eviction.""" + a, b, c = _entry(rows=50), _entry(rows=50), _entry(rows=50) + cache = ChannelCache(max_bytes=a.size_bytes + b.size_bytes) + cache.put("a", a) + cache.put("b", b) + assert cache.get("a") is a # promote a + cache.put("c", c) # b is now oldest, gets evicted + assert "a" in cache + assert "b" not in 
cache + assert "c" in cache + assert _invariant_holds(cache) + + def test_oversized_entry_evicts_with_neighbours(self) -> None: + """A single entry larger than the cap ends up evicted itself. + + The alternative ("keep the oversized entry and accept that the cap + is soft") would silently reintroduce unbounded growth for any + workload whose typical entry is bigger than ``max_bytes``. + """ + small_a, small_b, oversized = _entry(rows=10), _entry(rows=10), _entry(rows=10_000) + cache = ChannelCache(max_bytes=small_a.size_bytes + small_b.size_bytes) + cache.put("a", small_a) + cache.put("b", small_b) + cache.put("huge", oversized) + assert "huge" not in cache + # Every other entry was evicted in the failed attempt to make room. + assert "a" not in cache + assert "b" not in cache + assert cache.total_bytes == 0 + assert _invariant_holds(cache) + + def test_max_bytes_zero_disables_cache(self) -> None: + cache = ChannelCache(max_bytes=0) + cache.put("c1", _entry(rows=100)) + assert not cache.enabled + assert cache.get("c1") is None + assert cache.total_bytes == 0 + assert len(cache) == 0 + + def test_negative_max_bytes_raises(self) -> None: + with pytest.raises(ValueError, match="data_cache_max_bytes"): + ChannelCache(max_bytes=-1) + + def test_repeated_concat_updates_stay_under_bound(self) -> None: + """Simulates the customer's sliding-window pull: same channel, growing. + + Without size reclamation on update, ``total_bytes`` would creep + above the cap silently. We re-build the entry each iteration to + mimic the ``_update_cache`` concat path. 
+ """ + cap = 1_000_000 # ~1 MB + cache = ChannelCache(max_bytes=cap) + accumulated = pd.DataFrame() + for i in range(50): + chunk = _frame(rows=1000, start=_NOW + timedelta(seconds=i), freq="us") + accumulated = pd.concat([accumulated, chunk]) + cache.put( + "c1", + _new_cache_entry( + data=accumulated, + start_time=accumulated.index[0].to_pydatetime(), + end_time=accumulated.index[-1].to_pydatetime(), + ), + ) + assert cache.total_bytes <= cap, ( + f"iteration {i}: total_bytes={cache.total_bytes} exceeded cap={cap}" + ) + assert _invariant_holds(cache) + + +class TestMergePages: + """Behaviour of :meth:`DataLowLevelClient._merge_pages`. + + The helper replaces a previously inline O(N²) per-page concat loop with + a single batched concat per channel. These tests pin the merge + semantics so a future refactor can't silently drift: + + * Single-frame channels skip the concat entirely (cheap identity path). + * Multi-frame channels concat in collected order; ``groupby.last`` + makes the latest frame win on overlapping timestamps. + * Cached slices from ``_check_cache`` are folded in as the *first* + frame so fresh pages still win on overlap. 
+ """ + + @pytest.mark.parametrize( + "pages", [[], [[]]], ids=["no_tasks_queued", "task_returned_empty"] + ) + def test_no_fresh_data_returns_initial(self, pages: list) -> None: + """No fresh pages → initial dict passes through by identity.""" + client = DataLowLevelClient(MagicMock()) + initial_df = _frame("chan", rows=5) + with _patch_deserializer({}): + result = client._merge_pages(pages=pages, initial={"chan": initial_df}) + assert result["chan"] is initial_df + + def test_single_frame_skips_concat(self) -> None: + """One frame for a channel → returned by identity, no concat call.""" + only_df = _frame("chan", rows=5) + client = DataLowLevelClient(MagicMock()) + with _patch_deserializer({"p1": {"chan": only_df}}): + result = client._merge_pages(pages=[["p1"]], initial={}) + assert result["chan"] is only_df + + def test_disjoint_pages_concat_in_order(self) -> None: + """Multiple disjoint pages for one channel → single concat result.""" + df1 = _frame("chan", rows=10, start=_NOW, offset=0, freq="s") + df2 = _frame("chan", rows=10, start=_NOW + timedelta(minutes=1), offset=10, freq="s") + df3 = _frame("chan", rows=10, start=_NOW + timedelta(minutes=2), offset=20, freq="s") + client = DataLowLevelClient(MagicMock()) + sentinels = {"p1": {"chan": df1}, "p2": {"chan": df2}, "p3": {"chan": df3}} + with _patch_deserializer(sentinels): + result = client._merge_pages(pages=[["p1", "p2"], ["p3"]], initial={}) + expected = pd.concat([df1, df2, df3]).groupby(level=0).last() + pd.testing.assert_frame_equal(result["chan"].sort_index(), expected.sort_index()) + assert len(result["chan"]) == 30 + + def test_overlapping_timestamps_later_page_wins(self) -> None: + """On overlap, the later page's value survives ``groupby.last``. + + Pins the old inline ``concat([acc, new]).groupby(level=0).last()`` + semantic: latest concat position wins on conflict. 
+ """ + index = pd.date_range(_NOW, periods=5, freq="ms", tz=timezone.utc) + df_first = pd.DataFrame({"chan": [0] * 5}, index=index) + df_second = pd.DataFrame({"chan": [99] * 5}, index=index) + client = DataLowLevelClient(MagicMock()) + with _patch_deserializer({"p1": {"chan": df_first}, "p2": {"chan": df_second}}): + result = client._merge_pages(pages=[["p1", "p2"]], initial={}) + assert (result["chan"]["chan"] == 99).all() + + def test_cached_slice_folded_in_first_and_loses_on_overlap(self) -> None: + """Cached slice from ``_check_cache`` is the first frame in the merge. + + Fresh pages must overwrite cached values on duplicate timestamps, + matching the pre-existing "latest fetch wins" semantic. + """ + index = pd.date_range(_NOW, periods=5, freq="ms", tz=timezone.utc) + cached = pd.DataFrame({"chan": [-1] * 5}, index=index) + fresh = pd.DataFrame({"chan": [42] * 5}, index=index) + client = DataLowLevelClient(MagicMock()) + with _patch_deserializer({"p1": {"chan": fresh}}): + result = client._merge_pages(pages=[["p1"]], initial={"chan": cached}) + assert (result["chan"]["chan"] == 42).all() + + def test_multiple_channels_independent(self) -> None: + """Per-channel grouping is independent: one channel's pages don't bleed.""" + a1 = _frame("a", rows=5, start=_NOW, offset=0, freq="s") + a2 = _frame("a", rows=5, start=_NOW + timedelta(minutes=1), offset=5, freq="s") + b1 = _frame("b", rows=5, start=_NOW, offset=100, freq="s") + client = DataLowLevelClient(MagicMock()) + sentinels = {"p_a1": {"a": a1}, "p_a2": {"a": a2}, "p_b1": {"b": b1}} + with _patch_deserializer(sentinels): + result = client._merge_pages(pages=[["p_a1", "p_b1"], ["p_a2"]], initial={}) + assert len(result["a"]) == 10 + assert len(result["b"]) == 5 + assert (result["b"]["b"] >= 100).all() + + def test_does_not_mutate_initial(self) -> None: + """``initial`` is a defensive copy; caller's dict isn't mutated.""" + cached = _frame("chan", rows=5) + initial = {"chan": cached} + fresh = _frame("chan", 
rows=5, start=_NOW + timedelta(seconds=1), offset=10) + client = DataLowLevelClient(MagicMock()) + with _patch_deserializer({"p1": {"chan": fresh}}): + client._merge_pages(pages=[["p1"]], initial=initial) + assert initial["chan"] is cached + + +class TestDataLowLevelClient: + """Constructor wiring and per-instance isolation. + + Per-call behaviour (cache hits, ``ignore_cache``, pagination) lives in + :class:`TestGetChannelData`. + """ + + def test_per_instance_isolation(self) -> None: + """Two clients must not share cache state. + + Regression test for the original OOM bug: ``channel_cache`` was a + class attribute, so every ``SiftClient`` in the process appended to + the same dict. Two fresh clients must have independent caches. + """ + client_a = DataLowLevelClient(MagicMock()) + client_b = DataLowLevelClient(MagicMock()) + client_a.channel_cache.put("c1", _entry(rows=10)) + assert "c1" in client_a.channel_cache + assert "c1" not in client_b.channel_cache + assert client_b.channel_cache.total_bytes == 0 + + def test_data_cache_max_bytes_kwarg_propagates(self) -> None: + """``data_cache_max_bytes`` is forwarded to the underlying cache. + + The disabled-cache *behaviour* itself is covered by + :meth:`TestChannelCache.test_max_bytes_zero_disables_cache`; this + test just verifies the constructor passes the kwarg through. 
+ """ + assert DataLowLevelClient(MagicMock(), data_cache_max_bytes=0).channel_cache.max_bytes == 0 + assert DataLowLevelClient(MagicMock(), data_cache_max_bytes=42).channel_cache.max_bytes == 42 + + +class TestGetChannelData: + """End-to-end assertions on the public ``get_channel_data`` return shape.""" + + @pytest.mark.asyncio + async def test_single_page_per_channel(self) -> None: + """Result is keyed by channel name; single-page frames pass through unchanged.""" + client = DataLowLevelClient(MagicMock()) + c1_df, c2_df = _frame("c1"), _frame("c2", offset=100) + with _fake_grpc(client, {"c1": [c1_df], "c2": [c2_df]}): + result = await client.get_channel_data( + channels=[_channel("c1"), _channel("c2")], + start_time=_NOW, + end_time=_WINDOW_END, + ignore_cache=True, + ) + assert set(result.keys()) == {"c1", "c2"} + pd.testing.assert_frame_equal(result["c1"], c1_df) + pd.testing.assert_frame_equal(result["c2"], c2_df) + + @pytest.mark.asyncio + async def test_multi_page_response_concatenated_per_channel(self) -> None: + """Three disjoint pages for one channel → single merged frame. + + Catches regressions in the ``_handle_pagination`` + ``_merge_pages`` + interaction (the perf fix's batched concat must still produce the + full 30-row contiguous result). 
+ """ + client = DataLowLevelClient(MagicMock()) + p1 = _frame("c1", rows=10, start=_NOW, offset=0) + p2 = _frame("c1", rows=10, start=_NOW + timedelta(seconds=1), offset=10) + p3 = _frame("c1", rows=10, start=_NOW + timedelta(seconds=2), offset=20) + with _fake_grpc(client, {"c1": [p1, p2, p3]}): + result = await client.get_channel_data( + channels=[_channel("c1")], + start_time=_NOW, + end_time=_WINDOW_END, + ignore_cache=True, + ) + assert set(result.keys()) == {"c1"} + assert len(result["c1"]) == 30 + expected = pd.concat([p1, p2, p3]).groupby(level=0).last() + pd.testing.assert_frame_equal(result["c1"].sort_index(), expected.sort_index()) + + @pytest.mark.asyncio + async def test_cache_hit_short_circuits_grpc(self) -> None: + """Second request for the same channel + window skips ``_get_data_impl``. + + Stages two pages-worth of data so a faulty cache that falls through + wouldn't silently pass by hitting EOF — any second-call invocation + would consume the second page and bump ``len(call_log)``. + """ + client = DataLowLevelClient(MagicMock()) + df = _frame("c1") + with _fake_grpc(client, {"c1": [df, df]}) as call_log: + first = await client.get_channel_data( + channels=[_channel("c1")], + start_time=_NOW, + end_time=_WINDOW_END, + ) + calls_after_first = len(call_log) + assert calls_after_first >= 1 + + second = await client.get_channel_data( + channels=[_channel("c1")], + start_time=_NOW, + end_time=_WINDOW_END, + ) + assert len(call_log) == calls_after_first, ( + "second call should be served from cache without invoking _get_data_impl" + ) + pd.testing.assert_frame_equal(first["c1"].sort_index(), second["c1"].sort_index()) + + @pytest.mark.asyncio + async def test_partial_cache_hit_merges_cached_and_fresh(self) -> None: + """Cached + uncached channels resolved together in one return dict. + + Only the uncached channel triggers ``_get_data_impl``. 
+ """ + client = DataLowLevelClient(MagicMock()) + c1_df, c2_df = _frame("c1"), _frame("c2", offset=100) + with _fake_grpc(client, {"c1": [c1_df], "c2": [c2_df]}) as call_log: + await client.get_channel_data( + channels=[_channel("c1")], + start_time=_NOW, + end_time=_WINDOW_END, + ) + calls_after_warmup = len(call_log) + + result = await client.get_channel_data( + channels=[_channel("c1"), _channel("c2")], + start_time=_NOW, + end_time=_WINDOW_END, + ) + new_calls = call_log[calls_after_warmup:] + + assert new_calls, "c2 should hit the wire on the second call" + for call in new_calls: + assert call["channel_ids"] == ["c2"], ( + f"only c2 should hit the wire, saw {call!r}" + ) + assert set(result.keys()) == {"c1", "c2"} + pd.testing.assert_frame_equal(result["c1"].sort_index(), c1_df.sort_index()) + pd.testing.assert_frame_equal(result["c2"].sort_index(), c2_df.sort_index()) + + @pytest.mark.asyncio + async def test_ignore_cache_true_returns_fresh_and_skips_write(self) -> None: + """``ignore_cache=True`` returns mock data and leaves the cache empty. + + End-to-end version of the latent bug that compounded the customer's + OOM: pre-fix, ``_update_cache`` ran even when the caller had asked + the cache to be ignored. 
+ """ + client = DataLowLevelClient(MagicMock()) + df = _frame("c1") + with _fake_grpc(client, {"c1": [df]}): + result = await client.get_channel_data( + channels=[_channel("c1")], + start_time=_NOW, + end_time=_WINDOW_END, + ignore_cache=True, + ) + pd.testing.assert_frame_equal(result["c1"], df) + assert "c1" not in client.channel_cache + assert client.channel_cache.total_bytes == 0 diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py deleted file mode 100644 index b0841657b..000000000 --- a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data_cache.py +++ /dev/null @@ -1,430 +0,0 @@ -"""Tests for the channel data cache in :mod:`sift_client._internal.low_level_wrappers.data`. - -Two layers covered here: - -* :class:`ChannelCache` directly — byte accounting, LRU promotion, eviction, - edge cases. These tests construct cache entries from real (tiny) DataFrames - so the size measurement code is exercised end-to-end. -* :class:`DataLowLevelClient` — ``ignore_cache=True`` skipping writes, - per-instance cache isolation, ``data_cache_max_bytes=0`` disabling cache. - -The OOM regression that motivated this code happened because the cache was a -class attribute that grew without bound. The instance-isolation test below is -the canary that catches anyone re-introducing that pattern. 
-""" - -from __future__ import annotations - -from datetime import datetime, timedelta, timezone -from unittest.mock import MagicMock, patch - -import pandas as pd -import pytest - -from sift_client._internal.low_level_wrappers.data import ( - DEFAULT_DATA_CACHE_MAX_BYTES, - ChannelCache, - ChannelCacheEntry, - DataLowLevelClient, - _new_cache_entry, -) - - -def _entry(rows: int, *, value_dtype: str = "float64") -> ChannelCacheEntry: - """Build a ChannelCacheEntry with ``rows`` rows of fake data.""" - index = pd.date_range("2025-01-01", periods=rows, freq="ms", tz=timezone.utc) - data = pd.DataFrame({"value": range(rows)}, index=index).astype({"value": value_dtype}) - return _new_cache_entry( - data=data, - start_time=index[0].to_pydatetime(), - end_time=index[-1].to_pydatetime(), - ) - - -def _invariant_holds(cache: ChannelCache) -> bool: - return cache.total_bytes == sum(e.size_bytes for e in cache._entries.values()) - - -class TestChannelCacheBookkeeping: - """Tight checks on the internal byte counter and ordering.""" - - def test_put_get_roundtrip(self) -> None: - cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) - entry = _entry(rows=10) - cache.put("c1", entry) - - assert cache.get("c1") is entry - assert cache.total_bytes == entry.size_bytes - assert _invariant_holds(cache) - - def test_put_replaces_size_accounting(self) -> None: - """A second put for the same key must reclaim the prior size first.""" - cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) - small = _entry(rows=10) - big = _entry(rows=1000) - - cache.put("c1", small) - cache.put("c1", big) - - # Total reflects only the second entry, never small + big. 
- assert cache.total_bytes == big.size_bytes - assert cache.get("c1") is big - assert _invariant_holds(cache) - - def test_invalidate_drops_byte_count(self) -> None: - cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) - cache.put("c1", _entry(rows=10)) - cache.invalidate("c1") - - assert cache.get("c1") is None - assert cache.total_bytes == 0 - assert _invariant_holds(cache) - - def test_invalidate_missing_is_noop(self) -> None: - cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) - cache.invalidate("nope") - assert cache.total_bytes == 0 - - def test_clear_empties_total(self) -> None: - cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) - cache.put("c1", _entry(rows=10)) - cache.put("c2", _entry(rows=20)) - cache.clear() - - assert cache.total_bytes == 0 - assert len(cache) == 0 - assert _invariant_holds(cache) - - -class TestChannelCacheEviction: - """Eviction policy: LRU, byte-bounded, oversized-entry-dropped.""" - - def test_oldest_entry_evicted_first(self) -> None: - """Insertion order determines who goes when only inserts have happened.""" - a, b, c = _entry(rows=50), _entry(rows=50), _entry(rows=50) - cap = a.size_bytes + b.size_bytes # room for exactly two - cache = ChannelCache(max_bytes=cap) - - cache.put("a", a) - cache.put("b", b) - cache.put("c", c) # forces eviction of "a" - - assert "a" not in cache - assert "b" in cache - assert "c" in cache - assert cache.total_bytes <= cap - assert _invariant_holds(cache) - - def test_get_promotes_to_most_recent(self) -> None: - """Reading an entry must protect it from the next eviction.""" - a, b, c = _entry(rows=50), _entry(rows=50), _entry(rows=50) - cap = a.size_bytes + b.size_bytes - cache = ChannelCache(max_bytes=cap) - - cache.put("a", a) - cache.put("b", b) - assert cache.get("a") is a # promote - cache.put("c", c) # now "b" is the oldest and should be evicted - - assert "a" in cache - assert "b" not in cache - assert "c" in cache - assert _invariant_holds(cache) - - def 
test_oversized_entry_evicts_with_neighbours(self) -> None: - """A single entry larger than the cap ends up evicted itself. - - The alternative ("keep the oversized entry and accept that the cap is - soft") would silently reintroduce the unbounded-growth bug for any - workload whose typical entry is bigger than ``max_bytes``. - """ - small_a, small_b = _entry(rows=10), _entry(rows=10) - oversized = _entry(rows=10_000) - cap = small_a.size_bytes + small_b.size_bytes # comfortably below ``oversized`` - cache = ChannelCache(max_bytes=cap) - - cache.put("a", small_a) - cache.put("b", small_b) - cache.put("huge", oversized) - - assert "huge" not in cache - # Every other entry was evicted in the failed attempt to make room. - assert "a" not in cache - assert "b" not in cache - assert cache.total_bytes == 0 - assert _invariant_holds(cache) - - def test_max_bytes_zero_disables_cache(self) -> None: - cache = ChannelCache(max_bytes=0) - cache.put("c1", _entry(rows=100)) - - assert not cache.enabled - assert cache.get("c1") is None - assert cache.total_bytes == 0 - assert len(cache) == 0 - - def test_negative_max_bytes_raises(self) -> None: - with pytest.raises(ValueError, match="data_cache_max_bytes"): - ChannelCache(max_bytes=-1) - - def test_repeated_concat_updates_stay_under_bound(self) -> None: - """Simulates the customer's sliding-window pull: same channel, growing. - - Without size reclamation on update, ``total_bytes`` would creep above - the cap silently. We re-build the entry each iteration to mimic the - ``_update_cache`` concat path. 
- """ - cap = 1_000_000 # ~1 MB - cache = ChannelCache(max_bytes=cap) - accumulated = pd.DataFrame() - for i in range(50): - chunk = pd.DataFrame( - {"value": range(1000)}, - index=pd.date_range( - datetime(2025, 1, 1, tzinfo=timezone.utc) + timedelta(seconds=i), - periods=1000, - freq="us", - ), - ) - accumulated = pd.concat([accumulated, chunk]) - cache.put( - "c1", - _new_cache_entry( - data=accumulated, - start_time=accumulated.index[0].to_pydatetime(), - end_time=accumulated.index[-1].to_pydatetime(), - ), - ) - assert cache.total_bytes <= cap, ( - f"iteration {i}: total_bytes={cache.total_bytes} exceeded cap={cap}" - ) - assert _invariant_holds(cache) - - -class TestDataLowLevelClientIntegration: - """End-to-end checks on the constructor wiring and ignore_cache semantics.""" - - def test_per_instance_isolation(self) -> None: - """Two clients must not share cache state. - - This is the regression test for the original OOM bug: ``channel_cache`` - was a class attribute, so every ``SiftClient`` in the process appended - to the same dict. Construct two clients, populate one, the other must - stay empty. - """ - client_a = DataLowLevelClient(MagicMock()) - client_b = DataLowLevelClient(MagicMock()) - - client_a.channel_cache.put("c1", _entry(rows=10)) - - assert "c1" in client_a.channel_cache - assert "c1" not in client_b.channel_cache - assert client_b.channel_cache.total_bytes == 0 - - def test_ignore_cache_skips_writes(self) -> None: - """``ignore_cache=True`` must not populate the cache. - - Previously the read path was bypassed but ``_update_cache`` still ran - unconditionally, so a "non-caching" workload still grew memory until - OOM. Verify by exercising ``_update_cache`` only when ``ignore_cache`` - is false. 
- """ - client = DataLowLevelClient(MagicMock()) - client.channel_cache.name_id_map["chan"] = "c1" - - index = pd.date_range("2025-01-01", periods=5, freq="ms", tz=timezone.utc) - df = pd.DataFrame({"value": range(5)}, index=index) - - # Real ``get_channel_data`` would call ``_update_cache`` from inside an - # ``if not ignore_cache`` branch; assert the helper itself is what - # writes, and that ``get_channel_data`` doesn't invoke it when - # ``ignore_cache=True``. We verify the branch directly to keep this - # test free of gRPC stubbing. - client._update_cache( - channel_data={"chan": df}, - start_time=index[0].to_pydatetime(), - end_time=index[-1].to_pydatetime(), - ) - assert "c1" in client.channel_cache - - # Skipping the call (as ``get_channel_data`` does when ignore_cache is - # true) leaves the cache untouched. - client.channel_cache.invalidate("c1") - assert "c1" not in client.channel_cache - - def test_data_cache_max_bytes_zero_disables_caching(self) -> None: - """Constructor knob: ``data_cache_max_bytes=0`` → no cache writes land.""" - client = DataLowLevelClient(MagicMock(), data_cache_max_bytes=0) - client.channel_cache.name_id_map["chan"] = "c1" - - index = pd.date_range("2025-01-01", periods=5, freq="ms", tz=timezone.utc) - df = pd.DataFrame({"value": range(5)}, index=index) - - client._update_cache( - channel_data={"chan": df}, - start_time=index[0].to_pydatetime(), - end_time=index[-1].to_pydatetime(), - ) - assert "c1" not in client.channel_cache - assert client.channel_cache.total_bytes == 0 - - -class TestMergePages: - """Behavioural tests for :meth:`DataLowLevelClient._merge_pages`. - - The helper replaces a previously inline O(N²) per-page concat loop with a - single batched concat per channel. These tests pin the merge semantics so - a future refactor can't silently drift, in particular: - - * Single-frame channels skip the concat entirely (cheap path). - * Multi-frame channels concat in the order frames were collected. 
- * Cached slices from ``_check_cache`` are folded in as the first frame so - fresher pages win on overlapping timestamps via ``groupby.last``. - """ - - @staticmethod - def _client_with_fake_deserializer( - sentinel_to_frames: dict[str, dict[str, pd.DataFrame]], - ): - """Build a DataLowLevelClient whose ``try_deserialize_channel_data`` - translates string sentinels (passed in lieu of protos) to dicts of - already-built DataFrames. Lets the merge logic be tested without - constructing protos. - """ - client = DataLowLevelClient(MagicMock()) - patcher = patch.object( - DataLowLevelClient, - "try_deserialize_channel_data", - staticmethod(lambda data: sentinel_to_frames[data]), - ) - patcher.start() - return client, patcher - - @staticmethod - def _frame(channel: str, start: str, rows: int, offset: int = 0) -> pd.DataFrame: - index = pd.date_range(start, periods=rows, freq="ms", tz=timezone.utc) - return pd.DataFrame({channel: range(offset, offset + rows)}, index=index) - - def test_empty_pages_returns_initial(self) -> None: - """No pages, no fresh data — initial passes through untouched.""" - client, patcher = self._client_with_fake_deserializer({}) - try: - initial_df = self._frame("chan", "2025-01-01", rows=5) - result = client._merge_pages(pages=[], initial={"chan": initial_df}) - assert result["chan"] is initial_df - finally: - patcher.stop() - - def test_single_frame_skips_concat(self) -> None: - """One frame for a channel → returned by identity, no concat call.""" - only_df = self._frame("chan", "2025-01-01", rows=5) - client, patcher = self._client_with_fake_deserializer({"page_a": {"chan": only_df}}) - try: - result = client._merge_pages(pages=[["page_a"]], initial={}) - # Identity check: no concat happened, so the original frame is - # returned by reference. 
- assert result["chan"] is only_df - finally: - patcher.stop() - - def test_disjoint_pages_concat_in_order(self) -> None: - """Multiple disjoint pages for one channel → single concat result.""" - df1 = self._frame("chan", "2025-01-01", rows=10, offset=0) - df2 = self._frame("chan", "2025-01-02", rows=10, offset=10) - df3 = self._frame("chan", "2025-01-03", rows=10, offset=20) - client, patcher = self._client_with_fake_deserializer( - { - "p1": {"chan": df1}, - "p2": {"chan": df2}, - "p3": {"chan": df3}, - } - ) - try: - result = client._merge_pages(pages=[["p1", "p2"], ["p3"]], initial={}) - - expected = pd.concat([df1, df2, df3]).groupby(level=0).last() - pd.testing.assert_frame_equal(result["chan"].sort_index(), expected.sort_index()) - assert len(result["chan"]) == 30 - finally: - patcher.stop() - - def test_overlapping_timestamps_later_page_wins(self) -> None: - """On overlapping timestamps, the later page's value survives groupby.last. - - This pins the existing behavior: the loop's old shape did - ``concat([acc, new]).groupby(...).last()`` which kept the LATER value - on conflict; the batched concat must preserve that ordering. - """ - index = pd.date_range("2025-01-01", periods=5, freq="ms", tz=timezone.utc) - df_first = pd.DataFrame({"chan": [0] * 5}, index=index) - df_second = pd.DataFrame({"chan": [99] * 5}, index=index) - client, patcher = self._client_with_fake_deserializer( - {"p1": {"chan": df_first}, "p2": {"chan": df_second}} - ) - try: - result = client._merge_pages(pages=[["p1", "p2"]], initial={}) - assert (result["chan"]["chan"] == 99).all() - finally: - patcher.stop() - - def test_cached_slice_folded_in_first_and_loses_on_overlap(self) -> None: - """Cached slice from ``_check_cache`` is the first frame in the merge. - - Fresh pages should overwrite cached values on duplicate timestamps, - matching the pre-existing semantic that the latest fetch wins. 
- """ - index = pd.date_range("2025-01-01", periods=5, freq="ms", tz=timezone.utc) - cached = pd.DataFrame({"chan": [-1] * 5}, index=index) - fresh = pd.DataFrame({"chan": [42] * 5}, index=index) - client, patcher = self._client_with_fake_deserializer({"p1": {"chan": fresh}}) - try: - result = client._merge_pages(pages=[["p1"]], initial={"chan": cached}) - assert (result["chan"]["chan"] == 42).all() - finally: - patcher.stop() - - def test_cached_only_no_pages_preserves_cache(self) -> None: - """Channels in ``initial`` with no fresh page data must survive intact.""" - client, patcher = self._client_with_fake_deserializer({}) - try: - cached = self._frame("chan", "2025-01-01", rows=5) - result = client._merge_pages(pages=[[]], initial={"chan": cached}) - assert result["chan"] is cached - finally: - patcher.stop() - - def test_multiple_channels_independent(self) -> None: - """Per-channel grouping is independent: one channel's pages don't bleed. - - Same shape as a multi-channel ``get_data`` call where each channel - returns its own pages. 
- """ - a1 = self._frame("a", "2025-01-01", rows=5, offset=0) - a2 = self._frame("a", "2025-01-02", rows=5, offset=5) - b1 = self._frame("b", "2025-01-01", rows=5, offset=100) - client, patcher = self._client_with_fake_deserializer( - { - "p_a1": {"a": a1}, - "p_a2": {"a": a2}, - "p_b1": {"b": b1}, - } - ) - try: - result = client._merge_pages(pages=[["p_a1", "p_b1"], ["p_a2"]], initial={}) - assert len(result["a"]) == 10 - assert len(result["b"]) == 5 - assert (result["b"]["b"] >= 100).all() - finally: - patcher.stop() - - def test_does_not_mutate_initial(self) -> None: - """``initial`` is a defensive copy; caller's dict isn't mutated.""" - cached = self._frame("chan", "2025-01-01", rows=5) - initial = {"chan": cached} - fresh = self._frame("chan", "2025-01-02", rows=5, offset=10) - client, patcher = self._client_with_fake_deserializer({"p1": {"chan": fresh}}) - try: - _ = client._merge_pages(pages=[["p1"]], initial=initial) - assert initial["chan"] is cached - finally: - patcher.stop() From 526844073ce8434f067146aee7304a4048b47a8b Mon Sep 17 00:00:00 2001 From: Ian Later Date: Wed, 24 Jun 2026 11:56:01 -0700 Subject: [PATCH 06/14] move lru config to resource. --- python/CHANGELOG.md | 14 ++++--- .../_internal/low_level_wrappers/data.py | 14 +++++++ .../_internal/low_level_wrappers/test_data.py | 36 ++++++++++++---- .../_tests/resources/test_channels.py | 37 +++++++++++++++++ python/lib/sift_client/client.py | 18 +++----- python/lib/sift_client/resources/channels.py | 41 +++++++++++++++---- 6 files changed, 126 insertions(+), 34 deletions(-) diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index e4016eb69..d58cc818b 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -13,17 +13,19 @@ Up to a ~80x speedup for some get_data calls. #### Bounded channel data cache -A new `data_cache_max_bytes` constructor kwarg (default 512 MiB) caps the in-memory channel-data footprint; the least-recently-used cached channel is evicted once the bound is reached. 
Set `data_cache_max_bytes=0` to disable caching entirely. +The in-memory channel data cache used by `client.channels.get_data(...)` is now byte-bounded with LRU eviction (default 512 MiB). Once the bound is reached, the least-recently-used cached channel is evicted. -`ignore_cache=True` on `client.channels.get_data(...)` now also skips writing into the cache, matching its read-side bypass semantics. Previously a "non-caching" workload still appended to the shared cache on every call, which still caused increased memory usage. +Configure the bound on the `channels` resource: ```python -client = SiftClient( - connection_config=config, - data_cache_max_bytes=128 * 1024 * 1024, # 128 MiB cap -) +client.channels.configure_data_cache(max_bytes=128 * 1024 * 1024) # 128 MiB cap +client.channels.configure_data_cache(max_bytes=0) # disable caching ``` +`configure_data_cache` may be called at any time; if the cache is already populated, the new bound is applied immediately and excess entries are evicted. + +`ignore_cache=True` on `client.channels.get_data(...)` now also skips writing into the cache, matching its read-side bypass semantics. Previously a "non-caching" workload still appended to the shared cache on every call, which still caused increased memory usage. + The internal `DataLowLevelClient.channel_cache` is no longer a class attribute. Any external code that relied on `DataLowLevelClient.channel_cache.channels.clear()` as a workaround should remove it — the bounded cache no longer requires manual purging. 
#### Resource and principal attributes (ABAC) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data.py b/python/lib/sift_client/_internal/low_level_wrappers/data.py index 381b6667d..97baf522c 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data.py @@ -83,6 +83,20 @@ def enabled(self) -> bool: def max_bytes(self) -> int: return self._max_bytes + @max_bytes.setter + def max_bytes(self, value: int) -> None: + """Reconfigure the byte cap and immediately evict any excess. + + Used by ``ChannelsAPIAsync.configure_data_cache`` to retune a live + cache. Lowering the cap below ``total_bytes`` triggers LRU eviction + in the same loop ``put`` uses, so the invariant ``total_bytes <= + max_bytes`` is restored before the setter returns. + """ + if value < 0: + raise ValueError(f"data_cache_max_bytes must be >= 0, got {value}") + self._max_bytes = value + self._evict_until_under_bound() + @property def total_bytes(self) -> int: return self._total_bytes diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py index e2b12cecf..1f7f022fb 100644 --- a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py +++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py @@ -256,6 +256,30 @@ def test_negative_max_bytes_raises(self) -> None: with pytest.raises(ValueError, match="data_cache_max_bytes"): ChannelCache(max_bytes=-1) + def test_set_max_bytes_lower_evicts_immediately(self) -> None: + """Lowering ``max_bytes`` below ``total_bytes`` evicts LRU until it fits. + + Used by ``ChannelsAPIAsync.configure_data_cache`` to retune a live + cache without forcing the caller to call ``clear()`` first. 
+ """ + a, b, c = _entry(rows=50), _entry(rows=50), _entry(rows=50) + cache = ChannelCache(max_bytes=a.size_bytes + b.size_bytes + c.size_bytes) + cache.put("a", a) + cache.put("b", b) + cache.put("c", c) + # Lower the cap to fit only one entry; LRU "a" and "b" must drop. + cache.max_bytes = c.size_bytes + assert cache.max_bytes == c.size_bytes + assert "a" not in cache + assert "b" not in cache + assert "c" in cache + assert _invariant_holds(cache) + + def test_set_max_bytes_negative_raises(self) -> None: + cache = ChannelCache(max_bytes=100) + with pytest.raises(ValueError, match="data_cache_max_bytes"): + cache.max_bytes = -1 + def test_repeated_concat_updates_stay_under_bound(self) -> None: """Simulates the customer's sliding-window pull: same channel, growing. @@ -297,9 +321,7 @@ class TestMergePages: frame so fresh pages still win on overlap. """ - @pytest.mark.parametrize( - "pages", [[], [[]]], ids=["no_tasks_queued", "task_returned_empty"] - ) + @pytest.mark.parametrize("pages", [[], [[]]], ids=["no_tasks_queued", "task_returned_empty"]) def test_no_fresh_data_returns_initial(self, pages: list) -> None: """No fresh pages → initial dict passes through by identity.""" client = DataLowLevelClient(MagicMock()) @@ -410,7 +432,9 @@ def test_data_cache_max_bytes_kwarg_propagates(self) -> None: test just verifies the constructor passes the kwarg through. 
""" assert DataLowLevelClient(MagicMock(), data_cache_max_bytes=0).channel_cache.max_bytes == 0 - assert DataLowLevelClient(MagicMock(), data_cache_max_bytes=42).channel_cache.max_bytes == 42 + assert ( + DataLowLevelClient(MagicMock(), data_cache_max_bytes=42).channel_cache.max_bytes == 42 + ) class TestGetChannelData: @@ -510,9 +534,7 @@ async def test_partial_cache_hit_merges_cached_and_fresh(self) -> None: assert new_calls, "c2 should hit the wire on the second call" for call in new_calls: - assert call["channel_ids"] == ["c2"], ( - f"only c2 should hit the wire, saw {call!r}" - ) + assert call["channel_ids"] == ["c2"], f"only c2 should hit the wire, saw {call!r}" assert set(result.keys()) == {"c1", "c2"} pd.testing.assert_frame_equal(result["c1"].sort_index(), c1_df.sort_index()) pd.testing.assert_frame_equal(result["c2"].sort_index(), c2_df.sort_index()) diff --git a/python/lib/sift_client/_tests/resources/test_channels.py b/python/lib/sift_client/_tests/resources/test_channels.py index f337bd3f5..0bc3e1122 100644 --- a/python/lib/sift_client/_tests/resources/test_channels.py +++ b/python/lib/sift_client/_tests/resources/test_channels.py @@ -501,3 +501,40 @@ async def fake_update_channel(update): api._units_low_level_client.create_unit.assert_not_awaited() assert captured["update"].unit == "" + + +class TestConfigureDataCache: + """``configure_data_cache`` is the resource-level knob for the in-memory + channel data cache. Before the cache is initialized, it stashes the value + for the lazy-init path; after, it retunes the live cache. 
+ """ + + def test_before_lazy_init_propagates_to_cache(self): + """Configuring before the first ``get_data`` lands on the cache at init.""" + api = _make_api() + api.configure_data_cache(max_bytes=123) + assert api._data_low_level_client is None # still lazy + api._ensure_data_low_level_client() + assert api._data_low_level_client.channel_cache.max_bytes == 123 + + def test_after_lazy_init_updates_live_cache(self): + """Configuring after first use retunes the live cache in place.""" + api = _make_api() + api._ensure_data_low_level_client() + original_client = api._data_low_level_client + api.configure_data_cache(max_bytes=456) + # Same wrapper instance — we mutated, not replaced. + assert api._data_low_level_client is original_client + assert api._data_low_level_client.channel_cache.max_bytes == 456 + + def test_zero_disables_cache_via_resource(self): + """Resource-level ``max_bytes=0`` end-to-end disables the cache.""" + api = _make_api() + api.configure_data_cache(max_bytes=0) + api._ensure_data_low_level_client() + assert not api._data_low_level_client.channel_cache.enabled + + def test_negative_raises(self): + api = _make_api() + with pytest.raises(ValueError, match="max_bytes"): + api.configure_data_cache(max_bytes=-1) diff --git a/python/lib/sift_client/client.py b/python/lib/sift_client/client.py index 7d20fbe85..6afc36386 100644 --- a/python/lib/sift_client/client.py +++ b/python/lib/sift_client/client.py @@ -136,7 +136,6 @@ def __init__( rest_url: str | None = None, connection_config: SiftConnectionConfig | None = None, app_url: str | None = None, - data_cache_max_bytes: int | None = None, ): """Initialize the SiftClient with specific connection parameters or a connection_config. @@ -149,14 +148,12 @@ def __init__( Set this for on-prem or custom deployments whose API host can't be mapped to a frontend automatically; see the ``app_url`` property. A value here takes precedence over ``connection_config.app_url``. 
- data_cache_max_bytes: Cap on the in-memory channel data cache used - by ``client.channels.get_data`` (bytes). When the bound is - reached, the least-recently-used cached channel is evicted. - Defaults to 512 MiB. Set to ``0`` to disable caching. Must be - ``>= 0``. + + Resource-specific knobs live on the resource itself. For example, + to tune the channel data cache used by ``client.channels.get_data``:: + + client.channels.configure_data_cache(max_bytes=128 * 1024 * 1024) """ - if data_cache_max_bytes is not None and data_cache_max_bytes < 0: - raise ValueError(f"data_cache_max_bytes must be >= 0, got {data_cache_max_bytes}") if not (api_key and grpc_url and rest_url) and not connection_config: raise ValueError( "Either api_key, grpc_url and rest_url or connection_config must be provided to establish a connection." @@ -187,11 +184,6 @@ def __init__( # pytest plugin's ``--sift-disabled`` mode. self._simulate: bool = False - # Read by ``ChannelsAPIAsync._ensure_data_low_level_client`` when it - # lazily constructs the data wrapper. ``None`` means "use the wrapper - # default" so we don't have to import the constant here. 
- self._data_cache_max_bytes: int | None = data_cache_max_bytes - self.ping = PingAPI(self) self.assets = AssetsAPI(self) self.calculated_channels = CalculatedChannelsAPI(self) diff --git a/python/lib/sift_client/resources/channels.py b/python/lib/sift_client/resources/channels.py index 41d478d81..26ff2da65 100644 --- a/python/lib/sift_client/resources/channels.py +++ b/python/lib/sift_client/resources/channels.py @@ -64,6 +64,32 @@ def __init__(self, sift_client: SiftClient): self._low_level_client = ChannelsLowLevelClient(grpc_client=self.client.grpc_client) self._units_low_level_client = UnitsLowLevelClient(grpc_client=self.client.grpc_client) self._data_low_level_client = None + # Caller-supplied cache size; ``None`` means "use the wrapper default + # at lazy-init time" so we don't have to import ``data.py`` (and + # therefore pandas) just to remember the default. + self._data_cache_max_bytes: int | None = None + + def configure_data_cache(self, *, max_bytes: int) -> None: + """Configure the in-memory channel data cache used by ``get_data``. + + Args: + max_bytes: Byte cap on the cache. ``0`` disables caching + (every ``get_data`` call goes to the wire). Defaults to + 512 MiB until explicitly configured. Must be ``>= 0``. + + Safe to call before or after the first ``get_data``. If the cache is + already live, the new cap is applied immediately and least-recently- + used entries are evicted until ``total_bytes`` fits. 
+ + Example: + client.channels.configure_data_cache(max_bytes=128 * 1024 * 1024) + client.channels.configure_data_cache(max_bytes=0) # disable + """ + if max_bytes < 0: + raise ValueError(f"max_bytes must be >= 0, got {max_bytes}") + self._data_cache_max_bytes = max_bytes + if self._data_low_level_client is not None: + self._data_low_level_client.channel_cache.max_bytes = max_bytes async def get( self, @@ -242,17 +268,16 @@ async def unarchive(self, channels: list[str | Channel]) -> None: def _ensure_data_low_level_client(self): """Ensure that the data low level client is initialized. Separated out like this to not require large dependencies (pandas/pyarrow) for the client if not fetching data.""" if self._data_low_level_client is None: - from sift_client._internal.low_level_wrappers.data import ( - DEFAULT_DATA_CACHE_MAX_BYTES, - DataLowLevelClient, - ) + from sift_client._internal.low_level_wrappers.data import DataLowLevelClient - max_bytes = getattr(self.client, "_data_cache_max_bytes", None) + # Pass the kwarg only when explicitly configured so the wrapper's + # own default (currently 512 MiB) remains the single source of truth. 
+ kwargs = {} + if self._data_cache_max_bytes is not None: + kwargs["data_cache_max_bytes"] = self._data_cache_max_bytes self._data_low_level_client = DataLowLevelClient( grpc_client=self.client.grpc_client, - data_cache_max_bytes=( - DEFAULT_DATA_CACHE_MAX_BYTES if max_bytes is None else max_bytes - ), + **kwargs, ) async def get_data( From bd3213e067c130958111937fcc9d5c2845bc5c78 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Wed, 24 Jun 2026 12:39:59 -0700 Subject: [PATCH 07/14] add opt in diskcache support and test coverage --- python/CHANGELOG.md | 26 ++ .../_internal/low_level_wrappers/data.py | 290 ++++++++++++++++-- .../_internal/low_level_wrappers/test_data.py | 219 ++++++++++++- .../_tests/resources/test_channels.py | 87 ++++++ python/lib/sift_client/resources/channels.py | 101 +++++- python/pyproject.toml | 7 + python/uv.lock | 13 +- 7 files changed, 714 insertions(+), 29 deletions(-) diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index d58cc818b..8e61faa1b 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -28,6 +28,32 @@ client.channels.configure_data_cache(max_bytes=0) # disable cac The internal `DataLowLevelClient.channel_cache` is no longer a class attribute. Any external code that relied on `DataLowLevelClient.channel_cache.channels.clear()` as a workaround should remove it — the bounded cache no longer requires manual purging. +#### On-disk channel data cache (opt-in) + +The channel data cache can now optionally persist to disk, surviving process restarts. The disk tier is a second-chance layer beneath the in-memory cache: on a memory miss, `get_data` checks disk before going to the wire. Re-running the same workload in a new session picks up the previously-cached windows for free. + +```python +# Enable disk persistence at the default tmp location. +client.channels.enable_data_cache_disk() + +# Or pick a custom directory and byte cap. 
+client.channels.enable_data_cache_disk(path="/data/sift-cache", max_bytes=2 * 1024 ** 3) + +# Stop persisting (does not delete on-disk data). +client.channels.disable_data_cache_disk() +``` + +To remove a stale cache directory from a previous session: + +```python +client.channels.clear_data_cache_on_disk() # default tmp path +client.channels.clear_data_cache_on_disk("/data/sift-cache") # custom path +``` + +`clear_data_cache_on_disk` refuses to delete directories that don't look like a sift channel data cache (missing the `diskcache` marker), so a typo'd path won't wipe unrelated data. + +The disk tier is powered by [`diskcache`](https://grantjenks.com/docs/diskcache/) (pure-Python, SQLite-backed) and has its own independent byte cap with LRU eviction. The in-memory tier remains the fast path — disk is only consulted on a memory miss. + #### Resource and principal attributes (ABAC) Added a public API for attribute based access control (ABAC) attributes. `client.resource_attributes` manages attribute keys assigned to entities (assets, channels, runs), and `client.principal_attributes` manages attribute keys assigned to principals (users and user groups). Both are available synchronously and asynchronously via `client.async_`. 
diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data.py b/python/lib/sift_client/_internal/low_level_wrappers/data.py index 97baf522c..ae0bbf6e5 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data.py @@ -2,8 +2,12 @@ import asyncio import logging +import os +import shutil +import tempfile from collections import OrderedDict from datetime import datetime, timezone +from pathlib import Path from typing import TYPE_CHECKING, Any, cast import pandas as pd @@ -23,6 +27,8 @@ from sift_client.transport import WithGrpcClient if TYPE_CHECKING: + import diskcache + from sift_client.transport.grpc_transport import GrpcClient # Configure logging @@ -61,22 +67,106 @@ def _new_cache_entry( class ChannelCache: - """LRU-ordered, byte-bounded cache of per-channel DataFrames. + """Two-tier cache of per-channel DataFrames. + + Tier 1: an LRU-ordered, byte-bounded in-memory dict (hot path). ``max_bytes + <= 0`` disables this tier: ``get`` always misses memory, ``put`` doesn't + populate it. - ``max_bytes <= 0`` disables retention: every ``get`` misses, ``put`` returns - without storing. + Tier 2 (optional, see ``enable_disk``): a ``diskcache``-backed write-through + layer that survives process restarts. When enabled, ``put`` writes to both + tiers, ``get`` falls back to disk on a memory miss (promoting the hit back + into memory), and ``invalidate``/``clear`` cascade to disk. The disk tier + has its own byte cap that ``diskcache`` enforces with LRU eviction. + + The two tiers are independent: setting ``max_bytes=0`` keeps the disk layer + active, useful for "cold storage only" workloads. """ - def __init__(self, max_bytes: int = DEFAULT_DATA_CACHE_MAX_BYTES): + #: Default directory for the on-disk tier. Lives under + #: ``tempfile.gettempdir()`` so it survives across sessions of the same + #: user but doesn't pollute the user's home dir. 
The suffix is fixed so + #: multiple processes (different ``SiftClient`` instances, notebooks, etc.) + #: naturally share the same store and can read each other's prior sessions. + DEFAULT_DISK_PATH: str = os.path.join(tempfile.gettempdir(), "sift-channel-data-cache") + + #: Default byte cap for the disk tier when ``enable_disk`` is called + #: without an explicit ``max_bytes``. 4 GiB is a generous ceiling for the + #: typical ``/tmp`` filesystem; ``diskcache`` enforces it with its own + #: SQLite-backed LRU eviction once the bound is reached. + DEFAULT_DISK_MAX_BYTES: int = 4 * 1024 * 1024 * 1024 + + #: Marker file ``diskcache`` writes inside every cache directory. We + #: sanity-check for this before any ``shutil.rmtree`` so a typo in the + #: ``clear_disk`` ``path`` argument can't wipe out an unrelated directory. + _DISKCACHE_MARKER: str = "cache.db" + + def __init__( + self, + max_bytes: int = DEFAULT_DATA_CACHE_MAX_BYTES, + *, + disk_path: str | os.PathLike[str] | None = None, + disk_max_bytes: int | None = None, + ): + """Construct an in-memory cache, optionally backed by disk. + + Args: + max_bytes: Byte cap on the in-memory tier. ``0`` disables it. + disk_path: Directory for the disk tier. ``None`` (the default) + disables disk. A previously-populated directory is reused, + so subsequent sessions can read from existing entries. + disk_max_bytes: Byte cap on the disk tier. ``None`` falls back to + ``DEFAULT_DISK_MAX_BYTES``. Ignored when ``disk_path`` is + ``None``. 
+ """ if max_bytes < 0: raise ValueError(f"data_cache_max_bytes must be >= 0, got {max_bytes}") self.name_id_map: dict[str, str] = {} self._entries: OrderedDict[str, ChannelCacheEntry] = OrderedDict() self._total_bytes: int = 0 self._max_bytes: int = max_bytes + self._disk: diskcache.Cache | None = None + self._disk_path: str | None = None + self._disk_max_bytes: int | None = None + if disk_path is not None: + self._open_disk( + str(disk_path), + disk_max_bytes if disk_max_bytes is not None else self.DEFAULT_DISK_MAX_BYTES, + ) + + @classmethod + def clear_disk(cls, path: str | os.PathLike[str] | None = None) -> None: + """Delete a previously-persisted on-disk cache directory. + + Use this to drop stale caches from previous sessions, recover from a + corrupt cache, or reclaim disk space. The directory is removed + entirely; a future ``enable_disk`` call at the same path will see a + fresh empty cache. + + Args: + path: Directory of the cache to clear. ``None`` (the default) + targets :attr:`DEFAULT_DISK_PATH`. + + Raises: + ValueError: If ``path`` exists but does not look like a sift + channel data cache directory (missing the ``diskcache`` + marker file). This guard makes accidental misuse a hard + error rather than silent data loss. + """ + target = Path(path) if path is not None else Path(cls.DEFAULT_DISK_PATH) + if not target.exists(): + return + if not (target / cls._DISKCACHE_MARKER).exists(): + raise ValueError( + f"{str(target)!r} does not look like a sift channel data cache " + f"directory (missing {cls._DISKCACHE_MARKER!r} marker). " + f"Refusing to delete." + ) + shutil.rmtree(target) @property def enabled(self) -> bool: + """Whether the in-memory tier accepts writes (``max_bytes > 0``).""" return self._max_bytes > 0 @property @@ -85,12 +175,13 @@ def max_bytes(self) -> int: @max_bytes.setter def max_bytes(self, value: int) -> None: - """Reconfigure the byte cap and immediately evict any excess. 
+ """Reconfigure the in-memory byte cap and immediately evict any excess. Used by ``ChannelsAPIAsync.configure_data_cache`` to retune a live cache. Lowering the cap below ``total_bytes`` triggers LRU eviction in the same loop ``put`` uses, so the invariant ``total_bytes <= - max_bytes`` is restored before the setter returns. + max_bytes`` is restored before the setter returns. Does not touch + the disk tier. """ if value < 0: raise ValueError(f"data_cache_max_bytes must be >= 0, got {value}") @@ -101,46 +192,161 @@ def max_bytes(self, value: int) -> None: def total_bytes(self) -> int: return self._total_bytes + @property + def disk_enabled(self) -> bool: + """Whether the disk-backed second-chance tier is currently open.""" + return self._disk is not None + + @property + def disk_path(self) -> str | None: + """Filesystem path of the disk tier when enabled, else ``None``.""" + return self._disk_path + + @property + def disk_max_bytes(self) -> int | None: + """Configured byte cap on the disk tier, or ``None`` when disabled.""" + return self._disk_max_bytes + def __len__(self) -> int: return len(self._entries) def __contains__(self, channel_id: str) -> bool: - return channel_id in self._entries + """True if the channel is cached in memory OR on disk. + + Used by ``_filter_cached_channels`` to decide whether ``get_data`` + needs to hit the wire. Including the disk tier here lets a fresh + session served by a warm disk avoid re-fetching. + """ + if channel_id in self._entries: + return True + if self._disk is not None and channel_id in self._disk: + return True + return False + + def enable_disk( + self, + *, + path: str | os.PathLike[str] | None = None, + max_bytes: int | None = None, + ) -> None: + """Enable (or reconfigure) the disk-backed second-chance tier. + + If a previous disk tier was open at a different path or with a + different size cap, it's closed first. Memory contents are left + intact; they are NOT replayed to disk so disk reflects only future + writes. 
+ + Args: + path: Directory to persist to. ``None`` uses + :attr:`DEFAULT_DISK_PATH`. The directory is created if + missing; an existing one is opened in place and its + contents become available to ``get``. + max_bytes: Byte cap for the disk tier (``None`` → + :attr:`DEFAULT_DISK_MAX_BYTES`). + """ + target_path = str(path) if path is not None else self.DEFAULT_DISK_PATH + target_max = max_bytes if max_bytes is not None else self.DEFAULT_DISK_MAX_BYTES + if ( + self._disk is not None + and self._disk_path == target_path + and self._disk_max_bytes == target_max + ): + return + self._close_disk() + self._open_disk(target_path, target_max) + + def disable_disk(self) -> None: + """Close the disk tier (if open). Does not touch the disk contents. + + Use ``ChannelCache.clear_disk(path)`` to remove a + directory from disk. + """ + self._close_disk() def get(self, channel_id: str) -> ChannelCacheEntry | None: """Return the entry for ``channel_id`` if cached, otherwise None. - Promotes the entry to most-recently-used on hit. + Memory is consulted first; on a miss, the disk tier (if enabled) is + checked. A disk hit is promoted back into memory (subject to the + in-memory cap) so subsequent accesses stay hot. """ entry = self._entries.get(channel_id) if entry is not None: self._entries.move_to_end(channel_id) - return entry + return entry + if self._disk is None: + return None + try: + disk_entry = self._disk.get(channel_id, default=None, retry=True) + except Exception: + # diskcache surfaces ``sqlite3.DatabaseError`` (and friends) for + # corrupt or partially-written entries from a prior session. + # Treat as a miss; force ``invalidate`` to drop the bad row so + # we don't repeatedly trip the same path.
+ logger.warning("disk cache read failed for %s; invalidating", channel_id) + try: + del self._disk[channel_id] + except Exception: + pass + return None + if disk_entry is None or not isinstance(disk_entry, ChannelCacheEntry): + return None + if self.enabled: + # Promote disk hit into memory so subsequent reads are cheap. + self._put_memory(channel_id, disk_entry) + return disk_entry def put(self, channel_id: str, entry: ChannelCacheEntry) -> None: - """Insert or replace ``channel_id``, then evict LRU until within size bounds. + """Insert or replace ``channel_id`` in memory (if enabled) and on disk. - Reclaims any prior entry's byte count BEFORE adding the new one's, so a - re-insert (e.g. concat-merge of fresh data into an existing entry) - accounts for the size delta correctly rather than double-counting. + Memory reclaims any prior entry's byte count BEFORE adding the new + one's, so a re-insert (e.g. concat-merge of fresh data into an + existing entry) accounts for the size delta correctly. Disk writes + replace the prior row. """ - if not self.enabled: - return - prior = self._entries.pop(channel_id, None) - if prior is not None: - self._total_bytes -= prior.size_bytes - self._entries[channel_id] = entry - self._total_bytes += entry.size_bytes - self._evict_until_under_bound() + if self.enabled: + self._put_memory(channel_id, entry) + if self._disk is not None: + try: + self._disk.set(channel_id, entry, retry=True) + except Exception: + # Best-effort persistence: keep going on disk errors so the + # in-memory cache (and the user's ``get_data`` call) still + # succeeds. Drop the (possibly partial) disk row. 
+ logger.warning("disk cache write failed for %s; invalidating", channel_id) + try: + self._disk.delete(channel_id, retry=True) + except Exception: + pass def invalidate(self, channel_id: str) -> None: prior = self._entries.pop(channel_id, None) if prior is not None: self._total_bytes -= prior.size_bytes + if self._disk is not None: + try: + self._disk.delete(channel_id, retry=True) + except Exception: + pass def clear(self) -> None: self._entries.clear() self._total_bytes = 0 + if self._disk is not None: + self._disk.clear() + + def close(self) -> None: + """Release the disk-tier file handle. Safe to call without disk enabled.""" + self._close_disk() + + def _put_memory(self, channel_id: str, entry: ChannelCacheEntry) -> None: + """Memory-tier insert + eviction. Caller has already gated on ``enabled``.""" + prior = self._entries.pop(channel_id, None) + if prior is not None: + self._total_bytes -= prior.size_bytes + self._entries[channel_id] = entry + self._total_bytes += entry.size_bytes + self._evict_until_under_bound() def _evict_until_under_bound(self) -> None: # ``popitem(last=False)`` drops the oldest entry. A single fresh entry @@ -150,6 +356,33 @@ def _evict_until_under_bound(self) -> None: _, dropped = self._entries.popitem(last=False) self._total_bytes -= dropped.size_bytes + def _open_disk(self, path: str, max_bytes: int) -> None: + import diskcache + + os.makedirs(path, exist_ok=True) + # ``least-recently-used`` matches the in-memory tier's eviction policy; + # statistics/tag_index are off because we only need plain k/v reads. 
+ self._disk = diskcache.Cache( + directory=path, + size_limit=max_bytes, + eviction_policy="least-recently-used", + statistics=0, + tag_index=0, + ) + self._disk_path = path + self._disk_max_bytes = max_bytes + + def _close_disk(self) -> None: + if self._disk is None: + return + try: + self._disk.close() + except Exception: + pass + self._disk = None + self._disk_path = None + self._disk_max_bytes = None + class DataLowLevelClient(LowLevelClientBase, WithGrpcClient): """Low-level client for fetching channel data. @@ -162,16 +395,27 @@ def __init__( grpc_client: GrpcClient, *, data_cache_max_bytes: int = DEFAULT_DATA_CACHE_MAX_BYTES, + disk_cache_path: str | os.PathLike[str] | None = None, + disk_cache_max_bytes: int | None = None, ): """Initialize the DataLowLevelClient. Args: grpc_client: The gRPC client to use for making API calls. data_cache_max_bytes: Cap on the in-memory channel-data cache (bytes). - Set to ``0`` to disable caching. See ``ChannelCache``. + Set to ``0`` to disable in-memory caching. See ``ChannelCache``. + disk_cache_path: Directory for the disk-backed second-chance tier. + ``None`` disables disk persistence. See ``ChannelCache``. + disk_cache_max_bytes: Byte cap for the disk tier. ``None`` uses + ``ChannelCache.DEFAULT_DISK_MAX_BYTES``. Ignored when + ``disk_cache_path`` is ``None``.
""" super().__init__(grpc_client) - self.channel_cache = ChannelCache(max_bytes=data_cache_max_bytes) + self.channel_cache = ChannelCache( + max_bytes=data_cache_max_bytes, + disk_path=disk_cache_path, + disk_max_bytes=disk_cache_max_bytes, + ) def _update_name_id_map(self, channels: list[Channel]): """Update the name id map with the new channels.""" diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py index 1f7f022fb..16c99a6dd 100644 --- a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py +++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py @@ -1,9 +1,13 @@ """Tests for :mod:`sift_client._internal.low_level_wrappers.data`. -Four classes, narrowest scope first: +Five classes, narrowest scope first: * :class:`TestChannelCache` — pure ``ChannelCache`` unit tests (byte accounting, LRU promotion, eviction). +* :class:`TestChannelCacheDisk` — disk-backed second-chance tier + (fresh open, cross-session reload, fall-through reads, disable). +* :class:`TestChannelCacheClearDisk` — ``ChannelCache.clear_disk`` + classmethod (default path, custom path, safety guard). * :class:`TestMergePages` — ``DataLowLevelClient._merge_pages``, the per-channel concat helper. * :class:`TestDataLowLevelClient` — constructor wiring and per-instance @@ -307,6 +311,219 @@ def test_repeated_concat_updates_stay_under_bound(self) -> None: assert _invariant_holds(cache) +class TestChannelCacheDisk: + """Disk-backed second-chance tier of :class:`ChannelCache`. + + Three things must hold across these tests: + + 1. A fresh disk directory starts empty and accepts new writes. + 2. Closing a populated cache and reopening at the same path surfaces + the previous entries on read (the "previous session" requirement). + 3. 
The two tiers stay consistent across ``invalidate``/``clear`` and + ``disable_disk``, so the disk tier never becomes a stale shadow of + memory. + + All tests confine writes to ``tmp_path`` so nothing leaks into the real + ``/tmp/sift-channel-data-cache``. + """ + + def test_disabled_by_default(self) -> None: + """No ``disk_path`` → disk tier stays off and untouched.""" + cache = ChannelCache(max_bytes=10_000_000) + assert cache.disk_enabled is False + assert cache.disk_path is None + assert cache.disk_max_bytes is None + + def test_fresh_cache_writes_and_reads(self, tmp_path) -> None: + """A fresh disk directory accepts writes and serves them back.""" + path = tmp_path / "fresh" + cache = ChannelCache(max_bytes=10_000_000, disk_path=path) + try: + assert cache.disk_enabled + assert cache.disk_path == str(path) + assert cache.disk_max_bytes == ChannelCache.DEFAULT_DISK_MAX_BYTES + entry = _entry(rows=8) + cache.put("chan-1", entry) + # Same instance: memory hit takes precedence; disk is just a copy. + assert "chan-1" in cache + got = cache.get("chan-1") + assert got is not None + pd.testing.assert_frame_equal(got.data, entry.data) + finally: + cache.close() + + def test_reopen_existing_dir_sees_prior_session_entries(self, tmp_path) -> None: + """Closing then reopening at the same path makes prior entries hit. + + This is the "look for existing caches from previous sessions" + guarantee: a new ``ChannelCache`` with an empty in-memory tier + finds entries on disk and promotes them into memory on first read. + """ + path = tmp_path / "prev-session" + df = _frame("chan-1", rows=12, freq="s") + original_entry = _new_cache_entry( + data=df, + start_time=df.index[0].to_pydatetime(), + end_time=df.index[-1].to_pydatetime(), + ) + # Session 1: populate and close. + session1 = ChannelCache(max_bytes=10_000_000, disk_path=path) + session1.put("chan-1", original_entry) + session1.close() + + # Session 2: fresh process simulated by a brand-new ChannelCache. 
+ # Memory starts empty, but ``__contains__`` reports the entry from + # disk and ``get`` returns it with bytes intact. + session2 = ChannelCache(max_bytes=10_000_000, disk_path=path) + try: + assert len(session2) == 0 # in-memory tier starts cold + assert "chan-1" in session2 # disk-backed contains + got = session2.get("chan-1") + assert got is not None + pd.testing.assert_frame_equal(got.data, original_entry.data) + assert got.start_time == original_entry.start_time + assert got.end_time == original_entry.end_time + # After the disk hit, the entry is now promoted into memory. + assert len(session2) == 1 + finally: + session2.close() + + def test_disk_hit_promotes_into_memory(self, tmp_path) -> None: + """A disk-only entry becomes a memory entry after one ``get``.""" + cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "promote") + try: + cache.put("chan-1", _entry(rows=4)) + # Drop from memory only (simulate eviction). + del cache._entries["chan-1"] + cache._total_bytes = 0 + assert "chan-1" in cache # still on disk + assert cache.get("chan-1") is not None + assert "chan-1" in cache._entries # promoted back into memory + finally: + cache.close() + + def test_disk_only_when_memory_disabled(self, tmp_path) -> None: + """``max_bytes=0`` (no memory) still routes writes/reads through disk. + + Cold-storage configuration: caller wants persistence without + paying the in-memory footprint. 
+ """ + cache = ChannelCache(max_bytes=0, disk_path=tmp_path / "disk-only") + try: + assert not cache.enabled + assert cache.disk_enabled + cache.put("chan-1", _entry(rows=4)) + assert "chan-1" not in cache._entries # never landed in memory + got = cache.get("chan-1") + assert got is not None + assert "chan-1" not in cache._entries # memory still bypassed + finally: + cache.close() + + def test_invalidate_clears_both_tiers(self, tmp_path) -> None: + cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "inval") + try: + cache.put("chan-1", _entry(rows=4)) + cache.invalidate("chan-1") + assert "chan-1" not in cache._entries + assert "chan-1" not in cache # contains() must check disk too + finally: + cache.close() + + def test_clear_wipes_both_tiers(self, tmp_path) -> None: + cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "clear") + try: + cache.put("chan-1", _entry(rows=4)) + cache.put("chan-2", _entry(rows=4)) + cache.clear() + assert len(cache) == 0 + assert "chan-1" not in cache + assert "chan-2" not in cache + finally: + cache.close() + + def test_disable_disk_preserves_memory(self, tmp_path) -> None: + """Turning off disk closes the handle but keeps memory intact.""" + cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "disable") + try: + cache.put("chan-1", _entry(rows=4)) + cache.disable_disk() + assert not cache.disk_enabled + assert cache.disk_path is None + # Memory entry survives the disk-tier teardown. + assert "chan-1" in cache + assert cache.get("chan-1") is not None + finally: + cache.close() + + def test_enable_disk_reconfigures_path(self, tmp_path) -> None: + """Reconfiguring to a different path closes the old handle.""" + cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "a") + try: + cache.put("chan-1", _entry(rows=4)) + cache.enable_disk(path=tmp_path / "b") + assert cache.disk_path == str(tmp_path / "b") + # The new disk dir is fresh: nothing on disk yet under the new path. 
+ # ``chan-1`` is still in memory, so __contains__ is still True. + assert "chan-1" in cache + # But the new disk dir is empty; drop from memory and the + # contains check now relies on disk, which won't find it. + del cache._entries["chan-1"] + cache._total_bytes = 0 + assert "chan-1" not in cache + finally: + cache.close() + + def test_enable_disk_noop_when_same_settings(self, tmp_path) -> None: + """Re-enabling with identical settings doesn't churn the disk handle.""" + cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "noop") + try: + handle_before = cache._disk + cache.enable_disk(path=tmp_path / "noop", max_bytes=ChannelCache.DEFAULT_DISK_MAX_BYTES) + # Same handle, no reopen. + assert cache._disk is handle_before + finally: + cache.close() + + +class TestChannelCacheClearDisk: + """``ChannelCache.clear_disk`` removes a cache dir, refuses other dirs. + + The classmethod is the source of truth that the resource-level + ``ChannelsAPIAsync.clear_data_cache_on_disk`` proxies through, so it + must be defensive against pointing at the wrong directory. + """ + + def test_clear_removes_directory(self, tmp_path) -> None: + path = tmp_path / "victim" + cache = ChannelCache(max_bytes=10_000_000, disk_path=path) + cache.put("chan-1", _entry(rows=4)) + cache.close() + assert path.exists() + ChannelCache.clear_disk(path) + assert not path.exists() + + def test_clear_missing_path_is_noop(self, tmp_path) -> None: + ChannelCache.clear_disk(tmp_path / "never-existed") # no raise + + def test_clear_refuses_non_diskcache_directory(self, tmp_path) -> None: + """A typo'd path with unrelated contents must not be wiped.""" + target = tmp_path / "user-stuff" + target.mkdir() + (target / "important.txt").write_text("don't delete me") + with pytest.raises(ValueError, match="does not look like a sift channel data cache"): + ChannelCache.clear_disk(target) + # Unrelated contents preserved. 
+ assert (target / "important.txt").read_text() == "don't delete me" + + def test_default_path_constant_under_tmp(self) -> None: + """Default lives under the OS tmp dir, not a user directory.""" + import tempfile + + assert ChannelCache.DEFAULT_DISK_PATH.startswith(tempfile.gettempdir()) + assert ChannelCache.DEFAULT_DISK_PATH.endswith("sift-channel-data-cache") + + class TestMergePages: """Behaviour of :meth:`DataLowLevelClient._merge_pages`. diff --git a/python/lib/sift_client/_tests/resources/test_channels.py b/python/lib/sift_client/_tests/resources/test_channels.py index 0bc3e1122..e3d29ab73 100644 --- a/python/lib/sift_client/_tests/resources/test_channels.py +++ b/python/lib/sift_client/_tests/resources/test_channels.py @@ -538,3 +538,90 @@ def test_negative_raises(self): api = _make_api() with pytest.raises(ValueError, match="max_bytes"): api.configure_data_cache(max_bytes=-1) + + +class TestEnableDataCacheDisk: + """``enable_data_cache_disk`` / ``disable_data_cache_disk`` plumb the disk + tier setting to the underlying ``ChannelCache``, both pre- and post-init. + + The disk tier itself is exercised directly in + ``test_data.py::TestChannelCacheDisk``; the tests here just verify the + resource-level wiring around it. 
+ """ + + def test_disabled_by_default(self): + api = _make_api() + api._ensure_data_low_level_client() + assert not api._data_low_level_client.channel_cache.disk_enabled + + def test_enable_before_lazy_init_propagates(self, tmp_path): + api = _make_api() + api.enable_data_cache_disk(path=str(tmp_path / "pre-init"), max_bytes=4096) + api._ensure_data_low_level_client() + cache = api._data_low_level_client.channel_cache + try: + assert cache.disk_enabled + assert cache.disk_path == str(tmp_path / "pre-init") + assert cache.disk_max_bytes == 4096 + finally: + cache.close() + + def test_enable_after_lazy_init_updates_live_cache(self, tmp_path): + api = _make_api() + api._ensure_data_low_level_client() + cache = api._data_low_level_client.channel_cache + try: + assert not cache.disk_enabled + api.enable_data_cache_disk(path=str(tmp_path / "post-init")) + assert cache.disk_enabled + assert cache.disk_path == str(tmp_path / "post-init") + finally: + cache.close() + + def test_enable_with_default_path_lands_on_default(self, monkeypatch, tmp_path): + """Calling ``enable_data_cache_disk()`` with no args uses the default path. + + Redirects ``ChannelCache.DEFAULT_DISK_PATH`` to ``tmp_path`` so the + test doesn't create the real ``/tmp/sift-channel-data-cache`` + directory. 
+ """ + from sift_client._internal.low_level_wrappers.data import ChannelCache + + fake_default = str(tmp_path / "fake-default") + monkeypatch.setattr(ChannelCache, "DEFAULT_DISK_PATH", fake_default) + + api = _make_api() + api.enable_data_cache_disk() + api._ensure_data_low_level_client() + cache = api._data_low_level_client.channel_cache + try: + assert cache.disk_path == fake_default + finally: + cache.close() + + def test_disable_closes_live_disk_handle(self, tmp_path): + api = _make_api() + api.enable_data_cache_disk(path=str(tmp_path / "to-close")) + api._ensure_data_low_level_client() + cache = api._data_low_level_client.channel_cache + try: + assert cache.disk_enabled + api.disable_data_cache_disk() + assert not cache.disk_enabled + assert cache.disk_path is None + finally: + cache.close() + + def test_clear_data_cache_on_disk_proxies_to_cache(self, tmp_path): + """The resource method removes the directory by proxying to ChannelCache.""" + from sift_client._internal.low_level_wrappers.data import ChannelCache + + path = tmp_path / "to-clear" + # Populate a real disk-cache directory so the marker check passes. + cache = ChannelCache(max_bytes=10_000_000, disk_path=path) + cache.close() + assert path.exists() + + api = _make_api() + api.clear_data_cache_on_disk(path) + assert not path.exists() diff --git a/python/lib/sift_client/resources/channels.py b/python/lib/sift_client/resources/channels.py index 26ff2da65..91322a65c 100644 --- a/python/lib/sift_client/resources/channels.py +++ b/python/lib/sift_client/resources/channels.py @@ -11,6 +11,7 @@ from sift_client.util import cel_utils as cel if TYPE_CHECKING: + import os import re from datetime import datetime @@ -68,6 +69,13 @@ def __init__(self, sift_client: SiftClient): # at lazy-init time" so we don't have to import ``data.py`` (and # therefore pandas) just to remember the default. 
self._data_cache_max_bytes: int | None = None + # Disk-tier configuration, stashed until lazy init (or applied + # immediately if the wrapper is already constructed). All three + # remain ``None`` / ``False`` when the disk tier is disabled, which + # is the default — disk persistence is opt-in. + self._disk_cache_enabled: bool = False + self._disk_cache_path: str | None = None + self._disk_cache_max_bytes: int | None = None def configure_data_cache(self, *, max_bytes: int) -> None: """Configure the in-memory channel data cache used by ``get_data``. @@ -91,6 +99,81 @@ def configure_data_cache(self, *, max_bytes: int) -> None: if self._data_low_level_client is not None: self._data_low_level_client.channel_cache.max_bytes = max_bytes + def enable_data_cache_disk( + self, + *, + path: str | os.PathLike[str] | None = None, + max_bytes: int | None = None, + ) -> None: + """Persist the channel data cache to disk, surviving process restarts. + + The disk-backed tier is a second-chance layer beneath the in-memory + cache: on a memory miss, ``get_data`` checks disk before going to the + wire. The default path lives under ``tempfile.gettempdir()`` and is + shared across sessions, so a re-run of the same workload picks up + previously-cached windows without a fetch. + + Safe to call before or after the first ``get_data``. Reconfiguring + (different ``path`` or ``max_bytes``) closes the previous disk handle + and opens a new one; in-memory contents are preserved across the swap. + + Args: + path: Directory to persist the cache to. ``None`` (the default) + uses ``ChannelCache.DEFAULT_DISK_PATH``. Existing entries at the path + become available as cache hits. + max_bytes: Byte cap on the disk tier. ``None`` uses + ``ChannelCache.DEFAULT_DISK_MAX_BYTES`` (4 GiB). When the bound is + reached, ``diskcache``'s LRU eviction takes over.
+ + Example: + client.channels.enable_data_cache_disk() + client.channels.enable_data_cache_disk(path="/data/sift-cache") + client.channels.enable_data_cache_disk(max_bytes=1024 ** 3) # 1 GiB + """ + self._disk_cache_enabled = True + self._disk_cache_path = str(path) if path is not None else None + self._disk_cache_max_bytes = max_bytes + if self._data_low_level_client is not None: + self._data_low_level_client.channel_cache.enable_disk(path=path, max_bytes=max_bytes) + + def disable_data_cache_disk(self) -> None: + """Stop persisting the channel data cache to disk. + + Closes the disk-cache file handle. The on-disk directory is NOT + deleted — use :meth:`clear_data_cache_on_disk` to wipe it. In-memory + entries are preserved. + """ + self._disk_cache_enabled = False + self._disk_cache_path = None + self._disk_cache_max_bytes = None + if self._data_low_level_client is not None: + self._data_low_level_client.channel_cache.disable_disk() + + def clear_data_cache_on_disk(self, path: str | os.PathLike[str] | None = None) -> None: + """Delete a previously-persisted on-disk channel data cache directory. + + Drops stale caches from previous sessions, recovers from a corrupt + cache, or reclaims disk space. Removes the directory entirely; a + future :meth:`enable_data_cache_disk` call at the same path will see + a fresh empty cache. + + This is a thin proxy around + :meth:`ChannelCache.clear_disk <sift_client._internal.low_level_wrappers.data.ChannelCache.clear_disk>` + — exposed on the resource so callers don't need to reach into + ``_internal`` modules. Because ``clear_disk`` is a classmethod, it can also be called directly without a client. + + Args: + path: Directory of the cache to clear. ``None`` (the default) + targets ``ChannelCache.DEFAULT_DISK_PATH``. + + Raises: + ValueError: If ``path`` exists but does not look like a sift + channel data cache directory.
+ """ + from sift_client._internal.low_level_wrappers.data import ChannelCache + + ChannelCache.clear_disk(path) + async def get( self, *, @@ -268,13 +351,23 @@ async def unarchive(self, channels: list[str | Channel]) -> None: def _ensure_data_low_level_client(self): """Ensure that the data low level client is initialized. Separated out like this to not require large dependencies (pandas/pyarrow) for the client if not fetching data.""" if self._data_low_level_client is None: - from sift_client._internal.low_level_wrappers.data import DataLowLevelClient + from sift_client._internal.low_level_wrappers.data import ( + ChannelCache, + DataLowLevelClient, + ) - # Pass the kwarg only when explicitly configured so the wrapper's - # own default (currently 512 MiB) remains the single source of truth. - kwargs = {} + # Pass each kwarg only when explicitly configured so the wrapper's + # own defaults remain the single source of truth. + kwargs: dict = {} if self._data_cache_max_bytes is not None: kwargs["data_cache_max_bytes"] = self._data_cache_max_bytes + if self._disk_cache_enabled: + # ``disk_path=None`` means "disabled" to ChannelCache; substitute + # the default explicitly so an explicit ``enable_data_cache_disk()`` + # without a path still opens the disk tier. + kwargs["disk_cache_path"] = self._disk_cache_path or ChannelCache.DEFAULT_DISK_PATH + if self._disk_cache_max_bytes is not None: + kwargs["disk_cache_max_bytes"] = self._disk_cache_max_bytes self._data_low_level_client = DataLowLevelClient( grpc_client=self.client.grpc_client, **kwargs, diff --git a/python/pyproject.toml b/python/pyproject.toml index b435022e7..b12c29cb0 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "requests~=2.25", "requests-toolbelt~=1.0", "alive-progress~=3.0", + "diskcache~=5.6", # May move these to optional dependencies in the future. 
"pandas-stubs>=2.0,<4.0", "types-PyYAML~=6.0", @@ -350,6 +351,12 @@ ignore_errors = true [[tool.mypy.overrides]] module = "nptdms" ignore_missing_imports = true + +# diskcache ships without inline type hints or PEP 561 marker. Used by the +# channel data cache's optional on-disk tier. +[[tool.mypy.overrides]] +module = "diskcache" +ignore_missing_imports = true ignore_errors = true # alive-progress 3.3.0 ships py.typed but its `alive_it` signature is too diff --git a/python/uv.lock b/python/uv.lock index d152551a9..43c24b552 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.8" resolution-markers = [ "python_full_version >= '3.8.2' and python_full_version < '3.9'", @@ -638,6 +638,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/c3/253a89ee03fc9b9682f1541728eb66db7db22148cd94f89ab22528cd1e1b/deprecation-2.1.0-py2.py3-none-any.whl", hash = "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a", size = 11178, upload-time = "2020-04-20T14:23:36.581Z" }, ] +[[package]] +name = "diskcache" +version = "5.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3f/21/1c1ffc1a039ddcc459db43cc108658f32c57d271d7289a2794e401d0fdb6/diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc", size = 67916, upload-time = "2023-08-31T06:12:00.316Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550, upload-time = "2023-08-31T06:11:58.822Z" }, +] + [[package]] name = "eval-type-backport" version = "0.3.1" @@ -4334,6 +4343,7 @@ source = { editable = "." 
} dependencies = [ { name = "alive-progress", version = "3.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, { name = "alive-progress", version = "3.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "diskcache" }, { name = "eval-type-backport" }, { name = "filelock", version = "3.16.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, { name = "filelock", version = "3.19.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, @@ -4562,6 +4572,7 @@ requires-dist = [ { name = "cffi", marker = "extra == 'dev-all'", specifier = "~=1.14" }, { name = "cffi", marker = "extra == 'docs-build'", specifier = "~=1.14" }, { name = "cffi", marker = "extra == 'openssl'", specifier = "~=1.14" }, + { name = "diskcache", specifier = "~=5.6" }, { name = "eval-type-backport", specifier = "~=0.2" }, { name = "filelock", specifier = "~=3.15" }, { name = "googleapis-common-protos", specifier = ">=1.60" }, From 4737ae30c37f7ea24b03f546144a3d72522ca9f5 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Wed, 24 Jun 2026 14:36:35 -0700 Subject: [PATCH 08/14] stubs --- .../resources/sync_stubs/__init__.pyi | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index c37c3aed3..704e3b8c0 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -5,6 +5,7 @@ from __future__ import annotations from typing import TYPE_CHECKING if TYPE_CHECKING: + import os import re from datetime import datetime, timedelta from pathlib import Path @@ -452,6 +453,86 @@ class ChannelsAPI: """ ... 
+ def clear_data_cache_on_disk(self, path: str | os.PathLike[str] | None = None) -> None: + """Delete a previously-persisted on-disk channel data cache directory. + + Drops stale caches from previous sessions, recovers from a corrupt + cache, or reclaims disk space. Removes the directory entirely; a + future :meth:`enable_data_cache_disk` call at the same path will see + a fresh empty cache. + + This is a thin proxy around + :meth:`ChannelCache.clear_disk ` + — exposed on the resource so callers don't need to reach into + ``_internal`` modules. But that is a class method so the user could call without a client if desired. + + Args: + path: Directory of the cache to clear. ``None`` (the default) + targets ``ChannelCache.DEFAULT_DISK_PATH``. + + Raises: + ValueError: If ``path`` exists but does not look like a sift + channel data cache directory. + """ + ... + + def configure_data_cache(self, *, max_bytes: int) -> None: + """Configure the in-memory channel data cache used by ``get_data``. + + Args: + max_bytes: Byte cap on the cache. ``0`` disables caching + (every ``get_data`` call goes to the wire). Defaults to + 512 MiB until explicitly configured. Must be ``>= 0``. + + Safe to call before or after the first ``get_data``. If the cache is + already live, the new cap is applied immediately and least-recently- + used entries are evicted until ``total_bytes`` fits. + + Example: + client.channels.configure_data_cache(max_bytes=128 * 1024 * 1024) + client.channels.configure_data_cache(max_bytes=0) # disable + """ + ... + + def disable_data_cache_disk(self) -> None: + """Stop persisting the channel data cache to disk. + + Closes the disk-cache file handle. The on-disk directory is NOT + deleted — use :meth:`clear_data_cache_on_disk` to wipe it. In-memory + entries are preserved. + """ + ... 
+ + def enable_data_cache_disk( + self, *, path: str | os.PathLike[str] | None = None, max_bytes: int | None = None + ) -> None: + """Persist the channel data cache to disk, surviving process restarts. + + The disk-backed tier is a second-chance layer beneath the in-memory + cache: on a memory miss, ``get_data`` checks disk before going to the + wire. The default path lives under ``tempfile.gettempdir()`` and is + shared across sessions, so a re-run of the same workload picks up + previously-cached windows without a fetch. + + Safe to call before or after the first ``get_data``. Reconfiguring + (different ``path`` or ``max_bytes``) closes the previous disk handle + and opens a new one; in-memory contents are preserved across the swap. + + Args: + path: Directory to persist the cache to. ``None`` (the default) + uses ``DEFAULT_DISK_CACHE_PATH``. Existing entries at the path + become available as cache hits. + max_bytes: Byte cap on the disk tier. ``None`` uses + ``DEFAULT_DISK_CACHE_MAX_BYTES`` (4 GiB). When the bound is + reached, ``diskcache``'s LRU eviction takes over. + + Example: + client.channels.enable_data_cache_disk() + client.channels.enable_data_cache_disk(path="/data/sift-cache") + client.channels.enable_data_cache_disk(max_bytes=1024 ** 3) # 1 GiB + """ + ... + def find(self, **kwargs) -> Channel | None: """Find a single channel matching the given query. Takes the same arguments as `list`. If more than one channel is found, raises an error. From cf46f32b9983d860f3f987945386a3fac939a574 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 25 Jun 2026 13:01:32 -0700 Subject: [PATCH 09/14] Move disk cache out of resource. 
--- python/CHANGELOG.md | 24 ++-- .../_internal/disk_cache_config.py | 87 ++++++++++++++ .../_internal/low_level_wrappers/test_data.py | 19 ++- .../_internal/test_disk_cache_config.py | 112 ++++++++++++++++++ python/lib/sift_client/_tests/conftest.py | 27 +++++ .../_tests/resources/test_channels.py | 88 ++++++++++++-- python/lib/sift_client/client.py | 4 - python/lib/sift_client/resources/channels.py | 106 +++++++++++------ .../resources/sync_stubs/__init__.pyi | 40 ++++--- python/pyproject.toml | 8 ++ python/uv.lock | 2 +- 11 files changed, 435 insertions(+), 82 deletions(-) create mode 100644 python/lib/sift_client/_internal/disk_cache_config.py create mode 100644 python/lib/sift_client/_tests/_internal/test_disk_cache_config.py diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index 8e61faa1b..e33995a8e 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -26,31 +26,27 @@ client.channels.configure_data_cache(max_bytes=0) # disable cac `ignore_cache=True` on `client.channels.get_data(...)` now also skips writing into the cache, matching its read-side bypass semantics. Previously a "non-caching" workload still appended to the shared cache on every call, which still caused increased memory usage. -The internal `DataLowLevelClient.channel_cache` is no longer a class attribute. Any external code that relied on `DataLowLevelClient.channel_cache.channels.clear()` as a workaround should remove it — the bounded cache no longer requires manual purging. +#### On-disk channel data cache (opt-out, on by default) -#### On-disk channel data cache (opt-in) +The channel data cache now persists to disk by default, surviving process restarts. The disk tier is a second-chance layer beneath the in-memory cache: on a memory miss, `get_data` checks disk before going to the wire. Re-running the same workload in a new session picks up the previously-cached windows for free — no configuration required. 
+ -The channel data cache can now optionally persist to disk, surviving process restarts. The disk tier is a second-chance layer beneath the in-memory cache: on a memory miss, `get_data` checks disk before going to the wire. Re-running the same workload in a new session picks up the previously-cached windows for free. +The default location is the `sift-channel-data-cache` directory under `tempfile.gettempdir()`, capped at 4 GiB with LRU eviction. If the default path can't be opened (read-only filesystem, restricted container, etc.), the client logs a warning and falls back to the in-memory cache only — `get_data` continues to work. -```python -# Enable disk persistence at the default tmp location. -client.channels.enable_data_cache_disk() - -# Or pick a custom directory and byte cap. -client.channels.enable_data_cache_disk(path="/data/sift-cache", max_bytes=2 * 1024 ** 3) +Opt out, reconfigure, or wipe the on-disk cache from the `channels` resource: -# Stop persisting (does not delete on-disk data). +```python +# Opt out — no data persisted to disk. client.channels.disable_data_cache_disk() -``` -To remove a stale cache directory from a previous session: +# Reconfigure the location or byte cap. +client.channels.enable_data_cache_disk(path="/data/sift-cache", max_bytes=2 * 1024 ** 3) -```python +# Remove a stale or corrupted cache directory. client.channels.clear_data_cache_on_disk() # default tmp path client.channels.clear_data_cache_on_disk("/data/sift-cache") # custom path ``` -`clear_data_cache_on_disk` refuses to delete directories that don't look like a sift channel data cache (missing the `diskcache` marker), so a typo'd path won't wipe unrelated data. +`enable_data_cache_disk` is also the way to turn the tier back on after a prior `disable_data_cache_disk` call. The disk tier is powered by [`diskcache`](https://grantjenks.com/docs/diskcache/) (pure-Python, SQLite-backed) and has its own independent byte cap with LRU eviction. 
The in-memory tier remains the fast path — disk is only consulted on a memory miss. diff --git a/python/lib/sift_client/_internal/disk_cache_config.py b/python/lib/sift_client/_internal/disk_cache_config.py new file mode 100644 index 000000000..c49eaf442 --- /dev/null +++ b/python/lib/sift_client/_internal/disk_cache_config.py @@ -0,0 +1,87 @@ +"""User-expressed configuration for a resource's optional disk-cache tier.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import os + + +class DiskCacheConfig: + """Holds a resource's disk-cache enable/path/max-bytes intent. + + Resources own one instance, mutate it via :meth:`enable` / :meth:`disable` + in response to user calls, and read the properties at lazy-init time to + decide what kwargs to forward to their cache-aware wrapper. + + The :attr:`using_default_path` property is the key invariant for the + silent-fallback-vs-loud-raise distinction in resource lazy-init code: + if the user picked a specific path and opening fails, the failure + surfaces; if the user left the default and opening fails, the resource + falls back to memory-only without disrupting the call. + + Args: + enabled: Initial enabled state. Pass ``True`` for opt-out (the disk + tier is on by default and users call ``disable`` to turn it off); + pass ``False`` for opt-in (users call ``enable`` to turn it on). 
+ """ + + def __init__(self, *, enabled: bool = True) -> None: + self._enabled = enabled + self._path: str | None = None + self._max_bytes: int | None = None + + @property + def enabled(self) -> bool: + """Whether the disk tier should be opened on the next lazy init.""" + return self._enabled + + @property + def path(self) -> str | None: + """User-supplied disk-cache path, or ``None`` to defer to the cache's default.""" + return self._path + + @property + def max_bytes(self) -> int | None: + """User-supplied disk-cache byte cap, or ``None`` to defer to the cache's default.""" + return self._max_bytes + + @property + def using_default_path(self) -> bool: + """``True`` when the disk tier is enabled *and* the path is the cache's default. + + Resources use this to decide whether to silently fall back to memory + on a disk-open failure (default path: the user didn't ask for it + specifically, so degrade gracefully) or to re-raise (explicit path: + the user asked for it, so failure must surface). + """ + return self._enabled and self._path is None + + def enable( + self, + *, + path: str | os.PathLike[str] | None = None, + max_bytes: int | None = None, + ) -> None: + """Mark the disk tier as enabled, optionally with a custom path or byte cap. + + Args: + path: Directory to persist to. ``None`` leaves the cache's + default in effect. + max_bytes: Byte cap on the disk tier. ``None`` leaves the + cache's default in effect. + """ + self._enabled = True + self._path = str(path) if path is not None else None + self._max_bytes = max_bytes + + def disable(self) -> None: + """Mark the disk tier as disabled and clear any custom path / byte cap. + + Subsequent :meth:`enable` calls re-enable at the cache's defaults + unless overrides are supplied. 
+ """ + self._enabled = False + self._path = None + self._max_bytes = None diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py index 16c99a6dd..0ace402d2 100644 --- a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py +++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py @@ -42,6 +42,13 @@ _NOW = datetime(2025, 1, 1, tzinfo=timezone.utc) _WINDOW_END = _NOW + timedelta(days=1) +# Snapshot of the real ``DEFAULT_DISK_PATH`` constant captured at module import. +# The autouse ``_isolate_default_disk_cache_path`` fixture in ``conftest.py`` +# overrides the class attribute on every test for isolation; the +# ``TestChannelCacheClearDisk::test_default_path_constant_under_tmp`` test still +# needs to see the production value to verify its shape. +_PRODUCTION_DEFAULT_DISK_PATH = ChannelCache.DEFAULT_DISK_PATH + # ---------- shared helpers ----------- @@ -517,11 +524,17 @@ def test_clear_refuses_non_diskcache_directory(self, tmp_path) -> None: assert (target / "important.txt").read_text() == "don't delete me" def test_default_path_constant_under_tmp(self) -> None: - """Default lives under the OS tmp dir, not a user directory.""" + """Default lives under the OS tmp dir, not a user directory. + + Reads the module-level snapshot captured at import time rather than + ``ChannelCache.DEFAULT_DISK_PATH`` directly, because the autouse + ``_isolate_default_disk_cache_path`` fixture monkeypatches that + attribute for every test to keep ``/tmp`` clean. 
+ """ import tempfile - assert ChannelCache.DEFAULT_DISK_PATH.startswith(tempfile.gettempdir()) - assert ChannelCache.DEFAULT_DISK_PATH.endswith("sift-channel-data-cache") + assert _PRODUCTION_DEFAULT_DISK_PATH.startswith(tempfile.gettempdir()) + assert _PRODUCTION_DEFAULT_DISK_PATH.endswith("sift-channel-data-cache") class TestMergePages: diff --git a/python/lib/sift_client/_tests/_internal/test_disk_cache_config.py b/python/lib/sift_client/_tests/_internal/test_disk_cache_config.py new file mode 100644 index 000000000..bce8a4ab9 --- /dev/null +++ b/python/lib/sift_client/_tests/_internal/test_disk_cache_config.py @@ -0,0 +1,112 @@ +"""Tests for :class:`sift_client._internal.disk_cache_config.DiskCacheConfig`. + +The class is a small intent holder; the tests pin three things that +resource lazy-init code relies on: + +* Enable / disable round-trips preserve the right state and clear overrides. +* ``using_default_path`` reflects "enabled AND no user override", which + drives the silent-fallback-vs-loud-raise distinction in resources. +* ``enable`` accepts ``os.PathLike`` and stringifies it eagerly so consumers + never need to handle ``pathlib.Path`` vs ``str``. 
+""" + +from __future__ import annotations + +import pathlib + +import pytest + +from sift_client._internal.disk_cache_config import DiskCacheConfig + + +class TestDiskCacheConfig: + def test_opt_out_initial_state_enabled_no_overrides(self) -> None: + """``enabled=True`` (opt-out) starts on with no overrides.""" + config = DiskCacheConfig(enabled=True) + assert config.enabled + assert config.path is None + assert config.max_bytes is None + assert config.using_default_path + + def test_opt_in_initial_state_disabled(self) -> None: + """``enabled=False`` (opt-in) starts off; ``using_default_path`` is False.""" + config = DiskCacheConfig(enabled=False) + assert not config.enabled + assert config.path is None + assert config.max_bytes is None + assert not config.using_default_path + + def test_enable_with_no_args_keeps_defaults(self) -> None: + """``enable()`` with no args turns on and clears any prior overrides.""" + config = DiskCacheConfig(enabled=False) + config.enable() + assert config.enabled + assert config.path is None + assert config.max_bytes is None + assert config.using_default_path + + def test_enable_with_path_marks_non_default(self) -> None: + """A user-supplied path flips ``using_default_path`` off.""" + config = DiskCacheConfig(enabled=True) + config.enable(path="/custom/path") + assert config.enabled + assert config.path == "/custom/path" + assert not config.using_default_path + + def test_enable_with_max_bytes_keeps_default_path(self) -> None: + """Setting ``max_bytes`` alone doesn't make the path non-default.""" + config = DiskCacheConfig(enabled=True) + config.enable(max_bytes=1024) + assert config.enabled + assert config.path is None + assert config.max_bytes == 1024 + assert config.using_default_path + + def test_enable_stringifies_pathlike(self) -> None: + """``os.PathLike`` inputs are stored as strings so consumers can be dumb.""" + config = DiskCacheConfig(enabled=True) + config.enable(path=pathlib.Path("/some/path")) + assert 
isinstance(config.path, str) + assert config.path == "/some/path" + + def test_disable_clears_overrides(self) -> None: + """``disable()`` zeroes path and max_bytes so a future re-enable starts clean.""" + config = DiskCacheConfig(enabled=True) + config.enable(path="/custom", max_bytes=4096) + config.disable() + assert not config.enabled + assert config.path is None + assert config.max_bytes is None + assert not config.using_default_path + + def test_reenable_after_disable_returns_to_defaults(self) -> None: + """``disable`` then ``enable()`` (no args) restores the opt-out starting state.""" + config = DiskCacheConfig(enabled=True) + config.enable(path="/custom", max_bytes=4096) + config.disable() + config.enable() + assert config.enabled + assert config.path is None + assert config.max_bytes is None + assert config.using_default_path + + @pytest.mark.parametrize( + ("enabled", "path", "expected"), + [ + (True, None, True), + (True, "/custom", False), + (False, None, False), + (False, "/custom", False), # disabled wins even with a stashed path + ], + ids=["enabled+default", "enabled+custom", "disabled+default", "disabled+custom"], + ) + def test_using_default_path_matrix( + self, enabled: bool, path: str | None, expected: bool + ) -> None: + """``using_default_path`` is the AND of ``enabled`` and ``path is None``.""" + config = DiskCacheConfig(enabled=enabled) + if path is not None: + # Bypass enable() so we can exercise the disabled+custom combo + # without enable() flipping enabled back on. 
+ config._path = path + assert config.using_default_path is expected diff --git a/python/lib/sift_client/_tests/conftest.py b/python/lib/sift_client/_tests/conftest.py index 41469dac5..31aebf03a 100644 --- a/python/lib/sift_client/_tests/conftest.py +++ b/python/lib/sift_client/_tests/conftest.py @@ -9,6 +9,33 @@ from sift_client.util.util import AsyncAPIs +@pytest.fixture(autouse=True) +def _isolate_default_disk_cache_path(monkeypatch, tmp_path): + """Redirect ``ChannelCache.DEFAULT_DISK_PATH`` to a per-test tmp dir. + + The channel data disk cache is **opt-out** — any test that triggers the + lazy ``DataLowLevelClient`` init through ``ChannelsAPIAsync`` would + otherwise create the real ``/tmp/sift-channel-data-cache`` directory and + leak state across runs. Redirecting the default to ``tmp_path`` keeps + every test self-contained without each test having to know that the disk + tier is on by default. + + The override deliberately preserves the ``sift-channel-data-cache`` + suffix so ``TestChannelCacheClearDisk::test_default_path_constant_under_tmp`` + keeps validating the real shape of the constant. + + Importing ``ChannelCache`` here pulls in pandas, but the import cost is + paid only once per session; per test, only the monkeypatch itself runs. + """ + from sift_client._internal.low_level_wrappers.data import ChannelCache + + monkeypatch.setattr( + ChannelCache, + "DEFAULT_DISK_PATH", + str(tmp_path / "sift-channel-data-cache"), + ) + + @pytest.fixture(scope="session") def sift_client() -> SiftClient: """Create a SiftClient instance for testing. diff --git a/python/lib/sift_client/_tests/resources/test_channels.py b/python/lib/sift_client/_tests/resources/test_channels.py index e3d29ab73..3ed3826b1 100644 --- a/python/lib/sift_client/_tests/resources/test_channels.py +++ b/python/lib/sift_client/_tests/resources/test_channels.py @@ -507,6 +507,11 @@ class TestConfigureDataCache: """``configure_data_cache`` is the resource-level knob for the in-memory channel data cache. 
Before the cache is initialized, it stashes the value for the lazy-init path; after, it retunes the live cache. + + Each test that triggers ``_ensure_data_low_level_client`` opens the + opt-out disk tier (redirected to ``tmp_path`` by the conftest fixture) + and closes the handle in ``finally`` so the diskcache lock doesn't leak + into the next test. """ def test_before_lazy_init_propagates_to_cache(self): @@ -515,24 +520,33 @@ def test_before_lazy_init_propagates_to_cache(self): api.configure_data_cache(max_bytes=123) assert api._data_low_level_client is None # still lazy api._ensure_data_low_level_client() - assert api._data_low_level_client.channel_cache.max_bytes == 123 + try: + assert api._data_low_level_client.channel_cache.max_bytes == 123 + finally: + api._data_low_level_client.channel_cache.close() def test_after_lazy_init_updates_live_cache(self): """Configuring after first use retunes the live cache in place.""" api = _make_api() api._ensure_data_low_level_client() - original_client = api._data_low_level_client - api.configure_data_cache(max_bytes=456) - # Same wrapper instance — we mutated, not replaced. - assert api._data_low_level_client is original_client - assert api._data_low_level_client.channel_cache.max_bytes == 456 + try: + original_client = api._data_low_level_client + api.configure_data_cache(max_bytes=456) + # Same wrapper instance — we mutated, not replaced. 
+ assert api._data_low_level_client is original_client + assert api._data_low_level_client.channel_cache.max_bytes == 456 + finally: + api._data_low_level_client.channel_cache.close() def test_zero_disables_cache_via_resource(self): """Resource-level ``max_bytes=0`` end-to-end disables the cache.""" api = _make_api() api.configure_data_cache(max_bytes=0) api._ensure_data_low_level_client() - assert not api._data_low_level_client.channel_cache.enabled + try: + assert not api._data_low_level_client.channel_cache.enabled + finally: + api._data_low_level_client.channel_cache.close() def test_negative_raises(self): api = _make_api() @@ -549,10 +563,24 @@ class TestEnableDataCacheDisk: resource-level wiring around it. """ - def test_disabled_by_default(self): + def test_enabled_by_default(self): + """Disk persistence is opt-out: the default-constructed resource + lands at ``ChannelCache.DEFAULT_DISK_PATH`` on first ``get_data``. + + The autouse ``_isolate_default_disk_cache_path`` fixture in + ``conftest.py`` redirects the constant to a per-test tmp dir so this + doesn't litter the real ``/tmp``. + """ + from sift_client._internal.low_level_wrappers.data import ChannelCache + api = _make_api() api._ensure_data_low_level_client() - assert not api._data_low_level_client.channel_cache.disk_enabled + cache = api._data_low_level_client.channel_cache + try: + assert cache.disk_enabled + assert cache.disk_path == ChannelCache.DEFAULT_DISK_PATH + finally: + cache.close() def test_enable_before_lazy_init_propagates(self, tmp_path): api = _make_api() @@ -567,7 +595,13 @@ def test_enable_before_lazy_init_propagates(self, tmp_path): cache.close() def test_enable_after_lazy_init_updates_live_cache(self, tmp_path): + """``disable_data_cache_disk`` → ``enable_data_cache_disk`` round-trip + on a live cache swaps the disk handle without recreating the wrapper. 
+ """ api = _make_api() + # Start from the disk-off state so the test exercises the "off → on" + # transition rather than "default-on → reconfigured-on". + api.disable_data_cache_disk() api._ensure_data_low_level_client() cache = api._data_low_level_client.channel_cache try: @@ -625,3 +659,39 @@ def test_clear_data_cache_on_disk_proxies_to_cache(self, tmp_path): api = _make_api() api.clear_data_cache_on_disk(path) assert not path.exists() + + def test_default_path_failure_falls_back_to_memory(self, monkeypatch, tmp_path): + """If the opt-out default disk path can't be opened, the wrapper logs + a warning and continues with the in-memory cache only. + + Simulated by pointing ``DEFAULT_DISK_PATH`` at a path that already + exists as a regular file — ``os.makedirs(..., exist_ok=True)`` raises + ``FileExistsError`` for non-directory targets. + """ + from sift_client._internal.low_level_wrappers.data import ChannelCache + + blocker = tmp_path / "not-a-dir" + blocker.write_text("i am a file, not a directory") + monkeypatch.setattr(ChannelCache, "DEFAULT_DISK_PATH", str(blocker)) + + api = _make_api() + api._ensure_data_low_level_client() # must not raise + cache = api._data_low_level_client.channel_cache + try: + # Disk silently dropped, memory still working. + assert not cache.disk_enabled + assert cache.enabled + finally: + cache.close() + + def test_explicit_path_failure_propagates(self, tmp_path): + """An explicit ``enable_data_cache_disk(path=...)`` that can't open + propagates the OSError — silent fallback would hide a user mistake. 
+ """ + blocker = tmp_path / "not-a-dir" + blocker.write_text("i am a file, not a directory") + + api = _make_api() + api.enable_data_cache_disk(path=str(blocker)) + with pytest.raises(FileExistsError): + api._ensure_data_low_level_client() diff --git a/python/lib/sift_client/client.py b/python/lib/sift_client/client.py index 6afc36386..5db5bf473 100644 --- a/python/lib/sift_client/client.py +++ b/python/lib/sift_client/client.py @@ -149,10 +149,6 @@ def __init__( mapped to a frontend automatically; see the ``app_url`` property. A value here takes precedence over ``connection_config.app_url``. - Resource-specific knobs live on the resource itself. For example, - to tune the channel data cache used by ``client.channels.get_data``:: - - client.channels.configure_data_cache(max_bytes=128 * 1024 * 1024) """ if not (api_key and grpc_url and rest_url) and not connection_config: raise ValueError( diff --git a/python/lib/sift_client/resources/channels.py b/python/lib/sift_client/resources/channels.py index 91322a65c..794930fda 100644 --- a/python/lib/sift_client/resources/channels.py +++ b/python/lib/sift_client/resources/channels.py @@ -1,7 +1,9 @@ from __future__ import annotations +import logging from typing import TYPE_CHECKING +from sift_client._internal.disk_cache_config import DiskCacheConfig from sift_client._internal.low_level_wrappers.channels import ChannelsLowLevelClient from sift_client._internal.low_level_wrappers.units import UnitsLowLevelClient from sift_client.resources._base import ResourceBase @@ -20,6 +22,8 @@ from sift_client.client import SiftClient +logger = logging.getLogger(__name__) + def _channel_ids_from_list(items: list[str | Channel]) -> list[str]: """Resolve a list of channel IDs or Channel objects to a list of channel IDs. @@ -69,13 +73,7 @@ def __init__(self, sift_client: SiftClient): # at lazy-init time" so we don't have to import ``data.py`` (and # therefore pandas) just to remember the default. 
self._data_cache_max_bytes: int | None = None - # Disk-tier configuration, stashed until lazy init (or applied - # immediately if the wrapper is already constructed). All three - # remain ``None`` / ``False`` when the disk tier is disabled, which - # is the default — disk persistence is opt-in. - self._disk_cache_enabled: bool = False - self._disk_cache_path: str | None = None - self._disk_cache_max_bytes: int | None = None + self._disk_cache_config = DiskCacheConfig(enabled=True) def configure_data_cache(self, *, max_bytes: int) -> None: """Configure the in-memory channel data cache used by ``get_data``. @@ -105,7 +103,11 @@ def enable_data_cache_disk( path: str | os.PathLike[str] | None = None, max_bytes: int | None = None, ) -> None: - """Persist the channel data cache to disk, surviving process restarts. + """Configure (or re-enable after ``disable_data_cache_disk``) the disk cache. + + Disk persistence is **on by default** at ``ChannelCache.DEFAULT_DISK_PATH``; + use this method when you want to override the path or size, or to turn + the tier back on after a prior ``disable_data_cache_disk`` call. The disk-backed tier is a second-chance layer beneath the in-memory cache: on a memory miss, ``get_data`` checks disk before going to the @@ -117,35 +119,37 @@ def enable_data_cache_disk( (different ``path`` or ``max_bytes``) closes the previous disk handle and opens a new one; in-memory contents are preserved across the swap. + An explicit ``path`` that can't be opened (e.g. permission denied, + read-only filesystem) raises so the caller knows the request didn't + take. The default-path open does *not* raise — see + ``_ensure_data_low_level_client`` for the fall-back-to-memory path. + Args: path: Directory to persist the cache to. ``None`` (the default) - uses ``DEFAULT_DISK_CACHE_PATH``. Existing entries at the path - become available as cache hits. + uses ``ChannelCache.DEFAULT_DISK_PATH``. Existing entries at + the path become available as cache hits. 
max_bytes: Byte cap on the disk tier. ``None`` uses - ``DEFAULT_DISK_CACHE_MAX_BYTES`` (4 GiB). When the bound is - reached, ``diskcache``'s LRU eviction takes over. + ``ChannelCache.DEFAULT_DISK_MAX_BYTES`` (4 GiB). When the + bound is reached, ``diskcache``'s LRU eviction takes over. Example: - client.channels.enable_data_cache_disk() client.channels.enable_data_cache_disk(path="/data/sift-cache") client.channels.enable_data_cache_disk(max_bytes=1024 ** 3) # 1 GiB """ - self._disk_cache_enabled = True - self._disk_cache_path = str(path) if path is not None else None - self._disk_cache_max_bytes = max_bytes + self._disk_cache_config.enable(path=path, max_bytes=max_bytes) if self._data_low_level_client is not None: self._data_low_level_client.channel_cache.enable_disk(path=path, max_bytes=max_bytes) def disable_data_cache_disk(self) -> None: - """Stop persisting the channel data cache to disk. + """Opt out of disk persistence for the channel data cache. - Closes the disk-cache file handle. The on-disk directory is NOT - deleted — use :meth:`clear_data_cache_on_disk` to wipe it. In-memory - entries are preserved. + Disk persistence is on by default; call this when you don't want any + cached data written to disk. Closes any open disk-cache file handle. + The on-disk directory is NOT deleted — use + :meth:`clear_data_cache_on_disk` to wipe it. In-memory entries are + preserved. """ - self._disk_cache_enabled = False - self._disk_cache_path = None - self._disk_cache_max_bytes = None + self._disk_cache_config.disable() if self._data_low_level_client is not None: self._data_low_level_client.channel_cache.disable_disk() @@ -153,14 +157,16 @@ def clear_data_cache_on_disk(self, path: str | os.PathLike[str] | None = None) - """Delete a previously-persisted on-disk channel data cache directory. Drops stale caches from previous sessions, recovers from a corrupt - cache, or reclaims disk space. 
Removes the directory entirely; a - future :meth:`enable_data_cache_disk` call at the same path will see - a fresh empty cache. + cache, or reclaims disk space. Removes the directory entirely; if disk + persistence is on, the next ``get_data`` re-opens an empty cache at + the same path. This is a thin proxy around :meth:`ChannelCache.clear_disk <sift_client._internal.low_level_wrappers.data.ChannelCache.clear_disk>` — exposed on the resource so callers don't need to reach into - ``_internal`` modules. But that is a class method so the user could call without a client if desired. + ``_internal`` modules. The underlying classmethod is also reachable + directly (``ChannelCache.clear_disk(...)``) if the caller doesn't have + a ``SiftClient`` handy. Args: path: Directory of the cache to clear. ``None`` (the default) @@ -361,17 +367,43 @@ def _ensure_data_low_level_client(self): kwargs: dict = {} if self._data_cache_max_bytes is not None: kwargs["data_cache_max_bytes"] = self._data_cache_max_bytes - if self._disk_cache_enabled: + disk_config = self._disk_cache_config + if disk_config.enabled: # ``disk_path=None`` means "disabled" to ChannelCache; substitute - # the default explicitly so an explicit ``enable_data_cache_disk()`` - # without a path still opens the disk tier. - kwargs["disk_cache_path"] = self._disk_cache_path or ChannelCache.DEFAULT_DISK_PATH - if self._disk_cache_max_bytes is not None: - kwargs["disk_cache_max_bytes"] = self._disk_cache_max_bytes - self._data_low_level_client = DataLowLevelClient( - grpc_client=self.client.grpc_client, - **kwargs, - ) + # the default explicitly so the opt-out default still opens + # the disk tier. ``DEFAULT_DISK_PATH`` is read here (not at + # config construction) so test fixtures that monkeypatch the + # class attribute see the override.
+ kwargs["disk_cache_path"] = disk_config.path or ChannelCache.DEFAULT_DISK_PATH + if disk_config.max_bytes is not None: + kwargs["disk_cache_max_bytes"] = disk_config.max_bytes + try: + self._data_low_level_client = DataLowLevelClient( + grpc_client=self.client.grpc_client, + **kwargs, + ) + except Exception: + # Explicit user-supplied disk path failures propagate so the + # caller knows their request didn't take. Default-path failures + # (read-only ``/tmp``, restricted containers, etc.) degrade + # silently to memory-only so ``get_data`` still works. + if not disk_config.using_default_path: + raise + logger.warning( + "Could not open the default channel data disk cache at %r; " + "falling back to in-memory cache only. Call " + "``client.channels.disable_data_cache_disk()`` to silence " + "this warning, or pass an explicit path via " + "``enable_data_cache_disk(path=...)``.", + kwargs.get("disk_cache_path"), + exc_info=True, + ) + kwargs.pop("disk_cache_path", None) + kwargs.pop("disk_cache_max_bytes", None) + self._data_low_level_client = DataLowLevelClient( + grpc_client=self.client.grpc_client, + **kwargs, + ) async def get_data( self, diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index 704e3b8c0..8e76a56ff 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -457,14 +457,16 @@ class ChannelsAPI: """Delete a previously-persisted on-disk channel data cache directory. Drops stale caches from previous sessions, recovers from a corrupt - cache, or reclaims disk space. Removes the directory entirely; a - future :meth:`enable_data_cache_disk` call at the same path will see - a fresh empty cache. + cache, or reclaims disk space. Removes the directory entirely; if disk + persistence is on, the next ``get_data`` re-opens an empty cache at + the same path. 
This is a thin proxy around :meth:`ChannelCache.clear_disk <sift_client._internal.low_level_wrappers.data.ChannelCache.clear_disk>` — exposed on the resource so callers don't need to reach into - ``_internal`` modules. But that is a class method so the user could call without a client if desired. + ``_internal`` modules. The underlying classmethod is also reachable + directly (``ChannelCache.clear_disk(...)``) if the caller doesn't have + a ``SiftClient`` handy. Args: path: Directory of the cache to clear. ``None`` (the default) @@ -495,18 +497,24 @@ class ChannelsAPI: ... def disable_data_cache_disk(self) -> None: - """Stop persisting the channel data cache to disk. + """Opt out of disk persistence for the channel data cache. - Closes the disk-cache file handle. The on-disk directory is NOT - deleted — use :meth:`clear_data_cache_on_disk` to wipe it. In-memory - entries are preserved. + Disk persistence is on by default; call this when you don't want any + cached data written to disk. Closes any open disk-cache file handle. + The on-disk directory is NOT deleted — use + :meth:`clear_data_cache_on_disk` to wipe it. In-memory entries are + preserved. """ ... def enable_data_cache_disk( self, *, path: str | os.PathLike[str] | None = None, max_bytes: int | None = None ) -> None: - """Persist the channel data cache to disk, surviving process restarts. + """Configure (or re-enable after ``disable_data_cache_disk``) the disk cache. + + Disk persistence is **on by default** at ``ChannelCache.DEFAULT_DISK_PATH``; + use this method when you want to override the path or size, or to turn + the tier back on after a prior ``disable_data_cache_disk`` call. The disk-backed tier is a second-chance layer beneath the in-memory cache: on a memory miss, ``get_data`` checks disk before going to the @@ -518,16 +526,20 @@ class ChannelsAPI: (different ``path`` or ``max_bytes``) closes the previous disk handle and opens a new one; in-memory contents are preserved across the swap. + An explicit ``path`` that can't be opened (e.g.
permission denied, + read-only filesystem) raises so the caller knows the request didn't + take. The default-path open does *not* raise — see + ``_ensure_data_low_level_client`` for the fall-back-to-memory path. + Args: path: Directory to persist the cache to. ``None`` (the default) - uses ``DEFAULT_DISK_CACHE_PATH``. Existing entries at the path - become available as cache hits. + uses ``ChannelCache.DEFAULT_DISK_PATH``. Existing entries at + the path become available as cache hits. max_bytes: Byte cap on the disk tier. ``None`` uses - ``DEFAULT_DISK_CACHE_MAX_BYTES`` (4 GiB). When the bound is - reached, ``diskcache``'s LRU eviction takes over. + ``ChannelCache.DEFAULT_DISK_MAX_BYTES`` (4 GiB). When the + bound is reached, ``diskcache``'s LRU eviction takes over. Example: - client.channels.enable_data_cache_disk() client.channels.enable_data_cache_disk(path="/data/sift-cache") client.channels.enable_data_cache_disk(max_bytes=1024 ** 3) # 1 GiB """ diff --git a/python/pyproject.toml b/python/pyproject.toml index b12c29cb0..dfe94c043 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -252,6 +252,13 @@ dev-all = ["development", "all", "build"] docs-build = ["dev-all", "docs"] # Note python 3.9+ [tool.uv] +# Pin uv to a version that writes lockfile revision 3 (introduced in uv 0.8.4 +# by astral-sh/uv#14489, which added ``exclude-newer-package`` to the lock +# schema). Older uv silently rolls the lockfile back to revision 2 on the +# next ``uv lock`` / ``uv sync`` (a no-op-looking change), then a teammate +# on a newer uv re-bumps it — churning the revision field in PRs. +# ``required-version`` blocks the older uv up front with a clear error. +required-version = ">=0.8.4" # Fork resolution per Python minor in the support range. Each fork resolves # independently, which lets 3.8 pick numpy 1.24.x + rosbags 0.9.23 without # being constrained by the 3.9+ universe (numpy 2.0 drops 3.8). 
@@ -351,6 +358,7 @@ ignore_errors = true [[tool.mypy.overrides]] module = "nptdms" ignore_missing_imports = true +ignore_errors = true # diskcache ships without inline type hints or PEP 561 marker. Used by the # channel data cache's optional on-disk tier. diff --git a/python/uv.lock b/python/uv.lock index 43c24b552..7a0c68645 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.8" resolution-markers = [ "python_full_version >= '3.8.2' and python_full_version < '3.9'", From 3c6a2ca149a5ddd8fd2034051d3741310da06427 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 25 Jun 2026 14:03:49 -0700 Subject: [PATCH 10/14] handle single large fetch --- .../_internal/low_level_wrappers/data.py | 54 +++++- .../_internal/low_level_wrappers/test_data.py | 159 +++++++++++++++++- 2 files changed, 203 insertions(+), 10 deletions(-) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data.py b/python/lib/sift_client/_internal/low_level_wrappers/data.py index ae0bbf6e5..e0bcf67a3 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data.py @@ -125,6 +125,13 @@ def __init__( self._entries: OrderedDict[str, ChannelCacheEntry] = OrderedDict() self._total_bytes: int = 0 self._max_bytes: int = max_bytes + # Channels we've already logged an "entry exceeds tier cap" warning + # for. The check on the put path would otherwise spam the log once + # per ``get_data`` call for any channel whose typical entry is bigger + # than the cap. A successful normal put for the same channel clears + # the bit so a future regression re-warns. 
+ self._oversized_memory_warned: set[str] = set() + self._oversized_disk_warned: set[str] = set() self._disk: diskcache.Cache | None = None self._disk_path: str | None = None self._disk_max_bytes: int | None = None @@ -307,8 +314,30 @@ def put(self, channel_id: str, entry: ChannelCacheEntry) -> None: if self.enabled: self._put_memory(channel_id, entry) if self._disk is not None: + if ( + self._disk_max_bytes is not None + and entry.size_bytes > self._disk_max_bytes + ): + if channel_id not in self._oversized_disk_warned: + logger.warning( + "Channel %s data (%d bytes) is larger than the disk " + "cache cap (%d bytes); skipping disk cache for this " + "channel so other entries aren't evicted. Raise the " + "cap via ``client.channels.enable_data_cache_disk(" + "max_bytes=...)`` to cache this channel on disk.", + channel_id, + entry.size_bytes, + self._disk_max_bytes, + ) + self._oversized_disk_warned.add(channel_id) + try: + self._disk.delete(channel_id, retry=True) + except Exception: + pass + return try: self._disk.set(channel_id, entry, retry=True) + self._oversized_disk_warned.discard(channel_id) except Exception: # Best-effort persistence: keep going on disk errors so the # in-memory cache (and the user's ``get_data`` call) still @@ -323,6 +352,11 @@ def invalidate(self, channel_id: str) -> None: prior = self._entries.pop(channel_id, None) if prior is not None: self._total_bytes -= prior.size_bytes + # Invalidation is a fresh start for this channel; if it was warned + # about as oversized previously, the next put should re-evaluate + # against the current cap and re-warn if still too big. 
+ self._oversized_memory_warned.discard(channel_id) + self._oversized_disk_warned.discard(channel_id) if self._disk is not None: try: self._disk.delete(channel_id, retry=True) @@ -332,6 +366,8 @@ def invalidate(self, channel_id: str) -> None: def clear(self) -> None: self._entries.clear() self._total_bytes = 0 + self._oversized_memory_warned.clear() + self._oversized_disk_warned.clear() if self._disk is not None: self._disk.clear() @@ -340,10 +376,26 @@ def close(self) -> None: self._close_disk() def _put_memory(self, channel_id: str, entry: ChannelCacheEntry) -> None: - """Memory-tier insert + eviction. Caller has already gated on ``enabled``.""" + """Memory-tier insert + eviction. Caller has already gated on ``enabled``. + """ prior = self._entries.pop(channel_id, None) if prior is not None: self._total_bytes -= prior.size_bytes + if entry.size_bytes > self._max_bytes: + if channel_id not in self._oversized_memory_warned: + logger.warning( + "Channel %s data (%d bytes) is larger than the in-memory " + "cache cap (%d bytes); skipping cache for this channel so " + "other entries aren't evicted. 
Raise the cap via " + "``client.channels.configure_data_cache(max_bytes=...)`` " + "to cache this channel.", + channel_id, + entry.size_bytes, + self._max_bytes, + ) + self._oversized_memory_warned.add(channel_id) + return + self._oversized_memory_warned.discard(channel_id) self._entries[channel_id] = entry self._total_bytes += entry.size_bytes self._evict_until_under_bound() diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py index 0ace402d2..5ef05b9d7 100644 --- a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py +++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py @@ -22,6 +22,7 @@ from __future__ import annotations +import logging from contextlib import contextmanager from datetime import datetime, timedelta, timezone from typing import Any, Iterator @@ -102,6 +103,31 @@ def _invariant_holds(cache: ChannelCache) -> bool: return cache.total_bytes == sum(e.size_bytes for e in cache._entries.values()) +@contextmanager +def _capture_data_warnings() -> Iterator[list[logging.LogRecord]]: + """Capture warnings emitted by the ``data`` module's logger directly. + + Pytest's ``caplog`` reads from the root logger, but the Sift pytest plugin + sets ``propagate=False`` on the ``sift_client`` logger when audit logging + is active, so records emitted from any descendant don't reach the root. + Attaching a list-backed handler at the leaf logger bypasses that and + surfaces exactly the records we emit. 
+ """ + target = logging.getLogger("sift_client._internal.low_level_wrappers.data") + records: list[logging.LogRecord] = [] + + class _ListHandler(logging.Handler): + def emit(self, record: logging.LogRecord) -> None: + records.append(record) + + handler = _ListHandler(level=logging.WARNING) + target.addHandler(handler) + try: + yield records + finally: + target.removeHandler(handler) + + def _patch_deserializer(sentinel_to_frames: dict[str, dict[str, pd.DataFrame]]) -> Any: """Patch ``try_deserialize_channel_data`` to translate string sentinels. @@ -236,25 +262,101 @@ def test_get_promotes_to_most_recent(self) -> None: assert "c" in cache assert _invariant_holds(cache) - def test_oversized_entry_evicts_with_neighbours(self) -> None: - """A single entry larger than the cap ends up evicted itself. + def test_oversized_entry_skips_cache_preserves_neighbours(self) -> None: + """A single entry larger than the cap is rejected without evicting peers. - The alternative ("keep the oversized entry and accept that the cap - is soft") would silently reintroduce unbounded growth for any - workload whose typical entry is bigger than ``max_bytes``. + Before this guard, ``_put_memory`` would insert the oversized entry, + then loop popping LRU until the cap was satisfied — but since no + amount of eviction makes an oversized entry fit, the loop drained + every other entry *and* the oversized one, wiping the cache on every + fetch of that channel. The fix: detect the oversized case up front, + warn, and skip the insert. """ small_a, small_b, oversized = _entry(rows=10), _entry(rows=10), _entry(rows=10_000) cache = ChannelCache(max_bytes=small_a.size_bytes + small_b.size_bytes) cache.put("a", small_a) cache.put("b", small_b) - cache.put("huge", oversized) + with _capture_data_warnings() as records: + cache.put("huge", oversized) assert "huge" not in cache - # Every other entry was evicted in the failed attempt to make room. 
- assert "a" not in cache - assert "b" not in cache + # Critical: the previously cached entries survive. + assert "a" in cache + assert "b" in cache + assert cache.total_bytes == small_a.size_bytes + small_b.size_bytes + assert _invariant_holds(cache) + # User gets a clear, actionable warning. + assert any("larger than the in-memory cache cap" in r.getMessage() for r in records) + + def test_oversized_put_drops_prior_entry(self) -> None: + """An oversized re-insert must drop the prior slice, not silently keep it. + + Otherwise a stale subrange would masquerade as a hit on the next + ``get`` even though the caller's intent was to refresh the entry. + """ + small, oversized = _entry(rows=10), _entry(rows=10_000) + cache = ChannelCache(max_bytes=small.size_bytes) + cache.put("chan", small) + assert "chan" in cache + cache.put("chan", oversized) + assert "chan" not in cache assert cache.total_bytes == 0 assert _invariant_holds(cache) + def test_oversized_put_warns_once_per_channel(self) -> None: + """Repeated oversized puts for the same channel log once, not on every call. + + Without dedup, every ``get_data`` for an oversized channel would + write a fresh WARNING line — quickly drowning out other signal in + the logs. + """ + oversized = _entry(rows=10_000) + cache = ChannelCache(max_bytes=oversized.size_bytes // 4) + with _capture_data_warnings() as records: + for _ in range(5): + cache.put("chan", oversized) + warnings = [r for r in records if "larger than the in-memory cache cap" in r.getMessage()] + assert len(warnings) == 1 + + def test_oversized_warning_resets_after_normal_put(self) -> None: + """A successful normal-sized put clears the dedup bit. + + Used by callers who narrow a time window after seeing the warning: + the next oversized regression should re-warn rather than stay silent. 
+ """ + oversized = _entry(rows=10_000) + small = _entry(rows=10) + cache = ChannelCache(max_bytes=small.size_bytes * 2) + with _capture_data_warnings() as records: + cache.put("chan", oversized) # 1st warning + cache.put("chan", small) # resets state + cache.put("chan", oversized) # 2nd warning + warnings = [r for r in records if "larger than the in-memory cache cap" in r.getMessage()] + assert len(warnings) == 2 + + def test_invalidate_resets_oversized_warning(self) -> None: + """``invalidate`` is a fresh start; the next oversized put re-warns.""" + oversized = _entry(rows=10_000) + cache = ChannelCache(max_bytes=oversized.size_bytes // 4) + with _capture_data_warnings() as records: + cache.put("chan", oversized) + cache.invalidate("chan") + cache.put("chan", oversized) + warnings = [r for r in records if "larger than the in-memory cache cap" in r.getMessage()] + assert len(warnings) == 2 + + def test_clear_resets_oversized_warning(self) -> None: + """``clear`` resets all dedup state across channels.""" + oversized = _entry(rows=10_000) + cache = ChannelCache(max_bytes=oversized.size_bytes // 4) + with _capture_data_warnings() as records: + cache.put("chan-a", oversized) + cache.put("chan-b", oversized) + cache.clear() + cache.put("chan-a", oversized) + cache.put("chan-b", oversized) + warnings = [r for r in records if "larger than the in-memory cache cap" in r.getMessage()] + assert len(warnings) == 4 + def test_max_bytes_zero_disables_cache(self) -> None: cache = ChannelCache(max_bytes=0) cache.put("c1", _entry(rows=100)) @@ -449,6 +551,45 @@ def test_clear_wipes_both_tiers(self, tmp_path) -> None: finally: cache.close() + def test_oversized_entry_skips_disk_preserves_other_entries(self, tmp_path) -> None: + """An entry larger than the disk cap is skipped on disk too. 
+ + Without the guard, ``diskcache``'s cull() would evict every other + on-disk row trying to fit an unfittable entry, then drop the entry + itself — the same wipe-everything failure mode as the memory tier. + + Memory is sized to accept small entries but reject the oversized one + too, so memory-tier writes don't compete with disk-tier writes. We + assert on the disk ``_disk`` mapping directly because that's where + the contested behavior lives. + """ + small = _entry(rows=4) + oversized = _entry(rows=10_000) + # ``disk_max_bytes`` has to leave room for ``diskcache``'s pickle + # envelope around each small entry (a few KB) AND be small enough + # that the oversized entry trips the guard. Half the oversized + # DataFrame's raw byte size hits both constraints comfortably. + cache = ChannelCache( + max_bytes=oversized.size_bytes * 2, + disk_path=tmp_path / "disk-oversize", + disk_max_bytes=oversized.size_bytes // 2, + ) + try: + cache.put("small-1", small) + cache.put("small-2", small) + assert cache._disk is not None + with _capture_data_warnings() as records: + cache.put("huge", oversized) + # Disk-side prior entries survive; oversized one was not written. 
+ assert "small-1" in cache._disk + assert "small-2" in cache._disk + assert "huge" not in cache._disk + assert any( + "larger than the disk cache cap" in r.getMessage() for r in records + ) + finally: + cache.close() + def test_disable_disk_preserves_memory(self, tmp_path) -> None: """Turning off disk closes the handle but keeps memory intact.""" cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "disable") From 814280c2bb8d29c2507fc7e6fea67bbd33d4d376 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 25 Jun 2026 14:16:56 -0700 Subject: [PATCH 11/14] fmt --- .../lib/sift_client/_internal/low_level_wrappers/data.py | 8 ++------ .../_tests/_internal/low_level_wrappers/test_data.py | 4 +--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data.py b/python/lib/sift_client/_internal/low_level_wrappers/data.py index e0bcf67a3..04f75dd47 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data.py @@ -314,10 +314,7 @@ def put(self, channel_id: str, entry: ChannelCacheEntry) -> None: if self.enabled: self._put_memory(channel_id, entry) if self._disk is not None: - if ( - self._disk_max_bytes is not None - and entry.size_bytes > self._disk_max_bytes - ): + if self._disk_max_bytes is not None and entry.size_bytes > self._disk_max_bytes: if channel_id not in self._oversized_disk_warned: logger.warning( "Channel %s data (%d bytes) is larger than the disk " @@ -376,8 +373,7 @@ def close(self) -> None: self._close_disk() def _put_memory(self, channel_id: str, entry: ChannelCacheEntry) -> None: - """Memory-tier insert + eviction. Caller has already gated on ``enabled``. - """ + """Memory-tier insert + eviction. 
Caller has already gated on ``enabled``.""" prior = self._entries.pop(channel_id, None) if prior is not None: self._total_bytes -= prior.size_bytes diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py index 5ef05b9d7..d06307a71 100644 --- a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py +++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py @@ -584,9 +584,7 @@ def test_oversized_entry_skips_disk_preserves_other_entries(self, tmp_path) -> N assert "small-1" in cache._disk assert "small-2" in cache._disk assert "huge" not in cache._disk - assert any( - "larger than the disk cache cap" in r.getMessage() for r in records - ) + assert any("larger than the disk cache cap" in r.getMessage() for r in records) finally: cache.close() From 63cc0452496721a1b50acf8f75fc09fbf932c868 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 25 Jun 2026 15:18:45 -0700 Subject: [PATCH 12/14] Remove in memory cache layer. --- python/CHANGELOG.md | 29 +- .../_internal/low_level_wrappers/data.py | 291 +++---- .../_internal/low_level_wrappers/test_data.py | 731 ++++++++---------- .../_tests/resources/test_channels.py | 72 +- python/lib/sift_client/resources/channels.py | 76 +- .../resources/sync_stubs/__init__.pyi | 49 +- 6 files changed, 464 insertions(+), 784 deletions(-) diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index e33995a8e..d3ffaeb65 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -11,31 +11,18 @@ This project adheres to [Semantic Versioning](http://semver.org/). Up to a ~80x speedup for some get_data calls. -#### Bounded channel data cache +#### Channel data cache (opt-out, on by default) -The in-memory channel data cache used by `client.channels.get_data(...)` is now byte-bounded with LRU eviction (default 512 MiB). Once the bound is reached, the least-recently-used cached channel is evicted. 
+`client.channels.get_data(...)` now caches the channel windows it returns to disk by default. Subsequent calls covering the same channel/time range — including from a fresh process — read straight out of the cache instead of going to the wire. This also bounds memory: nothing is held in process after the call returns, which fixes the OOM seen on long sustained pulls (~5–7 GB of cache for a 145M-point pull in earlier versions). -Configure the bound on the `channels` resource: +The default location is `<tmpdir>/sift-channel-data-cache`, capped at 4 GiB with LRU eviction. If the default path can't be opened (read-only filesystem, restricted container, etc.), the client logs a warning and continues with caching disabled — `get_data` still works, it just always goes to the wire. -```python -client.channels.configure_data_cache(max_bytes=128 * 1024 * 1024) # 128 MiB cap -client.channels.configure_data_cache(max_bytes=0) # disable caching -``` - -`configure_data_cache` may be called at any time; if the cache is already populated, the new bound is applied immediately and excess entries are evicted. - -`ignore_cache=True` on `client.channels.get_data(...)` now also skips writing into the cache, matching its read-side bypass semantics. Previously a "non-caching" workload still appended to the shared cache on every call, which still caused increased memory usage. - -#### On-disk channel data cache (opt-out, on by default) - -The channel data cache now persists to disk by default, surviving process restarts. The disk tier is a second-chance layer beneath the in-memory cache: on a memory miss, `get_data` checks disk before going to the wire. Re-running the same workload in a new session picks up the previously-cached windows for free — no configuration required. - -The default location is `<tmpdir>/sift-channel-data-cache`, capped at 4 GiB with LRU eviction.
If the default path can't be opened (read-only filesystem, restricted container, etc.), the client logs a warning and falls back to the in-memory cache only — `get_data` continues to work. +`ignore_cache=True` on `client.channels.get_data(...)` now skips writing into the cache as well as reading from it. Previously a "non-caching" workload still appended to the shared cache on every call. -Opt out, reconfigure, or wipe the on-disk cache from the `channels` resource: +Opt out, reconfigure, or wipe the cache from the `channels` resource: ```python -# Opt out — no data persisted to disk. +# Opt out — no data persisted to disk; every get_data call goes to the wire. client.channels.disable_data_cache_disk() # Reconfigure the location or byte cap. @@ -46,9 +33,9 @@ client.channels.clear_data_cache_on_disk() # default tmp path client.channels.clear_data_cache_on_disk("/data/sift-cache") # custom path ``` -`enable_data_cache_disk` is also the way to turn the tier back on after a prior `disable_data_cache_disk` call. +`enable_data_cache_disk` is also the way to turn the cache back on after a prior `disable_data_cache_disk` call. -The disk tier is powered by [`diskcache`](https://grantjenks.com/docs/diskcache/) (pure-Python, SQLite-backed) and has its own independent byte cap with LRU eviction. The in-memory tier remains the fast path — disk is only consulted on a memory miss. +The cache is powered by [`diskcache`](https://grantjenks.com/docs/diskcache/) (pure-Python, SQLite-backed) with LRU eviction. 
#### Resource and principal attributes (ABAC) diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data.py b/python/lib/sift_client/_internal/low_level_wrappers/data.py index 04f75dd47..03ab29268 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data.py @@ -5,7 +5,6 @@ import os import shutil import tempfile -from collections import OrderedDict from datetime import datetime, timezone from pathlib import Path from typing import TYPE_CHECKING, Any, cast @@ -41,11 +40,6 @@ # has been resolved. In the mean time each channel gets its own request. REQUEST_BATCH_SIZE = 1 -# Default in-memory budget for cached channel DataFrames, per ``DataLowLevelClient`` -# instance. 512 MiB is well below typical limits while still letting common -# interactive workloads stay in cache. Override via ``SiftClient(data_cache_max_bytes=...)``. -DEFAULT_DATA_CACHE_MAX_BYTES = 512 * 1024 * 1024 - class ChannelCacheEntry(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) @@ -67,33 +61,37 @@ def _new_cache_entry( class ChannelCache: - """Two-tier cache of per-channel DataFrames. - - Tier 1: an LRU-ordered, byte-bounded in-memory dict (hot path). ``max_bytes - <= 0`` disables this tier: ``get`` always misses memory, ``put`` doesn't - populate it. - - Tier 2 (optional, see ``enable_disk``): a ``diskcache``-backed write-through - layer that survives process restarts. When enabled, ``put`` writes to both - tiers, ``get`` falls back to disk on a memory miss (promoting the hit back - into memory), and ``invalidate``/``clear`` cascade to disk. The disk tier - has its own byte cap that ``diskcache`` enforces with LRU eviction. - - The two tiers are independent: setting ``max_bytes=0`` keeps the disk layer - active, useful for "cold storage only" workloads. + """Disk-backed cache of per-channel DataFrames. + + A ``diskcache``-backed key/value store that survives process restarts. 
+ ``put`` writes through to disk, ``get`` reads from disk, and + ``invalidate``/``clear`` remove entries. The disk tier has a byte cap + that ``diskcache`` enforces with its own LRU eviction. + + When no ``disk_path`` is supplied the cache is a no-op: ``get`` always + returns ``None``, ``__contains__`` is always ``False``, and ``put`` is + silently dropped. This is the "caching disabled" mode used after a + :meth:`disable_disk` call (or when disk persistence is turned off on + the owning resource). + + An in-memory tier previously sat in front of disk. It was removed once + benchmarks showed that for the workloads driving the OOM regression the + extra memory footprint outweighed the per-call pickle/deserialize cost + on a warm disk hit; if profiling shows the disk reads dominating again, + re-introduce a small front cache here. """ - #: Default directory for the on-disk tier. Lives under - #: ``tempfile.gettempdir()`` so it survives across sessions of the same - #: user but doesn't pollute the user's home dir. The suffix is fixed so - #: multiple processes (different ``SiftClient`` instances, notebooks, etc.) - #: naturally share the same store and can read each other's prior sessions. + #: Default directory for the cache. Lives under ``tempfile.gettempdir()`` + #: so it survives across sessions of the same user but doesn't pollute + #: the user's home dir. The suffix is fixed so multiple processes + #: (different ``SiftClient`` instances, notebooks, etc.) naturally share + #: the same store and can read each other's prior sessions. DEFAULT_DISK_PATH: str = os.path.join(tempfile.gettempdir(), "sift-channel-data-cache") - #: Default byte cap for the disk tier when ``enable_disk`` is called - #: without an explicit ``max_bytes``. 4 GiB is a generous ceiling for the - #: typical ``/tmp`` filesystem; ``diskcache`` enforces it with its own - #: SQLite-backed LRU eviction once the bound is reached. 
+ #: Default byte cap for the cache when ``enable_disk`` is called without + #: an explicit ``max_bytes``. 4 GiB is a generous ceiling for the typical + #: ``/tmp`` filesystem; ``diskcache`` enforces it with its own SQLite- + #: backed LRU eviction once the bound is reached. DEFAULT_DISK_MAX_BYTES: int = 4 * 1024 * 1024 * 1024 #: Marker file ``diskcache`` writes inside every cache directory. We @@ -103,34 +101,27 @@ class ChannelCache: def __init__( self, - max_bytes: int = DEFAULT_DATA_CACHE_MAX_BYTES, *, disk_path: str | os.PathLike[str] | None = None, disk_max_bytes: int | None = None, ): - """Construct an in-memory cache, optionally backed by disk. + """Construct a disk-backed cache. Args: - max_bytes: Byte cap on the in-memory tier. ``0`` disables it. - disk_path: Directory for the disk tier. ``None`` (the default) - disables disk. A previously-populated directory is reused, - so subsequent sessions can read from existing entries. - disk_max_bytes: Byte cap on the disk tier. ``None`` falls back to + disk_path: Directory for the cache. ``None`` disables caching + entirely (every operation becomes a no-op). A previously- + populated directory is reused, so subsequent sessions can + read existing entries. + disk_max_bytes: Byte cap on disk usage. ``None`` falls back to ``DEFAULT_DISK_MAX_BYTES``. Ignored when ``disk_path`` is ``None``. """ - if max_bytes < 0: - raise ValueError(f"data_cache_max_bytes must be >= 0, got {max_bytes}") self.name_id_map: dict[str, str] = {} - self._entries: OrderedDict[str, ChannelCacheEntry] = OrderedDict() - self._total_bytes: int = 0 - self._max_bytes: int = max_bytes - # Channels we've already logged an "entry exceeds tier cap" warning + # Channels we've already logged an "entry exceeds disk cap" warning # for. The check on the put path would otherwise spam the log once # per ``get_data`` call for any channel whose typical entry is bigger # than the cap. 
A successful normal put for the same channel clears # the bit so a future regression re-warns. - self._oversized_memory_warned: set[str] = set() self._oversized_disk_warned: set[str] = set() self._disk: diskcache.Cache | None = None self._disk_path: str | None = None @@ -171,64 +162,31 @@ def clear_disk(cls, path: str | os.PathLike[str] | None = None) -> None: ) shutil.rmtree(target) - @property - def enabled(self) -> bool: - """Whether the in-memory tier accepts writes (``max_bytes > 0``).""" - return self._max_bytes > 0 - - @property - def max_bytes(self) -> int: - return self._max_bytes - - @max_bytes.setter - def max_bytes(self, value: int) -> None: - """Reconfigure the in-memory byte cap and immediately evict any excess. - - Used by ``ChannelsAPIAsync.configure_data_cache`` to retune a live - cache. Lowering the cap below ``total_bytes`` triggers LRU eviction - in the same loop ``put`` uses, so the invariant ``total_bytes <= - max_bytes`` is restored before the setter returns. Does not touch - the disk tier. 
- """ - if value < 0: - raise ValueError(f"data_cache_max_bytes must be >= 0, got {value}") - self._max_bytes = value - self._evict_until_under_bound() - - @property - def total_bytes(self) -> int: - return self._total_bytes - @property def disk_enabled(self) -> bool: - """Whether the disk-backed second-chance tier is currently open.""" + """Whether the disk-backed store is currently open.""" return self._disk is not None @property def disk_path(self) -> str | None: - """Filesystem path of the disk tier when enabled, else ``None``.""" + """Filesystem path of the cache when enabled, else ``None``.""" return self._disk_path @property def disk_max_bytes(self) -> int | None: - """Configured byte cap on the disk tier, or ``None`` when disabled.""" + """Configured byte cap on disk usage, or ``None`` when disabled.""" return self._disk_max_bytes - def __len__(self) -> int: - return len(self._entries) - def __contains__(self, channel_id: str) -> bool: - """True if the channel is cached in memory OR on disk. + """True if the channel is cached on disk. Used by ``_filter_cached_channels`` to decide whether ``get_data`` - needs to hit the wire. Including the disk tier here lets a fresh - session served by a warm disk avoid re-fetching. + needs to hit the wire. A warm disk lets a fresh session avoid + re-fetching previously-served windows. """ - if channel_id in self._entries: - return True - if self._disk is not None and channel_id in self._disk: - return True - return False + if self._disk is None: + return False + return channel_id in self._disk def enable_disk( self, @@ -236,19 +194,19 @@ def enable_disk( path: str | os.PathLike[str] | None = None, max_bytes: int | None = None, ) -> None: - """Enable (or reconfigure) the disk-backed second-chance tier. + """Enable (or reconfigure) the disk-backed cache. - If a previous disk tier was open at a different path or with a - different size cap, it's closed first. 
Memory contents are left - intact; they are NOT replayed to disk so disk reflects only future - writes. + If a previous disk handle was open at a different path or with a + different size cap, it's closed first. Entries from the previous + location are NOT migrated — the new path serves only what it already + holds plus future writes. Args: path: Directory to persist to. ``None`` uses :attr:`DEFAULT_DISK_PATH`. The directory is created if missing; an existing one is opened in place and its contents become available to ``get``. - max_bytes: Byte cap for the disk tier (``None`` → + max_bytes: Byte cap on disk usage (``None`` → :attr:`DEFAULT_DISK_MAX_BYTES`). """ target_path = str(path) if path is not None else self.DEFAULT_DISK_PATH @@ -263,24 +221,15 @@ def enable_disk( self._open_disk(target_path, target_max) def disable_disk(self) -> None: - """Close the disk tier (if open). Does not touch the disk contents. + """Close the disk handle (if open). Does not touch the disk contents. - Use ``sift_client.clear_data_cache_on_disk(path)`` to remove a + Use ``client.channels.clear_data_cache_on_disk(path)`` to remove a directory from disk. """ self._close_disk() def get(self, channel_id: str) -> ChannelCacheEntry | None: - """Return the entry for ``channel_id`` if cached, otherwise None. - - Memory is consulted first; on a miss, the disk tier (if enabled) is - checked. A disk hit is promoted back into memory (subject to the - in-memory cap) so subsequent accesses stay hot. - """ - entry = self._entries.get(channel_id) - if entry is not None: - self._entries.move_to_end(channel_id) - return entry + """Return the entry for ``channel_id`` if cached, otherwise None.""" if self._disk is None: return None try: @@ -288,8 +237,8 @@ def get(self, channel_id: str) -> ChannelCacheEntry | None: except Exception: # diskcache surfaces ``sqlite3.DatabaseError`` (and friends) for # corrupt or partially-written entries from a prior session.
- # Treat as a miss; force ``invalidate`` to drop the bad row so - # we don't repeatedly trip the same path. + # Treat as a miss; force-drop the bad row so we don't repeatedly + # trip the same path. logger.warning("disk cache read failed for %s; invalidating", channel_id) try: del self._disk[channel_id] @@ -298,61 +247,55 @@ def get(self, channel_id: str) -> ChannelCacheEntry | None: return None if disk_entry is None or not isinstance(disk_entry, ChannelCacheEntry): return None - if self.enabled: - # Promote disk hit into memory so subsequent reads are cheap. - self._put_memory(channel_id, disk_entry) return disk_entry def put(self, channel_id: str, entry: ChannelCacheEntry) -> None: - """Insert or replace ``channel_id`` in memory (if enabled) and on disk. + """Insert or replace ``channel_id`` on disk. - Memory reclaims any prior entry's byte count BEFORE adding the new - one's, so a re-insert (e.g. concat-merge of fresh data into an - existing entry) accounts for the size delta correctly. Disk writes - replace the prior row. + No-op when the disk tier is disabled. Entries larger than + ``disk_max_bytes`` are skipped (with a one-shot warning per + channel) instead of being inserted, since diskcache's eviction + loop would otherwise drain every other row trying — and failing — + to fit them. """ - if self.enabled: - self._put_memory(channel_id, entry) - if self._disk is not None: - if self._disk_max_bytes is not None and entry.size_bytes > self._disk_max_bytes: - if channel_id not in self._oversized_disk_warned: - logger.warning( - "Channel %s data (%d bytes) is larger than the disk " - "cache cap (%d bytes); skipping disk cache for this " - "channel so other entries aren't evicted. 
Raise the " - "cap via ``client.channels.enable_data_cache_disk(" - "max_bytes=...)`` to cache this channel on disk.", - channel_id, - entry.size_bytes, - self._disk_max_bytes, - ) - self._oversized_disk_warned.add(channel_id) - try: - self._disk.delete(channel_id, retry=True) - except Exception: - pass - return + if self._disk is None: + return + if self._disk_max_bytes is not None and entry.size_bytes > self._disk_max_bytes: + if channel_id not in self._oversized_disk_warned: + logger.warning( + "Channel %s data (%d bytes) is larger than the disk " + "cache cap (%d bytes); skipping disk cache for this " + "channel so other entries aren't evicted. Raise the " + "cap via ``client.channels.enable_data_cache_disk(" + "max_bytes=...)`` to cache this channel on disk.", + channel_id, + entry.size_bytes, + self._disk_max_bytes, + ) + self._oversized_disk_warned.add(channel_id) try: - self._disk.set(channel_id, entry, retry=True) - self._oversized_disk_warned.discard(channel_id) + self._disk.delete(channel_id, retry=True) except Exception: - # Best-effort persistence: keep going on disk errors so the - # in-memory cache (and the user's ``get_data`` call) still - # succeeds. Drop the (possibly partial) disk row. - logger.warning("disk cache write failed for %s; invalidating", channel_id) - try: - self._disk.delete(channel_id, retry=True) - except Exception: - pass + pass + return + try: + self._disk.set(channel_id, entry, retry=True) + self._oversized_disk_warned.discard(channel_id) + except Exception: + # Best-effort persistence: keep going on disk errors so the + # user's ``get_data`` call still succeeds. Drop the (possibly + # partial) disk row. 
+ logger.warning("disk cache write failed for %s; invalidating", channel_id) + try: + self._disk.delete(channel_id, retry=True) + except Exception: + pass def invalidate(self, channel_id: str) -> None: - prior = self._entries.pop(channel_id, None) - if prior is not None: - self._total_bytes -= prior.size_bytes + """Remove ``channel_id`` from the cache. Safe to call when absent.""" # Invalidation is a fresh start for this channel; if it was warned # about as oversized previously, the next put should re-evaluate # against the current cap and re-warn if still too big. - self._oversized_memory_warned.discard(channel_id) self._oversized_disk_warned.discard(channel_id) if self._disk is not None: try: @@ -361,55 +304,19 @@ def invalidate(self, channel_id: str) -> None: pass def clear(self) -> None: - self._entries.clear() - self._total_bytes = 0 - self._oversized_memory_warned.clear() + """Wipe all entries from disk. The directory itself remains.""" self._oversized_disk_warned.clear() if self._disk is not None: self._disk.clear() def close(self) -> None: - """Release the disk-tier file handle. Safe to call without disk enabled.""" + """Release the disk file handle. Safe to call without disk enabled.""" self._close_disk() - def _put_memory(self, channel_id: str, entry: ChannelCacheEntry) -> None: - """Memory-tier insert + eviction. Caller has already gated on ``enabled``.""" - prior = self._entries.pop(channel_id, None) - if prior is not None: - self._total_bytes -= prior.size_bytes - if entry.size_bytes > self._max_bytes: - if channel_id not in self._oversized_memory_warned: - logger.warning( - "Channel %s data (%d bytes) is larger than the in-memory " - "cache cap (%d bytes); skipping cache for this channel so " - "other entries aren't evicted. 
Raise the cap via " - "``client.channels.configure_data_cache(max_bytes=...)`` " - "to cache this channel.", - channel_id, - entry.size_bytes, - self._max_bytes, - ) - self._oversized_memory_warned.add(channel_id) - return - self._oversized_memory_warned.discard(channel_id) - self._entries[channel_id] = entry - self._total_bytes += entry.size_bytes - self._evict_until_under_bound() - - def _evict_until_under_bound(self) -> None: - # ``popitem(last=False)`` drops the oldest entry. A single fresh entry - # whose ``size_bytes`` alone exceeds ``max_bytes`` ends up evicted on - # the final iteration. - while self._entries and self._total_bytes > self._max_bytes: - _, dropped = self._entries.popitem(last=False) - self._total_bytes -= dropped.size_bytes - def _open_disk(self, path: str, max_bytes: int) -> None: import diskcache os.makedirs(path, exist_ok=True) - # ``least-recently-used`` matches the in-memory tier's eviction policy; - # statistics/tag_index are off because we only need plain k/v reads. self._disk = diskcache.Cache( directory=path, size_limit=max_bytes, @@ -442,7 +349,6 @@ def __init__( self, grpc_client: GrpcClient, *, - data_cache_max_bytes: int = DEFAULT_DATA_CACHE_MAX_BYTES, disk_cache_path: str | os.PathLike[str] | None = None, disk_cache_max_bytes: int | None = None, ): @@ -450,17 +356,14 @@ def __init__( Args: grpc_client: The gRPC client to use for making API calls. - data_cache_max_bytes: Cap on the in-memory channel-data cache (bytes). - Set to ``0`` to disable in-memory caching. See ``ChannelCache``. - disk_cache_path: Directory for the disk-backed second-chance tier. - ``None`` disables disk persistence. See ``ChannelCache``. - disk_cache_max_bytes: Byte cap for the disk tier. ``None`` uses - ``DEFAULT_DISK_CACHE_MAX_BYTES``. Ignored when + disk_cache_path: Directory for the disk-backed channel-data cache. + ``None`` disables caching entirely. See ``ChannelCache``. + disk_cache_max_bytes: Byte cap for disk usage. 
``None`` uses + ``ChannelCache.DEFAULT_DISK_MAX_BYTES``. Ignored when ``disk_cache_path`` is ``None``. """ super().__init__(grpc_client) self.channel_cache = ChannelCache( - max_bytes=data_cache_max_bytes, disk_path=disk_cache_path, disk_max_bytes=disk_cache_max_bytes, ) diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py index d06307a71..4fc094440 100644 --- a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py +++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py @@ -1,11 +1,10 @@ """Tests for :mod:`sift_client._internal.low_level_wrappers.data`. -Five classes, narrowest scope first: +Four classes, narrowest scope first: -* :class:`TestChannelCache` — pure ``ChannelCache`` unit tests (byte - accounting, LRU promotion, eviction). -* :class:`TestChannelCacheDisk` — disk-backed second-chance tier - (fresh open, cross-session reload, fall-through reads, disable). +* :class:`TestChannelCache` — disk-backed :class:`ChannelCache` unit tests + (fresh open, cross-session reload, invalidate/clear, oversized guards, + disable/reconfigure). * :class:`TestChannelCacheClearDisk` — ``ChannelCache.clear_disk`` classmethod (default path, custom path, safety guard). * :class:`TestMergePages` — ``DataLowLevelClient._merge_pages``, the @@ -32,7 +31,6 @@ import pytest from sift_client._internal.low_level_wrappers.data import ( - DEFAULT_DATA_CACHE_MAX_BYTES, ChannelCache, ChannelCacheEntry, DataLowLevelClient, @@ -98,9 +96,15 @@ def _channel(cid: str) -> Channel: ) -def _invariant_holds(cache: ChannelCache) -> bool: - """``total_bytes`` must equal the sum of per-entry sizes at all times.""" - return cache.total_bytes == sum(e.size_bytes for e in cache._entries.values()) +def _client_with_cache(tmp_path, subdir: str = "cache") -> DataLowLevelClient: + """Build a ``DataLowLevelClient`` whose ``ChannelCache`` points at ``tmp_path``. 
+ + Tests that exercise cache behaviour (hits/misses/eviction) need an + actual disk-backed cache, so ``disk_cache_path`` must be supplied. A + plain ``DataLowLevelClient(MagicMock())`` defaults to no-cache mode + and would silently turn every cache test into a wire-path test. + """ + return DataLowLevelClient(MagicMock(), disk_cache_path=tmp_path / subdir) @contextmanager @@ -198,266 +202,55 @@ async def fake_impl( class TestChannelCache: - """Byte accounting, LRU promotion, eviction.""" - - def test_put_get_roundtrip_and_size_replacement(self) -> None: - """First put records size; second put on same key replaces it. - - Without size reclamation on the second put, ``total_bytes`` would - double-count and trip the eviction loop on the next insert. - """ - cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) - small, big = _entry(rows=10), _entry(rows=1000) - cache.put("c1", small) - assert cache.get("c1") is small - assert cache.total_bytes == small.size_bytes - cache.put("c1", big) - assert cache.get("c1") is big - assert cache.total_bytes == big.size_bytes # not small + big - assert _invariant_holds(cache) - - def test_invalidate(self) -> None: - """Removes a present entry and decrements bytes; no-op for missing keys.""" - cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) - cache.invalidate("never_added") # safe before any puts - assert cache.total_bytes == 0 - cache.put("c1", _entry(rows=10)) - cache.invalidate("c1") - assert cache.get("c1") is None - assert cache.total_bytes == 0 - assert _invariant_holds(cache) - - def test_clear(self) -> None: - cache = ChannelCache(max_bytes=DEFAULT_DATA_CACHE_MAX_BYTES) - cache.put("c1", _entry(rows=10)) - cache.put("c2", _entry(rows=20)) - cache.clear() - assert cache.total_bytes == 0 - assert len(cache) == 0 - assert _invariant_holds(cache) - - def test_oldest_entry_evicted_first(self) -> None: - """Insertion order determines eviction when only puts have happened.""" - a, b, c = _entry(rows=50), 
_entry(rows=50), _entry(rows=50) - cache = ChannelCache(max_bytes=a.size_bytes + b.size_bytes) # room for two - cache.put("a", a) - cache.put("b", b) - cache.put("c", c) # evicts "a" - assert "a" not in cache - assert "b" in cache - assert "c" in cache - assert cache.total_bytes <= a.size_bytes + b.size_bytes - assert _invariant_holds(cache) - - def test_get_promotes_to_most_recent(self) -> None: - """Reading an entry must protect it from the next eviction.""" - a, b, c = _entry(rows=50), _entry(rows=50), _entry(rows=50) - cache = ChannelCache(max_bytes=a.size_bytes + b.size_bytes) - cache.put("a", a) - cache.put("b", b) - assert cache.get("a") is a # promote a - cache.put("c", c) # b is now oldest, gets evicted - assert "a" in cache - assert "b" not in cache - assert "c" in cache - assert _invariant_holds(cache) - - def test_oversized_entry_skips_cache_preserves_neighbours(self) -> None: - """A single entry larger than the cap is rejected without evicting peers. - - Before this guard, ``_put_memory`` would insert the oversized entry, - then loop popping LRU until the cap was satisfied — but since no - amount of eviction makes an oversized entry fit, the loop drained - every other entry *and* the oversized one, wiping the cache on every - fetch of that channel. The fix: detect the oversized case up front, - warn, and skip the insert. - """ - small_a, small_b, oversized = _entry(rows=10), _entry(rows=10), _entry(rows=10_000) - cache = ChannelCache(max_bytes=small_a.size_bytes + small_b.size_bytes) - cache.put("a", small_a) - cache.put("b", small_b) - with _capture_data_warnings() as records: - cache.put("huge", oversized) - assert "huge" not in cache - # Critical: the previously cached entries survive. - assert "a" in cache - assert "b" in cache - assert cache.total_bytes == small_a.size_bytes + small_b.size_bytes - assert _invariant_holds(cache) - # User gets a clear, actionable warning. 
- assert any("larger than the in-memory cache cap" in r.getMessage() for r in records) - - def test_oversized_put_drops_prior_entry(self) -> None: - """An oversized re-insert must drop the prior slice, not silently keep it. - - Otherwise a stale subrange would masquerade as a hit on the next - ``get`` even though the caller's intent was to refresh the entry. - """ - small, oversized = _entry(rows=10), _entry(rows=10_000) - cache = ChannelCache(max_bytes=small.size_bytes) - cache.put("chan", small) - assert "chan" in cache - cache.put("chan", oversized) - assert "chan" not in cache - assert cache.total_bytes == 0 - assert _invariant_holds(cache) - - def test_oversized_put_warns_once_per_channel(self) -> None: - """Repeated oversized puts for the same channel log once, not on every call. - - Without dedup, every ``get_data`` for an oversized channel would - write a fresh WARNING line — quickly drowning out other signal in - the logs. - """ - oversized = _entry(rows=10_000) - cache = ChannelCache(max_bytes=oversized.size_bytes // 4) - with _capture_data_warnings() as records: - for _ in range(5): - cache.put("chan", oversized) - warnings = [r for r in records if "larger than the in-memory cache cap" in r.getMessage()] - assert len(warnings) == 1 - - def test_oversized_warning_resets_after_normal_put(self) -> None: - """A successful normal-sized put clears the dedup bit. - - Used by callers who narrow a time window after seeing the warning: - the next oversized regression should re-warn rather than stay silent. 
- """ - oversized = _entry(rows=10_000) - small = _entry(rows=10) - cache = ChannelCache(max_bytes=small.size_bytes * 2) - with _capture_data_warnings() as records: - cache.put("chan", oversized) # 1st warning - cache.put("chan", small) # resets state - cache.put("chan", oversized) # 2nd warning - warnings = [r for r in records if "larger than the in-memory cache cap" in r.getMessage()] - assert len(warnings) == 2 - - def test_invalidate_resets_oversized_warning(self) -> None: - """``invalidate`` is a fresh start; the next oversized put re-warns.""" - oversized = _entry(rows=10_000) - cache = ChannelCache(max_bytes=oversized.size_bytes // 4) - with _capture_data_warnings() as records: - cache.put("chan", oversized) - cache.invalidate("chan") - cache.put("chan", oversized) - warnings = [r for r in records if "larger than the in-memory cache cap" in r.getMessage()] - assert len(warnings) == 2 - - def test_clear_resets_oversized_warning(self) -> None: - """``clear`` resets all dedup state across channels.""" - oversized = _entry(rows=10_000) - cache = ChannelCache(max_bytes=oversized.size_bytes // 4) - with _capture_data_warnings() as records: - cache.put("chan-a", oversized) - cache.put("chan-b", oversized) - cache.clear() - cache.put("chan-a", oversized) - cache.put("chan-b", oversized) - warnings = [r for r in records if "larger than the in-memory cache cap" in r.getMessage()] - assert len(warnings) == 4 - - def test_max_bytes_zero_disables_cache(self) -> None: - cache = ChannelCache(max_bytes=0) - cache.put("c1", _entry(rows=100)) - assert not cache.enabled - assert cache.get("c1") is None - assert cache.total_bytes == 0 - assert len(cache) == 0 - - def test_negative_max_bytes_raises(self) -> None: - with pytest.raises(ValueError, match="data_cache_max_bytes"): - ChannelCache(max_bytes=-1) - - def test_set_max_bytes_lower_evicts_immediately(self) -> None: - """Lowering ``max_bytes`` below ``total_bytes`` evicts LRU until it fits. 
- - Used by ``ChannelsAPIAsync.configure_data_cache`` to retune a live - cache without forcing the caller to call ``clear()`` first. - """ - a, b, c = _entry(rows=50), _entry(rows=50), _entry(rows=50) - cache = ChannelCache(max_bytes=a.size_bytes + b.size_bytes + c.size_bytes) - cache.put("a", a) - cache.put("b", b) - cache.put("c", c) - # Lower the cap to fit only one entry; LRU "a" and "b" must drop. - cache.max_bytes = c.size_bytes - assert cache.max_bytes == c.size_bytes - assert "a" not in cache - assert "b" not in cache - assert "c" in cache - assert _invariant_holds(cache) - - def test_set_max_bytes_negative_raises(self) -> None: - cache = ChannelCache(max_bytes=100) - with pytest.raises(ValueError, match="data_cache_max_bytes"): - cache.max_bytes = -1 - - def test_repeated_concat_updates_stay_under_bound(self) -> None: - """Simulates the customer's sliding-window pull: same channel, growing. - - Without size reclamation on update, ``total_bytes`` would creep - above the cap silently. We re-build the entry each iteration to - mimic the ``_update_cache`` concat path. - """ - cap = 1_000_000 # ~1 MB - cache = ChannelCache(max_bytes=cap) - accumulated = pd.DataFrame() - for i in range(50): - chunk = _frame(rows=1000, start=_NOW + timedelta(seconds=i), freq="us") - accumulated = pd.concat([accumulated, chunk]) - cache.put( - "c1", - _new_cache_entry( - data=accumulated, - start_time=accumulated.index[0].to_pydatetime(), - end_time=accumulated.index[-1].to_pydatetime(), - ), - ) - assert cache.total_bytes <= cap, ( - f"iteration {i}: total_bytes={cache.total_bytes} exceeded cap={cap}" - ) - assert _invariant_holds(cache) - - -class TestChannelCacheDisk: - """Disk-backed second-chance tier of :class:`ChannelCache`. - - Three things must hold across these tests: - - 1. A fresh disk directory starts empty and accepts new writes. - 2. Closing a populated cache and reopening at the same path surfaces - the previous entries on read (the "previous session" requirement). 
- 3. The two tiers stay consistent across ``invalidate``/``clear`` and - ``disable_disk``, so the disk tier never becomes a stale shadow of - memory. - - All tests confine writes to ``tmp_path`` so nothing leaks into the real - ``/tmp/sift-channel-data-cache``. + """Disk-backed :class:`ChannelCache` behaviour. + + Five invariants must hold across these tests: + + 1. Constructing without a ``disk_path`` yields a no-op cache (every + operation is silent; ``__contains__`` returns ``False``). + 2. A fresh disk directory starts empty and accepts new writes. + 3. Closing a populated cache and reopening at the same path surfaces + the previous entries on read (the "previous session" requirement + that powers cold-start reuse). + 4. Oversized entries are skipped with a deduped warning rather than + being inserted and triggering an eviction storm. + 5. ``invalidate``/``clear`` reset the oversized-warning dedup state + so a future regression re-warns. + + All tests confine writes to ``tmp_path`` so nothing leaks into the + real ``/tmp/sift-channel-data-cache``. """ - def test_disabled_by_default(self) -> None: - """No ``disk_path`` → disk tier stays off and untouched.""" - cache = ChannelCache(max_bytes=10_000_000) + def test_disabled_when_no_path(self) -> None: + """``ChannelCache()`` with no ``disk_path`` is a silent no-op.""" + cache = ChannelCache() assert cache.disk_enabled is False assert cache.disk_path is None assert cache.disk_max_bytes is None + # Operations don't raise; the cache just stays empty. 
+ cache.put("chan-1", _entry(rows=4)) + assert "chan-1" not in cache + assert cache.get("chan-1") is None + cache.invalidate("chan-1") + cache.clear() + cache.close() def test_fresh_cache_writes_and_reads(self, tmp_path) -> None: """A fresh disk directory accepts writes and serves them back.""" path = tmp_path / "fresh" - cache = ChannelCache(max_bytes=10_000_000, disk_path=path) + cache = ChannelCache(disk_path=path) try: assert cache.disk_enabled assert cache.disk_path == str(path) assert cache.disk_max_bytes == ChannelCache.DEFAULT_DISK_MAX_BYTES entry = _entry(rows=8) cache.put("chan-1", entry) - # Same instance: memory hit takes precedence; disk is just a copy. assert "chan-1" in cache got = cache.get("chan-1") assert got is not None pd.testing.assert_frame_equal(got.data, entry.data) + assert got.start_time == entry.start_time + assert got.end_time == entry.end_time finally: cache.close() @@ -465,8 +258,8 @@ def test_reopen_existing_dir_sees_prior_session_entries(self, tmp_path) -> None: """Closing then reopening at the same path makes prior entries hit. This is the "look for existing caches from previous sessions" - guarantee: a new ``ChannelCache`` with an empty in-memory tier - finds entries on disk and promotes them into memory on first read. + guarantee: a new ``ChannelCache`` at a populated directory finds + entries on disk and returns them on the next read. """ path = tmp_path / "prev-session" df = _frame("chan-1", rows=12, freq="s") @@ -476,101 +269,122 @@ def test_reopen_existing_dir_sees_prior_session_entries(self, tmp_path) -> None: end_time=df.index[-1].to_pydatetime(), ) # Session 1: populate and close. - session1 = ChannelCache(max_bytes=10_000_000, disk_path=path) + session1 = ChannelCache(disk_path=path) session1.put("chan-1", original_entry) session1.close() - # Session 2: fresh process simulated by a brand-new ChannelCache. - # Memory starts empty, but ``__contains__`` reports the entry from - # disk and ``get`` returns it with bytes intact. 
- session2 = ChannelCache(max_bytes=10_000_000, disk_path=path) + # Session 2: fresh process simulated by a brand-new ChannelCache + # at the same directory. + session2 = ChannelCache(disk_path=path) try: - assert len(session2) == 0 # in-memory tier starts cold - assert "chan-1" in session2 # disk-backed contains + assert "chan-1" in session2 got = session2.get("chan-1") assert got is not None pd.testing.assert_frame_equal(got.data, original_entry.data) assert got.start_time == original_entry.start_time assert got.end_time == original_entry.end_time - # After the disk hit, the entry is now promoted into memory. - assert len(session2) == 1 finally: session2.close() - def test_disk_hit_promotes_into_memory(self, tmp_path) -> None: - """A disk-only entry becomes a memory entry after one ``get``.""" - cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "promote") + def test_repeated_put_overwrites(self, tmp_path) -> None: + """A second ``put`` on the same key replaces the prior entry.""" + cache = ChannelCache(disk_path=tmp_path / "overwrite") try: - cache.put("chan-1", _entry(rows=4)) - # Drop from memory only (simulate eviction). - del cache._entries["chan-1"] - cache._total_bytes = 0 - assert "chan-1" in cache # still on disk - assert cache.get("chan-1") is not None - assert "chan-1" in cache._entries # promoted back into memory + small = _entry(rows=10) + bigger = _entry(rows=100) + cache.put("chan", small) + cache.put("chan", bigger) + got = cache.get("chan") + assert got is not None + pd.testing.assert_frame_equal(got.data, bigger.data) finally: cache.close() - def test_disk_only_when_memory_disabled(self, tmp_path) -> None: - """``max_bytes=0`` (no memory) still routes writes/reads through disk. - - Cold-storage configuration: caller wants persistence without - paying the in-memory footprint. 
- """ - cache = ChannelCache(max_bytes=0, disk_path=tmp_path / "disk-only") + def test_invalidate_removes_entry(self, tmp_path) -> None: + """``invalidate`` drops the entry; safe to call when absent.""" + cache = ChannelCache(disk_path=tmp_path / "inval") try: - assert not cache.enabled - assert cache.disk_enabled + cache.invalidate("never_added") # safe before any puts cache.put("chan-1", _entry(rows=4)) - assert "chan-1" not in cache._entries # never landed in memory - got = cache.get("chan-1") - assert got is not None - assert "chan-1" not in cache._entries # memory still bypassed + cache.invalidate("chan-1") + assert "chan-1" not in cache + assert cache.get("chan-1") is None finally: cache.close() - def test_invalidate_clears_both_tiers(self, tmp_path) -> None: - cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "inval") + def test_clear_wipes_disk(self, tmp_path) -> None: + cache = ChannelCache(disk_path=tmp_path / "clear") try: cache.put("chan-1", _entry(rows=4)) - cache.invalidate("chan-1") - assert "chan-1" not in cache._entries - assert "chan-1" not in cache # contains() must check disk too + cache.put("chan-2", _entry(rows=4)) + cache.clear() + assert "chan-1" not in cache + assert "chan-2" not in cache finally: cache.close() - def test_clear_wipes_both_tiers(self, tmp_path) -> None: - cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "clear") + def test_disable_disk_closes_handle(self, tmp_path) -> None: + """Turning off disk closes the handle and silences subsequent ops.""" + cache = ChannelCache(disk_path=tmp_path / "disable") try: cache.put("chan-1", _entry(rows=4)) + cache.disable_disk() + assert not cache.disk_enabled + assert cache.disk_path is None + assert "chan-1" not in cache # no handle → no hits + assert cache.get("chan-1") is None + # Subsequent puts are silently dropped. 
cache.put("chan-2", _entry(rows=4)) - cache.clear() - assert len(cache) == 0 - assert "chan-1" not in cache assert "chan-2" not in cache finally: cache.close() - def test_oversized_entry_skips_disk_preserves_other_entries(self, tmp_path) -> None: - """An entry larger than the disk cap is skipped on disk too. + def test_enable_disk_reconfigures_path(self, tmp_path) -> None: + """Reconfiguring to a different path closes the old handle. + + The new directory starts empty: ``chan-1`` lived in the old + directory's diskcache, so the lookup at the new path misses. + """ + cache = ChannelCache(disk_path=tmp_path / "a") + try: + cache.put("chan-1", _entry(rows=4)) + cache.enable_disk(path=tmp_path / "b") + assert cache.disk_path == str(tmp_path / "b") + assert "chan-1" not in cache # fresh directory + finally: + cache.close() + + def test_enable_disk_noop_when_same_settings(self, tmp_path) -> None: + """Re-enabling with identical settings doesn't churn the disk handle.""" + cache = ChannelCache(disk_path=tmp_path / "noop") + try: + handle_before = cache._disk + cache.enable_disk(path=tmp_path / "noop", max_bytes=ChannelCache.DEFAULT_DISK_MAX_BYTES) + assert cache._disk is handle_before + finally: + cache.close() + + def test_oversized_entry_skips_cache_preserves_neighbours(self, tmp_path) -> None: + """An entry larger than the cap is skipped without evicting peers. - Without the guard, ``diskcache``'s cull() would evict every other - on-disk row trying to fit an unfittable entry, then drop the entry - itself — the same wipe-everything failure mode as the memory tier. + Without this guard, ``diskcache``'s cull would evict every other + row trying to fit an unfittable entry, then drop the entry itself + — the wipe-everything failure mode the bounded-cache work + originally fixed. The disk-tier guard mirrors that fix. - Memory is sized to accept small entries but reject the oversized one
- too, so memory-tier writes don't compete with disk-tier writes. We - assert on the disk ``_disk`` mapping directly because that's where the - contested behavior lives. + There is no in-memory tier in front of the disk store, so the + assertions use the public ``in cache`` membership check, which + consults disk directly — that is where the contested behavior + lives. + + ``disk_max_bytes`` has to leave room for ``diskcache``'s pickle + envelope around each small entry (a few KB) AND be small enough + that the oversized entry trips the guard. Half the oversized + DataFrame's raw byte size hits both constraints comfortably. """ small = _entry(rows=4) oversized = _entry(rows=10_000) - # ``disk_max_bytes`` has to leave room for ``diskcache``'s pickle - # envelope around each small entry (a few KB) AND be small enough - # that the oversized entry trips the guard. Half the oversized - # DataFrame's raw byte size hits both constraints comfortably. cache = ChannelCache( - max_bytes=oversized.size_bytes * 2, disk_path=tmp_path / "disk-oversize", disk_max_bytes=oversized.size_bytes // 2, ) @@ -580,54 +394,110 @@ def test_oversized_entry_skips_disk_preserves_other_entries(self, tmp_path) -> N assert cache._disk is not None with _capture_data_warnings() as records: cache.put("huge", oversized) - # Disk-side prior entries survive; oversized one was not written. - assert "small-1" in cache._disk - assert "small-2" in cache._disk - assert "huge" not in cache._disk + # Prior entries survive; oversized one was not written. + assert "small-1" in cache + assert "small-2" in cache + assert "huge" not in cache assert any("larger than the disk cache cap" in r.getMessage() for r in records) finally: cache.close() - def test_disable_disk_preserves_memory(self, tmp_path) -> None: - """Turning off disk closes the handle but keeps memory intact.""" - cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "disable") + def test_oversized_put_drops_prior_entry(self, tmp_path) -> None: + """An oversized re-insert must drop the prior slice, not silently keep it.
+ + Otherwise a stale subrange would masquerade as a hit on the next + ``get`` even though the caller's intent was to refresh the entry. + """ + small = _entry(rows=4) + oversized = _entry(rows=10_000) + cache = ChannelCache( + disk_path=tmp_path / "drop-prior", + disk_max_bytes=oversized.size_bytes // 2, + ) try: - cache.put("chan-1", _entry(rows=4)) - cache.disable_disk() - assert not cache.disk_enabled - assert cache.disk_path is None - # Memory entry survives the disk-tier teardown. - assert "chan-1" in cache - assert cache.get("chan-1") is not None + cache.put("chan", small) + assert "chan" in cache + cache.put("chan", oversized) + assert "chan" not in cache finally: cache.close() - def test_enable_disk_reconfigures_path(self, tmp_path) -> None: - """Reconfiguring to a different path closes the old handle.""" - cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "a") + def test_oversized_put_warns_once_per_channel(self, tmp_path) -> None: + """Repeated oversized puts for the same channel log once, not on every call. + + Without dedup, every ``get_data`` for an oversized channel would + write a fresh WARNING line — quickly drowning out other signal in + the logs. + """ + oversized = _entry(rows=10_000) + cache = ChannelCache( + disk_path=tmp_path / "dedup", + disk_max_bytes=oversized.size_bytes // 2, + ) try: - cache.put("chan-1", _entry(rows=4)) - cache.enable_disk(path=tmp_path / "b") - assert cache.disk_path == str(tmp_path / "b") - # The new disk dir is fresh: nothing on disk yet under the new path. - # ``chan-1`` is still in memory, so __contains__ is still True. - assert "chan-1" in cache - # But the new disk dir is empty; drop from memory and the - # contains check now relies on disk, which won't find it. 
- del cache._entries["chan-1"] - cache._total_bytes = 0 - assert "chan-1" not in cache + with _capture_data_warnings() as records: + for _ in range(5): + cache.put("chan", oversized) + warnings = [r for r in records if "larger than the disk cache cap" in r.getMessage()] + assert len(warnings) == 1 finally: cache.close() - def test_enable_disk_noop_when_same_settings(self, tmp_path) -> None: - """Re-enabling with identical settings doesn't churn the disk handle.""" - cache = ChannelCache(max_bytes=10_000_000, disk_path=tmp_path / "noop") + def test_oversized_warning_resets_after_normal_put(self, tmp_path) -> None: + """A successful normal-sized put clears the dedup bit. + + Used by callers who narrow a time window after seeing the warning: + the next oversized regression should re-warn rather than stay silent. + """ + small = _entry(rows=4) + oversized = _entry(rows=10_000) + cache = ChannelCache( + disk_path=tmp_path / "reset-after-normal", + disk_max_bytes=oversized.size_bytes // 2, + ) try: - handle_before = cache._disk - cache.enable_disk(path=tmp_path / "noop", max_bytes=ChannelCache.DEFAULT_DISK_MAX_BYTES) - # Same handle, no reopen. 
- assert cache._disk is handle_before + with _capture_data_warnings() as records: + cache.put("chan", oversized) # 1st warning + cache.put("chan", small) # resets state + cache.put("chan", oversized) # 2nd warning + warnings = [r for r in records if "larger than the disk cache cap" in r.getMessage()] + assert len(warnings) == 2 + finally: + cache.close() + + def test_invalidate_resets_oversized_warning(self, tmp_path) -> None: + """``invalidate`` is a fresh start; the next oversized put re-warns.""" + oversized = _entry(rows=10_000) + cache = ChannelCache( + disk_path=tmp_path / "reset-invalidate", + disk_max_bytes=oversized.size_bytes // 2, + ) + try: + with _capture_data_warnings() as records: + cache.put("chan", oversized) + cache.invalidate("chan") + cache.put("chan", oversized) + warnings = [r for r in records if "larger than the disk cache cap" in r.getMessage()] + assert len(warnings) == 2 + finally: + cache.close() + + def test_clear_resets_oversized_warning(self, tmp_path) -> None: + """``clear`` resets dedup state across channels.""" + oversized = _entry(rows=10_000) + cache = ChannelCache( + disk_path=tmp_path / "reset-clear", + disk_max_bytes=oversized.size_bytes // 2, + ) + try: + with _capture_data_warnings() as records: + cache.put("chan-a", oversized) + cache.put("chan-b", oversized) + cache.clear() + cache.put("chan-a", oversized) + cache.put("chan-b", oversized) + warnings = [r for r in records if "larger than the disk cache cap" in r.getMessage()] + assert len(warnings) == 4 finally: cache.close() @@ -642,7 +512,7 @@ class TestChannelCacheClearDisk: def test_clear_removes_directory(self, tmp_path) -> None: path = tmp_path / "victim" - cache = ChannelCache(max_bytes=10_000_000, disk_path=path) + cache = ChannelCache(disk_path=path) cache.put("chan-1", _entry(rows=4)) cache.close() assert path.exists() @@ -659,7 +529,6 @@ def test_clear_refuses_non_diskcache_directory(self, tmp_path) -> None: (target / "important.txt").write_text("don't delete me") 
with pytest.raises(ValueError, match="does not look like a sift channel data cache"): ChannelCache.clear_disk(target) - # Unrelated contents preserved. assert (target / "important.txt").read_text() == "don't delete me" def test_default_path_constant_under_tmp(self) -> None: @@ -779,31 +648,49 @@ class TestDataLowLevelClient: :class:`TestGetChannelData`. """ - def test_per_instance_isolation(self) -> None: - """Two clients must not share cache state. + def test_no_cache_when_disk_path_omitted(self) -> None: + """Default construction leaves the cache in no-op mode. - Regression test for the original OOM bug: ``channel_cache`` was a - class attribute, so every ``SiftClient`` in the process appended to - the same dict. Two fresh clients must have independent caches. + The ``ChannelsAPIAsync`` resource is the public surface for + opting into disk persistence; the bare ``DataLowLevelClient`` + keeps caching off so unit tests don't accidentally write to + ``/tmp`` just by instantiating the wrapper. """ - client_a = DataLowLevelClient(MagicMock()) - client_b = DataLowLevelClient(MagicMock()) - client_a.channel_cache.put("c1", _entry(rows=10)) - assert "c1" in client_a.channel_cache - assert "c1" not in client_b.channel_cache - assert client_b.channel_cache.total_bytes == 0 - - def test_data_cache_max_bytes_kwarg_propagates(self) -> None: - """``data_cache_max_bytes`` is forwarded to the underlying cache. - - The disabled-cache *behaviour* itself is covered by - :meth:`TestChannelCache.test_max_bytes_zero_disables_cache`; this - test just verifies the constructor passes the kwarg through. + client = DataLowLevelClient(MagicMock()) + assert not client.channel_cache.disk_enabled + + def test_per_instance_isolation(self, tmp_path) -> None: + """Two clients with separate disk paths must not share cache state. + + Regression test for the original OOM bug: ``channel_cache`` was a + class attribute, so every ``SiftClient`` in the process appended + to the same dict. 
Two fresh clients with distinct directories must + have independent caches. """ - assert DataLowLevelClient(MagicMock(), data_cache_max_bytes=0).channel_cache.max_bytes == 0 - assert ( - DataLowLevelClient(MagicMock(), data_cache_max_bytes=42).channel_cache.max_bytes == 42 + client_a = _client_with_cache(tmp_path, "a") + client_b = _client_with_cache(tmp_path, "b") + try: + client_a.channel_cache.put("c1", _entry(rows=10)) + assert "c1" in client_a.channel_cache + assert "c1" not in client_b.channel_cache + finally: + client_a.channel_cache.close() + client_b.channel_cache.close() + + def test_disk_cache_kwargs_propagate(self, tmp_path) -> None: + """Constructor kwargs land on the underlying ``ChannelCache``.""" + path = tmp_path / "kwargs" + client = DataLowLevelClient( + MagicMock(), + disk_cache_path=path, + disk_cache_max_bytes=8_192, ) + try: + assert client.channel_cache.disk_enabled + assert client.channel_cache.disk_path == str(path) + assert client.channel_cache.disk_max_bytes == 8_192 + finally: + client.channel_cache.close() class TestGetChannelData: @@ -850,81 +737,89 @@ async def test_multi_page_response_concatenated_per_channel(self) -> None: pd.testing.assert_frame_equal(result["c1"].sort_index(), expected.sort_index()) @pytest.mark.asyncio - async def test_cache_hit_short_circuits_grpc(self) -> None: + async def test_cache_hit_short_circuits_grpc(self, tmp_path) -> None: """Second request for the same channel + window skips ``_get_data_impl``. Stages two pages-worth of data so a faulty cache that falls through wouldn't silently pass by hitting EOF — any second-call invocation would consume the second page and bump ``len(call_log)``. 
""" - client = DataLowLevelClient(MagicMock()) + client = _client_with_cache(tmp_path) df = _frame("c1") - with _fake_grpc(client, {"c1": [df, df]}) as call_log: - first = await client.get_channel_data( - channels=[_channel("c1")], - start_time=_NOW, - end_time=_WINDOW_END, - ) - calls_after_first = len(call_log) - assert calls_after_first >= 1 - - second = await client.get_channel_data( - channels=[_channel("c1")], - start_time=_NOW, - end_time=_WINDOW_END, - ) - assert len(call_log) == calls_after_first, ( - "second call should be served from cache without invoking _get_data_impl" - ) - pd.testing.assert_frame_equal(first["c1"].sort_index(), second["c1"].sort_index()) + try: + with _fake_grpc(client, {"c1": [df, df]}) as call_log: + first = await client.get_channel_data( + channels=[_channel("c1")], + start_time=_NOW, + end_time=_WINDOW_END, + ) + calls_after_first = len(call_log) + assert calls_after_first >= 1 + + second = await client.get_channel_data( + channels=[_channel("c1")], + start_time=_NOW, + end_time=_WINDOW_END, + ) + assert len(call_log) == calls_after_first, ( + "second call should be served from cache without invoking _get_data_impl" + ) + pd.testing.assert_frame_equal(first["c1"].sort_index(), second["c1"].sort_index()) + finally: + client.channel_cache.close() @pytest.mark.asyncio - async def test_partial_cache_hit_merges_cached_and_fresh(self) -> None: + async def test_partial_cache_hit_merges_cached_and_fresh(self, tmp_path) -> None: """Cached + uncached channels resolved together in one return dict. Only the uncached channel triggers ``_get_data_impl``. 
""" - client = DataLowLevelClient(MagicMock()) + client = _client_with_cache(tmp_path) c1_df, c2_df = _frame("c1"), _frame("c2", offset=100) - with _fake_grpc(client, {"c1": [c1_df], "c2": [c2_df]}) as call_log: - await client.get_channel_data( - channels=[_channel("c1")], - start_time=_NOW, - end_time=_WINDOW_END, - ) - calls_after_warmup = len(call_log) - - result = await client.get_channel_data( - channels=[_channel("c1"), _channel("c2")], - start_time=_NOW, - end_time=_WINDOW_END, - ) - new_calls = call_log[calls_after_warmup:] - - assert new_calls, "c2 should hit the wire on the second call" - for call in new_calls: - assert call["channel_ids"] == ["c2"], f"only c2 should hit the wire, saw {call!r}" - assert set(result.keys()) == {"c1", "c2"} - pd.testing.assert_frame_equal(result["c1"].sort_index(), c1_df.sort_index()) - pd.testing.assert_frame_equal(result["c2"].sort_index(), c2_df.sort_index()) + try: + with _fake_grpc(client, {"c1": [c1_df], "c2": [c2_df]}) as call_log: + await client.get_channel_data( + channels=[_channel("c1")], + start_time=_NOW, + end_time=_WINDOW_END, + ) + calls_after_warmup = len(call_log) + + result = await client.get_channel_data( + channels=[_channel("c1"), _channel("c2")], + start_time=_NOW, + end_time=_WINDOW_END, + ) + new_calls = call_log[calls_after_warmup:] + + assert new_calls, "c2 should hit the wire on the second call" + for call in new_calls: + assert call["channel_ids"] == ["c2"], f"only c2 should hit the wire, saw {call!r}" + assert set(result.keys()) == {"c1", "c2"} + pd.testing.assert_frame_equal(result["c1"].sort_index(), c1_df.sort_index()) + pd.testing.assert_frame_equal(result["c2"].sort_index(), c2_df.sort_index()) + finally: + client.channel_cache.close() @pytest.mark.asyncio - async def test_ignore_cache_true_returns_fresh_and_skips_write(self) -> None: + async def test_ignore_cache_true_returns_fresh_and_skips_write(self, tmp_path) -> None: """``ignore_cache=True`` returns mock data and leaves the cache 
empty. End-to-end version of the latent bug that compounded the customer's OOM: pre-fix, ``_update_cache`` ran even when the caller had asked the cache to be ignored. """ - client = DataLowLevelClient(MagicMock()) + client = _client_with_cache(tmp_path) df = _frame("c1") - with _fake_grpc(client, {"c1": [df]}): - result = await client.get_channel_data( - channels=[_channel("c1")], - start_time=_NOW, - end_time=_WINDOW_END, - ignore_cache=True, - ) - pd.testing.assert_frame_equal(result["c1"], df) - assert "c1" not in client.channel_cache - assert client.channel_cache.total_bytes == 0 + try: + with _fake_grpc(client, {"c1": [df]}): + result = await client.get_channel_data( + channels=[_channel("c1")], + start_time=_NOW, + end_time=_WINDOW_END, + ignore_cache=True, + ) + pd.testing.assert_frame_equal(result["c1"], df) + assert "c1" not in client.channel_cache + finally: + client.channel_cache.close() diff --git a/python/lib/sift_client/_tests/resources/test_channels.py b/python/lib/sift_client/_tests/resources/test_channels.py index 3ed3826b1..ceee9ddef 100644 --- a/python/lib/sift_client/_tests/resources/test_channels.py +++ b/python/lib/sift_client/_tests/resources/test_channels.py @@ -503,69 +503,18 @@ async def fake_update_channel(update): assert captured["update"].unit == "" -class TestConfigureDataCache: - """``configure_data_cache`` is the resource-level knob for the in-memory - channel data cache. Before the cache is initialized, it stashes the value - for the lazy-init path; after, it retunes the live cache. - - Each test that triggers ``_ensure_data_low_level_client`` opens the - opt-out disk tier (redirected to ``tmp_path`` by the conftest fixture) - and closes the handle in ``finally`` so the diskcache lock doesn't leak - into the next test. 
- """ - - def test_before_lazy_init_propagates_to_cache(self): - """Configuring before the first ``get_data`` lands on the cache at init.""" - api = _make_api() - api.configure_data_cache(max_bytes=123) - assert api._data_low_level_client is None # still lazy - api._ensure_data_low_level_client() - try: - assert api._data_low_level_client.channel_cache.max_bytes == 123 - finally: - api._data_low_level_client.channel_cache.close() - - def test_after_lazy_init_updates_live_cache(self): - """Configuring after first use retunes the live cache in place.""" - api = _make_api() - api._ensure_data_low_level_client() - try: - original_client = api._data_low_level_client - api.configure_data_cache(max_bytes=456) - # Same wrapper instance — we mutated, not replaced. - assert api._data_low_level_client is original_client - assert api._data_low_level_client.channel_cache.max_bytes == 456 - finally: - api._data_low_level_client.channel_cache.close() - - def test_zero_disables_cache_via_resource(self): - """Resource-level ``max_bytes=0`` end-to-end disables the cache.""" - api = _make_api() - api.configure_data_cache(max_bytes=0) - api._ensure_data_low_level_client() - try: - assert not api._data_low_level_client.channel_cache.enabled - finally: - api._data_low_level_client.channel_cache.close() - - def test_negative_raises(self): - api = _make_api() - with pytest.raises(ValueError, match="max_bytes"): - api.configure_data_cache(max_bytes=-1) - - class TestEnableDataCacheDisk: """``enable_data_cache_disk`` / ``disable_data_cache_disk`` plumb the disk - tier setting to the underlying ``ChannelCache``, both pre- and post-init. + cache settings to the underlying ``ChannelCache``, both pre- and post-init. - The disk tier itself is exercised directly in - ``test_data.py::TestChannelCacheDisk``; the tests here just verify the + The cache itself is exercised directly in + ``test_data.py::TestChannelCache``; the tests here just verify the resource-level wiring around it. 
""" def test_enabled_by_default(self): - """Disk persistence is opt-out: the default-constructed resource - lands at ``ChannelCache.DEFAULT_DISK_PATH`` on first ``get_data``. + """Disk caching is opt-out: the default-constructed resource lands + at ``ChannelCache.DEFAULT_DISK_PATH`` on first ``get_data``. The autouse ``_isolate_default_disk_cache_path`` fixture in ``conftest.py`` redirects the constant to a per-test tmp dir so this @@ -652,7 +601,7 @@ def test_clear_data_cache_on_disk_proxies_to_cache(self, tmp_path): path = tmp_path / "to-clear" # Populate a real disk-cache directory so the marker check passes. - cache = ChannelCache(max_bytes=10_000_000, disk_path=path) + cache = ChannelCache(disk_path=path) cache.close() assert path.exists() @@ -660,9 +609,9 @@ def test_clear_data_cache_on_disk_proxies_to_cache(self, tmp_path): api.clear_data_cache_on_disk(path) assert not path.exists() - def test_default_path_failure_falls_back_to_memory(self, monkeypatch, tmp_path): - """If the opt-out default disk path can't be opened, the wrapper logs - a warning and continues with the in-memory cache only. + def test_default_path_failure_falls_back_to_no_cache(self, monkeypatch, tmp_path): + """If the opt-out default cache path can't be opened, the wrapper + logs a warning and continues with caching disabled. Simulated by pointing ``DEFAULT_DISK_PATH`` at a path that already exists as a regular file — ``os.makedirs(..., exist_ok=True)`` raises @@ -678,9 +627,8 @@ def test_default_path_failure_falls_back_to_memory(self, monkeypatch, tmp_path): api._ensure_data_low_level_client() # must not raise cache = api._data_low_level_client.channel_cache try: - # Disk silently dropped, memory still working. + # Cache silently dropped; ``get_data`` will go straight to the wire. 
assert not cache.disk_enabled - assert cache.enabled finally: cache.close() diff --git a/python/lib/sift_client/resources/channels.py b/python/lib/sift_client/resources/channels.py index 794930fda..6be88f84e 100644 --- a/python/lib/sift_client/resources/channels.py +++ b/python/lib/sift_client/resources/channels.py @@ -69,34 +69,8 @@ def __init__(self, sift_client: SiftClient): self._low_level_client = ChannelsLowLevelClient(grpc_client=self.client.grpc_client) self._units_low_level_client = UnitsLowLevelClient(grpc_client=self.client.grpc_client) self._data_low_level_client = None - # Caller-supplied cache size; ``None`` means "use the wrapper default - # at lazy-init time" so we don't have to import ``data.py`` (and - # therefore pandas) just to remember the default. - self._data_cache_max_bytes: int | None = None self._disk_cache_config = DiskCacheConfig(enabled=True) - def configure_data_cache(self, *, max_bytes: int) -> None: - """Configure the in-memory channel data cache used by ``get_data``. - - Args: - max_bytes: Byte cap on the cache. ``0`` disables caching - (every ``get_data`` call goes to the wire). Defaults to - 512 MiB until explicitly configured. Must be ``>= 0``. - - Safe to call before or after the first ``get_data``. If the cache is - already live, the new cap is applied immediately and least-recently- - used entries are evicted until ``total_bytes`` fits. 
- - Example: - client.channels.configure_data_cache(max_bytes=128 * 1024 * 1024) - client.channels.configure_data_cache(max_bytes=0) # disable - """ - if max_bytes < 0: - raise ValueError(f"max_bytes must be >= 0, got {max_bytes}") - self._data_cache_max_bytes = max_bytes - if self._data_low_level_client is not None: - self._data_low_level_client.channel_cache.max_bytes = max_bytes - def enable_data_cache_disk( self, *, @@ -107,28 +81,28 @@ def enable_data_cache_disk( Disk persistence is **on by default** at ``ChannelCache.DEFAULT_DISK_PATH``; use this method when you want to override the path or size, or to turn - the tier back on after a prior ``disable_data_cache_disk`` call. + the cache back on after a prior ``disable_data_cache_disk`` call. - The disk-backed tier is a second-chance layer beneath the in-memory - cache: on a memory miss, ``get_data`` checks disk before going to the - wire. The default path lives under ``tempfile.gettempdir()`` and is - shared across sessions, so a re-run of the same workload picks up - previously-cached windows without a fetch. + Each entry that ``get_data`` returns is written to the cache and read + back on subsequent calls, even after process restart. The default + path lives under ``tempfile.gettempdir()`` and is shared across + sessions, so a re-run of the same workload picks up previously-cached + windows without a fetch. Safe to call before or after the first ``get_data``. Reconfiguring - (different ``path`` or ``max_bytes``) closes the previous disk handle - and opens a new one; in-memory contents are preserved across the swap. + (different ``path`` or ``max_bytes``) closes the previous handle and + opens a new one. An explicit ``path`` that can't be opened (e.g. permission denied, read-only filesystem) raises so the caller knows the request didn't take. The default-path open does *not* raise — see - ``_ensure_data_low_level_client`` for the fall-back-to-memory path. 
+ ``_ensure_data_low_level_client`` for the silent fall-back behaviour. Args: path: Directory to persist the cache to. ``None`` (the default) uses ``ChannelCache.DEFAULT_DISK_PATH``. Existing entries at the path become available as cache hits. - max_bytes: Byte cap on the disk tier. ``None`` uses + max_bytes: Byte cap on disk usage. ``None`` uses ``ChannelCache.DEFAULT_DISK_MAX_BYTES`` (4 GiB). When the bound is reached, ``diskcache``'s LRU eviction takes over. @@ -141,13 +115,12 @@ def enable_data_cache_disk( self._data_low_level_client.channel_cache.enable_disk(path=path, max_bytes=max_bytes) def disable_data_cache_disk(self) -> None: - """Opt out of disk persistence for the channel data cache. + """Opt out of caching for ``get_data`` (no reads or writes). - Disk persistence is on by default; call this when you don't want any - cached data written to disk. Closes any open disk-cache file handle. - The on-disk directory is NOT deleted — use - :meth:`clear_data_cache_on_disk` to wipe it. In-memory entries are - preserved. + Caching is on by default; call this when you don't want any cached + data written to or read from disk. Closes any open cache file + handle. The on-disk directory is NOT deleted — use + :meth:`clear_data_cache_on_disk` to wipe it. """ self._disk_cache_config.disable() if self._data_low_level_client is not None: @@ -362,16 +335,12 @@ def _ensure_data_low_level_client(self): DataLowLevelClient, ) - # Pass each kwarg only when explicitly configured so the wrapper's - # own defaults remain the single source of truth. kwargs: dict = {} - if self._data_cache_max_bytes is not None: - kwargs["data_cache_max_bytes"] = self._data_cache_max_bytes disk_config = self._disk_cache_config if disk_config.enabled: - # ``disk_path=None`` means "disabled" to ChannelCache; substitute + # ``disk_path=None`` means "no cache" to ChannelCache; substitute # the default explicitly so the opt-out default still opens - # the disk tier. 
``DEFAULT_DISK_PATH`` is read here (not at + # the cache. ``DEFAULT_DISK_PATH`` is read here (not at # config construction) so test fixtures that monkeypatch the # class attribute see the override. kwargs["disk_cache_path"] = disk_config.path or ChannelCache.DEFAULT_DISK_PATH @@ -383,26 +352,23 @@ def _ensure_data_low_level_client(self): **kwargs, ) except Exception: - # Explicit user-supplied disk path failures propagate so the + # Explicit user-supplied path failures propagate so the # caller knows their request didn't take. Default-path failures # (read-only ``/tmp``, restricted containers, etc.) degrade - # silently to memory-only so ``get_data`` still works. + # silently to no-cache mode so ``get_data`` still works. if not disk_config.using_default_path: raise logger.warning( - "Could not open the default channel data disk cache at %r; " - "falling back to in-memory cache only. Call " + "Could not open the default channel data cache at %r; " + "falling back to no caching for ``get_data``. Call " "``client.channels.disable_data_cache_disk()`` to silence " "this warning, or pass an explicit path via " "``enable_data_cache_disk(path=...)``.", kwargs.get("disk_cache_path"), exc_info=True, ) - kwargs.pop("disk_cache_path", None) - kwargs.pop("disk_cache_max_bytes", None) self._data_low_level_client = DataLowLevelClient( grpc_client=self.client.grpc_client, - **kwargs, ) async def get_data( diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index 8e76a56ff..cc3ec914f 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -478,32 +478,13 @@ class ChannelsAPI: """ ... - def configure_data_cache(self, *, max_bytes: int) -> None: - """Configure the in-memory channel data cache used by ``get_data``. - - Args: - max_bytes: Byte cap on the cache. ``0`` disables caching - (every ``get_data`` call goes to the wire).
Defaults to - 512 MiB until explicitly configured. Must be ``>= 0``. - - Safe to call before or after the first ``get_data``. If the cache is - already live, the new cap is applied immediately and least-recently- - used entries are evicted until ``total_bytes`` fits. - - Example: - client.channels.configure_data_cache(max_bytes=128 * 1024 * 1024) - client.channels.configure_data_cache(max_bytes=0) # disable - """ - ... - def disable_data_cache_disk(self) -> None: - """Opt out of disk persistence for the channel data cache. + """Opt out of caching for ``get_data`` (no reads or writes). - Disk persistence is on by default; call this when you don't want any - cached data written to disk. Closes any open disk-cache file handle. - The on-disk directory is NOT deleted — use - :meth:`clear_data_cache_on_disk` to wipe it. In-memory entries are - preserved. + Caching is on by default; call this when you don't want any cached + data written to or read from disk. Closes any open cache file + handle. The on-disk directory is NOT deleted — use + :meth:`clear_data_cache_on_disk` to wipe it. """ ... @@ -514,28 +495,28 @@ class ChannelsAPI: Disk persistence is **on by default** at ``ChannelCache.DEFAULT_DISK_PATH``; use this method when you want to override the path or size, or to turn - the tier back on after a prior ``disable_data_cache_disk`` call. + the cache back on after a prior ``disable_data_cache_disk`` call. - The disk-backed tier is a second-chance layer beneath the in-memory - cache: on a memory miss, ``get_data`` checks disk before going to the - wire. The default path lives under ``tempfile.gettempdir()`` and is - shared across sessions, so a re-run of the same workload picks up - previously-cached windows without a fetch. + Each entry that ``get_data`` returns is written to the cache and read + back on subsequent calls, even after process restart. 
The default + path lives under ``tempfile.gettempdir()`` and is shared across + sessions, so a re-run of the same workload picks up previously-cached + windows without a fetch. Safe to call before or after the first ``get_data``. Reconfiguring - (different ``path`` or ``max_bytes``) closes the previous disk handle - and opens a new one; in-memory contents are preserved across the swap. + (different ``path`` or ``max_bytes``) closes the previous handle and + opens a new one. An explicit ``path`` that can't be opened (e.g. permission denied, read-only filesystem) raises so the caller knows the request didn't take. The default-path open does *not* raise — see - ``_ensure_data_low_level_client`` for the fall-back-to-memory path. + ``_ensure_data_low_level_client`` for the silent fall-back behaviour. Args: path: Directory to persist the cache to. ``None`` (the default) uses ``ChannelCache.DEFAULT_DISK_PATH``. Existing entries at the path become available as cache hits. - max_bytes: Byte cap on the disk tier. ``None`` uses + max_bytes: Byte cap on disk usage. ``None`` uses ``ChannelCache.DEFAULT_DISK_MAX_BYTES`` (4 GiB). When the bound is reached, ``diskcache``'s LRU eviction takes over. From babb927b696fd8c022ced4f3e588f4256d5bf8f5 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 25 Jun 2026 20:42:28 -0700 Subject: [PATCH 13/14] Split channel cache into shared DiskCache + ChannelDataCache adapter. Move the diskcache mechanics into one client-owned store (DiskCache) so every cache-aware resource shares a single byte budget and LRU. Channels wrap the store in a ChannelDataCache adapter that namespaces keys as "channel:". Cache knobs (enable_disk / disable_disk / clear_disk) move from the channels resource to a client.cache namespace. 
Co-authored-by: Cursor --- python/CHANGELOG.md | 16 +- .../sift_client/_internal/cache_namespace.py | 114 ++++ .../lib/sift_client/_internal/disk_cache.py | 336 +++++++++++ .../_internal/low_level_wrappers/data.py | 349 +++--------- .../_internal/low_level_wrappers/test_data.py | 523 +++++------------- .../_tests/_internal/test_disk_cache.py | 343 ++++++++++++ python/lib/sift_client/_tests/conftest.py | 33 +- .../_tests/resources/test_channels.py | 142 ----- .../sift_client/_tests/test_client_cache.py | 261 +++++++++ python/lib/sift_client/client.py | 67 +++ python/lib/sift_client/resources/channels.py | 134 +---- .../resources/sync_stubs/__init__.pyi | 74 --- 12 files changed, 1379 insertions(+), 1013 deletions(-) create mode 100644 python/lib/sift_client/_internal/cache_namespace.py create mode 100644 python/lib/sift_client/_internal/disk_cache.py create mode 100644 python/lib/sift_client/_tests/_internal/test_disk_cache.py create mode 100644 python/lib/sift_client/_tests/test_client_cache.py diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index d3ffaeb65..4b9205bbd 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -11,29 +11,29 @@ This project adheres to [Semantic Versioning](http://semver.org/). Up to a ~80x speedup for some get_data calls. -#### Channel data cache (opt-out, on by default) +#### Shared on-disk cache (opt-out, on by default) `client.channels.get_data(...)` now caches the channel windows it returns to disk by default. Subsequent calls covering the same channel/time range — including from a fresh process — read straight out of the cache instead of going to the wire. This also bounds memory: nothing is held in process after the call returns, which fixes the OOM seen on long sustained pulls (~5–7 GB of cache for a 145M-point pull in earlier versions). -The default location is `/sift-channel-data-cache`, capped at 4 GiB with LRU eviction. 
If the default path can't be opened (read-only filesystem, restricted container, etc.), the client logs a warning and continues with caching disabled — `get_data` still works, it just always goes to the wire. +The cache lives on the `SiftClient` as a single shared store: every cache-aware resource writes to one global byte budget at one path, with one LRU policy. The default location is `<tmpdir>/sift-data-cache` (where `<tmpdir>` is `tempfile.gettempdir()`), capped at 4 GiB with LRU eviction. If the default path can't be opened (read-only filesystem, restricted container, etc.), the client logs a warning and continues with caching disabled — `get_data` still works, it just always goes to the wire. `ignore_cache=True` on `client.channels.get_data(...)` now skips writing into the cache as well as reading from it. Previously a "non-caching" workload still appended to the shared cache on every call. -Opt out, reconfigure, or wipe the cache from the `channels` resource: +Configuration lives on the new `client.cache` namespace — knobs are global because the store is shared: ```python # Opt out — no data persisted to disk; every get_data call goes to the wire. -client.channels.disable_data_cache_disk() +client.cache.disable_disk() # Reconfigure the location or byte cap. -client.channels.enable_data_cache_disk(path="/data/sift-cache", max_bytes=2 * 1024 ** 3) +client.cache.enable_disk(path="/data/sift-cache", max_bytes=2 * 1024 ** 3) # Remove a stale or corrupted cache directory. -client.channels.clear_data_cache_on_disk() # default tmp path -client.channels.clear_data_cache_on_disk("/data/sift-cache") # custom path +client.cache.clear_disk() # default tmp path +client.cache.clear_disk("/data/sift-cache") # custom path ``` -`enable_data_cache_disk` is also the way to turn the cache back on after a prior `disable_data_cache_disk` call. +`enable_disk` is also the way to turn the cache back on after a prior `disable_disk` call.
The cache is powered by [`diskcache`](https://grantjenks.com/docs/diskcache/) (pure-Python, SQLite-backed) with LRU eviction. diff --git a/python/lib/sift_client/_internal/cache_namespace.py b/python/lib/sift_client/_internal/cache_namespace.py new file mode 100644 index 000000000..c76ccaeb9 --- /dev/null +++ b/python/lib/sift_client/_internal/cache_namespace.py @@ -0,0 +1,114 @@ +"""User-facing surface for the shared on-disk cache. + +This module hosts the small bag of methods exposed as ``client.cache``. +The cache itself (a :class:`~sift_client._internal.disk_cache.DiskCache`) +lives on :class:`~sift_client.client.SiftClient` so every resource that +wants to persist results across calls can reach into one shared store. + +The namespace deliberately mirrors :class:`DiskCache` rather than the +old per-resource API (``client.channels.enable_data_cache_disk(...)``): +since the store is shared, configuration is global. +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +from sift_client._internal.disk_cache import DiskCache + +if TYPE_CHECKING: + import os + + from sift_client.client import SiftClient + +logger = logging.getLogger(__name__) + + +class CacheNamespace: + """Resource-agnostic surface for the on-disk cache shared by all resources. + + Exposed as ``client.cache``. The actual handle (:class:`DiskCache`) is + constructed lazily on first use so importing :mod:`sift_client` doesn't + pay the diskcache cost up front. Configuration changes made before + that first use are recorded against the + :class:`~sift_client._internal.disk_cache_config.DiskCacheConfig` on the + client and applied when the store opens; changes after first use are + routed directly to the live :class:`DiskCache`. + + Default policy: disk caching is **opt-out** (the ``DiskCacheConfig`` is + constructed with ``enabled=True``). 
Users who don't want any state on + disk call :meth:`disable_disk` to silence it; users who want a custom + location or byte cap call :meth:`enable_disk` with arguments. + """ + + def __init__(self, client: SiftClient): + self._client = client + + def enable_disk( + self, + *, + path: str | os.PathLike[str] | None = None, + max_bytes: int | None = None, + ) -> None: + """Enable (or reconfigure) on-disk caching. + + Disk caching is **on by default** at :attr:`DiskCache.DEFAULT_DISK_PATH`; + use this method to override the path or size, or to turn the cache + back on after a prior :meth:`disable_disk` call. + + Reconfiguring a live cache (different ``path`` or ``max_bytes``) + closes the previous handle and opens a new one. Existing entries + at the new path become available as cache hits. + + An explicit ``path`` that can't be opened (permission denied, + read-only filesystem, ...) raises so the caller knows their + request didn't take. The default-path open does *not* raise — see + :meth:`SiftClient._get_disk_cache` for the silent fall-back. + + Args: + path: Directory to persist to. ``None`` (the default) uses + :attr:`DiskCache.DEFAULT_DISK_PATH`. + max_bytes: Byte cap on disk usage. ``None`` uses + :attr:`DiskCache.DEFAULT_DISK_MAX_BYTES` (4 GiB). When the + bound is reached, ``diskcache``'s LRU eviction takes over. + + Example: + client.cache.enable_disk(path="/data/sift-cache") + client.cache.enable_disk(max_bytes=1024 ** 3) # 1 GiB + """ + client = self._client + client._disk_cache_config.enable(path=path, max_bytes=max_bytes) + if client._disk_cache is not None: + client._disk_cache.enable_disk(path=path, max_bytes=max_bytes) + + def disable_disk(self) -> None: + """Opt out of on-disk caching (no reads or writes). + + Caching is on by default; call this when you don't want any + cached data written to or read from disk. Closes any open cache + file handle. The on-disk directory is NOT deleted — use + :meth:`clear_disk` to wipe it. 
+ """ + client = self._client + client._disk_cache_config.disable() + if client._disk_cache is not None: + client._disk_cache.disable_disk() + + def clear_disk(self, path: str | os.PathLike[str] | None = None) -> None: + """Delete a previously-persisted on-disk cache directory. + + Drops stale caches from previous sessions, recovers from a + corrupt cache, or reclaims disk space. Removes the directory + entirely; if disk caching is on, the next access re-opens an + empty cache at the same path. + + Args: + path: Directory of the cache to clear. ``None`` (the default) + targets :attr:`DiskCache.DEFAULT_DISK_PATH`. + + Raises: + ValueError: If ``path`` exists but does not look like a sift + data cache directory. + """ + DiskCache.clear_disk(path) diff --git a/python/lib/sift_client/_internal/disk_cache.py b/python/lib/sift_client/_internal/disk_cache.py new file mode 100644 index 000000000..0118c6477 --- /dev/null +++ b/python/lib/sift_client/_internal/disk_cache.py @@ -0,0 +1,336 @@ +"""Shared on-disk key/value store used by every resource that wants to cache results. + +One :class:`DiskCache` instance lives on the :class:`SiftClient` (see +``client._disk_cache``). Resources don't construct their own — they receive +a reference and wrap it in a typed adapter that namespaces keys (e.g. +``ChannelDataCache`` in ``low_level_wrappers/data.py``). The store itself +is deliberately value-agnostic: callers hand in ``size_bytes`` for the +oversize guard, ``diskcache`` pickles whatever object the caller supplied, +and the store never needs to know what's inside. + +This module is the sibling of :mod:`._disk_cache_config` — the config +holds user intent (enabled / path / max_bytes) and the store is the live +handle keyed off that intent. + +Key behaviours pinned here so the adapter layer can stay thin: + +* Default path lives under :func:`tempfile.gettempdir` and is shared + across processes, so a fresh session reads previously-cached entries. 
+* The byte cap is one global budget; LRU eviction spans all resources + sharing the store (channels, calculated channels, exports, ...). +* :meth:`clear_disk` (classmethod) refuses to delete a directory that + doesn't look like a sift cache (no diskcache marker), so a typo'd + path can't take out the user's documents. +* Oversized entries are skipped with a one-shot warning per key — + otherwise diskcache's eviction loop would drain every other row + trying to fit an unfittable entry. +* Construction with ``disk_path=None`` (or after :meth:`disable_disk`) + is a silent no-op store. Callers don't need to branch on disabled + state; reads always miss and writes are dropped. +""" + +from __future__ import annotations + +import logging +import os +import shutil +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING, Any, Iterator, cast + +if TYPE_CHECKING: + import diskcache + +logger = logging.getLogger(__name__) + + +class DiskCache: + """Process-wide disk-backed key/value store. + + Wraps a :class:`diskcache.Cache` with the lifecycle management and + safety rails sift resources rely on. The instance is shared — each + resource adapter namespaces its keys (e.g. ``channel:``) so multiple + resources can write to the same store without colliding. + + When ``disk_path`` is ``None``, the instance is a silent no-op: every + ``get`` misses, every ``put`` is dropped, and ``__contains__`` is + always ``False``. This lets callers treat "caching disabled" the same + as a cold cache, with no branching needed at the read/write site. + + Args: + disk_path: Directory to persist to. ``None`` keeps the store + disabled. A previously-populated directory is reused, so a + fresh process reading the same path sees existing entries. + disk_max_bytes: Byte cap on the store. ``None`` falls back to + :attr:`DEFAULT_DISK_MAX_BYTES`. Ignored when ``disk_path`` + is ``None``. + """ + + #: Default directory for the shared cache. 
Lives under + #: :func:`tempfile.gettempdir` so it survives across sessions of the + #: same user but doesn't pollute the home directory. The suffix is + #: fixed so multiple ``SiftClient`` instances naturally share the + #: same store and pick up each other's prior sessions. + DEFAULT_DISK_PATH: str = os.path.join(tempfile.gettempdir(), "sift-data-cache") + + #: Default byte cap when :meth:`enable_disk` is called without an + #: explicit ``max_bytes``. 4 GiB is generous for the typical ``/tmp`` + #: filesystem; ``diskcache`` enforces the cap with its own SQLite- + #: backed LRU eviction once the bound is reached. + DEFAULT_DISK_MAX_BYTES: int = 4 * 1024 * 1024 * 1024 + + #: Marker file ``diskcache`` writes inside every cache directory. The + #: classmethod :meth:`clear_disk` checks for this before any + #: ``shutil.rmtree`` so a typo'd path can't wipe out an unrelated + #: directory. + _DISKCACHE_MARKER: str = "cache.db" + + def __init__( + self, + *, + disk_path: str | os.PathLike[str] | None = None, + disk_max_bytes: int | None = None, + ): + # Keys we've already logged an "entry exceeds disk cap" warning + # for. Tracks the full namespaced key (e.g. ``channel:foo``), not + # the resource-side id, so two adapters that happen to share an + # id space don't collide on dedup. A successful normal put + # clears the bit so a future regression re-warns. + self._oversized_warned: set[str] = set() + self._disk: diskcache.Cache | None = None + self._disk_path: str | None = None + self._disk_max_bytes: int | None = None + if disk_path is not None: + self._open_disk( + str(disk_path), + disk_max_bytes if disk_max_bytes is not None else self.DEFAULT_DISK_MAX_BYTES, + ) + + @classmethod + def clear_disk(cls, path: str | os.PathLike[str] | None = None) -> None: + """Delete a previously-persisted on-disk cache directory. + + Use this to drop stale caches from previous sessions, recover + from a corrupt cache, or reclaim disk space. 
The directory is + removed entirely; a future :meth:`enable_disk` call at the same + path opens a fresh empty cache. + + Args: + path: Directory of the cache to clear. ``None`` (the default) + targets :attr:`DEFAULT_DISK_PATH`. + + Raises: + ValueError: If ``path`` exists but does not look like a sift + cache directory (missing the ``diskcache`` marker file). + The guard makes accidental misuse a hard error rather + than silent data loss. + """ + target = Path(path) if path is not None else Path(cls.DEFAULT_DISK_PATH) + if not target.exists(): + return + if not (target / cls._DISKCACHE_MARKER).exists(): + raise ValueError( + f"{str(target)!r} does not look like a sift data cache " + f"directory (missing {cls._DISKCACHE_MARKER!r} marker). " + f"Refusing to delete." + ) + shutil.rmtree(target) + + @property + def disk_enabled(self) -> bool: + """Whether a disk handle is currently open.""" + return self._disk is not None + + @property + def disk_path(self) -> str | None: + """Filesystem path of the cache when enabled, else ``None``.""" + return self._disk_path + + @property + def disk_max_bytes(self) -> int | None: + """Configured byte cap on disk usage, or ``None`` when disabled.""" + return self._disk_max_bytes + + def __contains__(self, key: str) -> bool: + """True if ``key`` is cached. Always ``False`` when disabled.""" + if self._disk is None: + return False + return key in self._disk + + def __iter__(self) -> Iterator[str]: + """Yield cached keys. Lets adapters scope a clear to their prefix. + + Yields nothing when disabled. The underlying diskcache iterator + is snapshot-style, but callers that intend to mutate during + iteration should still wrap with ``list(...)`` to be safe. + + ``diskcache.Cache`` is typed as yielding ``bytes | str | ...`` + because it supports arbitrary key types; the cast narrows to the + ``str`` contract this layer enforces. Adapters never write + non-string keys. 
+ """ + if self._disk is None: + return + for key in self._disk: + yield cast("str", key) + + def enable_disk( + self, + *, + path: str | os.PathLike[str] | None = None, + max_bytes: int | None = None, + ) -> None: + """Open the disk handle, replacing any previous one. + + Reconfiguring to a different ``path`` or ``max_bytes`` closes the + prior handle first. Existing entries at the new path become + available via :meth:`get` without further setup. + + Args: + path: Directory to persist to. ``None`` uses + :attr:`DEFAULT_DISK_PATH`. + max_bytes: Byte cap (``None`` → :attr:`DEFAULT_DISK_MAX_BYTES`). + """ + target_path = str(path) if path is not None else self.DEFAULT_DISK_PATH + target_max = max_bytes if max_bytes is not None else self.DEFAULT_DISK_MAX_BYTES + if ( + self._disk is not None + and self._disk_path == target_path + and self._disk_max_bytes == target_max + ): + return + self._close_disk() + self._open_disk(target_path, target_max) + + def disable_disk(self) -> None: + """Close the disk handle (if open). Does not touch on-disk contents. + + Use :meth:`clear_disk` to remove a directory from disk. + """ + self._close_disk() + + def get(self, key: str) -> Any | None: + """Return the cached value for ``key`` or ``None`` on a miss. + + Returns ``None`` for misses, decoded values for hits, and ``None`` + (after self-invalidating the row) for corrupt entries surfaced + by ``diskcache`` as ``sqlite3.DatabaseError`` or similar. The + caller is expected to ``isinstance``-check the result against + the type they wrote. + """ + if self._disk is None: + return None + try: + return self._disk.get(key, default=None, retry=True) + except Exception: + # diskcache surfaces ``sqlite3.DatabaseError`` (and friends) + # for corrupt or partially-written entries from a prior + # session. Treat as a miss and force-drop the bad row so + # we don't repeatedly trip the same path. 
+ logger.warning("disk cache read failed for %s; invalidating", key) + try: + del self._disk[key] + except Exception: + pass + return None + + def put(self, key: str, value: Any, *, size_bytes: int) -> None: + """Write ``value`` under ``key``. No-op when disabled. + + Entries whose ``size_bytes`` exceeds :attr:`disk_max_bytes` are + skipped with a one-shot warning per key, since diskcache's + eviction loop would otherwise drain every other row trying — and + failing — to fit an oversized entry. Callers are responsible + for measuring the size; the store stays value-agnostic. + + Args: + key: Namespaced key (e.g. ``"channel:"``). Adapters are + responsible for picking a prefix that won't collide with + other adapters writing to the same store. + value: Anything ``diskcache`` can pickle. + size_bytes: Caller-measured size used for the oversize guard. + """ + if self._disk is None: + return + if self._disk_max_bytes is not None and size_bytes > self._disk_max_bytes: + if key not in self._oversized_warned: + logger.warning( + "Entry for %s (%d bytes) is larger than the disk " + "cache cap (%d bytes); skipping disk cache for this " + "entry so other entries aren't evicted. Raise the " + "cap via ``client.cache.enable_disk(max_bytes=...)`` " + "to cache this entry on disk.", + key, + size_bytes, + self._disk_max_bytes, + ) + self._oversized_warned.add(key) + try: + self._disk.delete(key, retry=True) + except Exception: + pass + return + try: + self._disk.set(key, value, retry=True) + self._oversized_warned.discard(key) + except Exception: + # Best-effort persistence: keep going on disk errors so the + # caller's request still succeeds. Drop the (possibly + # partial) disk row. + logger.warning("disk cache write failed for %s; invalidating", key) + try: + self._disk.delete(key, retry=True) + except Exception: + pass + + def invalidate(self, key: str) -> None: + """Remove ``key`` from the cache. 
Safe to call when absent.""" + # Invalidation is a fresh start for this key; the next put should + # re-evaluate against the current cap and re-warn if still too big. + self._oversized_warned.discard(key) + if self._disk is not None: + try: + self._disk.delete(key, retry=True) + except Exception: + pass + + def clear(self) -> None: + """Wipe every entry from the store. The directory itself remains. + + Spans all adapters sharing the store — typically used at test + teardown or for full reset. Adapters that want to wipe only their + own namespace should iterate ``self`` and call :meth:`invalidate` + on matching keys. + """ + self._oversized_warned.clear() + if self._disk is not None: + self._disk.clear() + + def close(self) -> None: + """Release the disk file handle. Safe to call when disabled.""" + self._close_disk() + + def _open_disk(self, path: str, max_bytes: int) -> None: + import diskcache + + os.makedirs(path, exist_ok=True) + self._disk = diskcache.Cache( + directory=path, + size_limit=max_bytes, + eviction_policy="least-recently-used", + statistics=0, + tag_index=0, + ) + self._disk_path = path + self._disk_max_bytes = max_bytes + + def _close_disk(self) -> None: + if self._disk is None: + return + try: + self._disk.close() + except Exception: + pass + self._disk = None + self._disk_path = None + self._disk_max_bytes = None diff --git a/python/lib/sift_client/_internal/low_level_wrappers/data.py b/python/lib/sift_client/_internal/low_level_wrappers/data.py index 03ab29268..c524a9e03 100644 --- a/python/lib/sift_client/_internal/low_level_wrappers/data.py +++ b/python/lib/sift_client/_internal/low_level_wrappers/data.py @@ -2,11 +2,7 @@ import asyncio import logging -import os -import shutil -import tempfile from datetime import datetime, timezone -from pathlib import Path from typing import TYPE_CHECKING, Any, cast import pandas as pd @@ -20,14 +16,13 @@ ) from sift.data.v2.data_pb2_grpc import DataServiceStub +from sift_client._internal.disk_cache import 
DiskCache from sift_client._internal.low_level_wrappers.base import LowLevelClientBase from sift_client._internal.time import to_timestamp_nanos from sift_client.sift_types.channel import Channel, ChannelDataType from sift_client.transport import WithGrpcClient if TYPE_CHECKING: - import diskcache - from sift_client.transport.grpc_transport import GrpcClient # Configure logging @@ -60,283 +55,100 @@ def _new_cache_entry( ) -class ChannelCache: - """Disk-backed cache of per-channel DataFrames. - - A ``diskcache``-backed key/value store that survives process restarts. - ``put`` writes through to disk, ``get`` reads from disk, and - ``invalidate``/``clear`` remove entries. The disk tier has a byte cap - that ``diskcache`` enforces with its own LRU eviction. - - When no ``disk_path`` is supplied the cache is a no-op: ``get`` always - returns ``None``, ``__contains__`` is always ``False``, and ``put`` is - silently dropped. This is the "caching disabled" mode used after a - :meth:`disable_disk` call (or when disk persistence is turned off on - the owning resource). - - An in-memory tier previously sat in front of disk. It was removed once - benchmarks showed that for the workloads driving the OOM regression the - extra memory footprint outweighed the per-call pickle/deserialize cost - on a warm disk hit; if profiling shows the disk reads dominating again, - re-introduce a small front cache here. +class ChannelDataCache: + """Channel-side adapter over the shared :class:`DiskCache` store. + + The store is owned by :class:`~sift_client.client.SiftClient` and + shared by every cache-aware resource; this adapter is the typed, + namespaced view of it that the channel data path uses. + + Responsibilities the adapter holds onto: + + * **Key namespacing.** Every read/write goes through :meth:`_key`, + which prefixes the channel id with ``channel:``. That keeps a + future calculated-channels or exports adapter on the same store + from colliding on raw resource ids. 
+ * **Typing.** ``put`` only accepts :class:`ChannelCacheEntry`; + ``get`` ``isinstance``-checks the raw value before handing it back, + so a corrupt or cross-adapter row reads as a miss instead of + blowing up downstream pandas code. + * **Size measurement.** The store stays value-agnostic; the adapter + already computes ``size_bytes`` on the entry via + :func:`_new_cache_entry` (``DataFrame.memory_usage(deep=True)``) so + it just forwards that to the store's oversize guard. + * **Resource-side state.** :attr:`name_id_map` lives here because + it's channel-specific bookkeeping needed to wire raw fetch + responses (keyed by channel *name*) back to the cache (keyed by + channel *id*). + + The :class:`DiskCacheAdapter` ``Protocol`` is intentionally not + declared yet — there's only one adapter shape so far. When a second + resource grows its own adapter, extract the Protocol from the two + real shapes rather than guessing from one. """ - #: Default directory for the cache. Lives under ``tempfile.gettempdir()`` - #: so it survives across sessions of the same user but doesn't pollute - #: the user's home dir. The suffix is fixed so multiple processes - #: (different ``SiftClient`` instances, notebooks, etc.) naturally share - #: the same store and can read each other's prior sessions. - DEFAULT_DISK_PATH: str = os.path.join(tempfile.gettempdir(), "sift-channel-data-cache") - - #: Default byte cap for the cache when ``enable_disk`` is called without - #: an explicit ``max_bytes``. 4 GiB is a generous ceiling for the typical - #: ``/tmp`` filesystem; ``diskcache`` enforces it with its own SQLite- - #: backed LRU eviction once the bound is reached. - DEFAULT_DISK_MAX_BYTES: int = 4 * 1024 * 1024 * 1024 + #: Namespace prefix for keys this adapter writes to the shared + #: :class:`DiskCache`. Picked at class scope so adapters in other + #: resources can pick distinct prefixes without runtime negotiation. 
+ KEY_PREFIX: str = "channel:" - #: Marker file ``diskcache`` writes inside every cache directory. We - #: sanity-check for this before any ``shutil.rmtree`` so a typo in the - #: ``clear_disk`` ``path`` argument can't wipe out an unrelated directory. - _DISKCACHE_MARKER: str = "cache.db" - - def __init__( - self, - *, - disk_path: str | os.PathLike[str] | None = None, - disk_max_bytes: int | None = None, - ): - """Construct a disk-backed cache. + def __init__(self, store: DiskCache): + """Wrap ``store`` with channel-data semantics. Args: - disk_path: Directory for the cache. ``None`` disables caching - entirely (every operation becomes a no-op). A previously- - populated directory is reused, so subsequent sessions can - read existing entries. - disk_max_bytes: Byte cap on disk usage. ``None`` falls back to - ``DEFAULT_DISK_MAX_BYTES``. Ignored when ``disk_path`` is - ``None``. + store: The shared :class:`DiskCache` instance owned by the + :class:`SiftClient`. Multiple adapters may share one store. """ + self._store = store self.name_id_map: dict[str, str] = {} - # Channels we've already logged an "entry exceeds disk cap" warning - # for. The check on the put path would otherwise spam the log once - # per ``get_data`` call for any channel whose typical entry is bigger - # than the cap. A successful normal put for the same channel clears - # the bit so a future regression re-warns. - self._oversized_disk_warned: set[str] = set() - self._disk: diskcache.Cache | None = None - self._disk_path: str | None = None - self._disk_max_bytes: int | None = None - if disk_path is not None: - self._open_disk( - str(disk_path), - disk_max_bytes if disk_max_bytes is not None else self.DEFAULT_DISK_MAX_BYTES, - ) - - @classmethod - def clear_disk(cls, path: str | os.PathLike[str] | None = None) -> None: - """Delete a previously-persisted on-disk cache directory. - - Use this to drop stale caches from previous sessions, recover from a - corrupt cache, or reclaim disk space. 
The directory is removed - entirely; a future ``enable_disk`` call at the same path will see a - fresh empty cache. - - Args: - path: Directory of the cache to clear. ``None`` (the default) - targets :attr:`DEFAULT_DISK_PATH`. - - Raises: - ValueError: If ``path`` exists but does not look like a sift - channel data cache directory (missing the ``diskcache`` - marker file). This guard makes accidental misuse a hard - error rather than silent data loss. - """ - target = Path(path) if path is not None else Path(cls.DEFAULT_DISK_PATH) - if not target.exists(): - return - if not (target / cls._DISKCACHE_MARKER).exists(): - raise ValueError( - f"{str(target)!r} does not look like a sift channel data cache " - f"directory (missing {cls._DISKCACHE_MARKER!r} marker). " - f"Refusing to delete." - ) - shutil.rmtree(target) - - @property - def disk_enabled(self) -> bool: - """Whether the disk-backed store is currently open.""" - return self._disk is not None - @property - def disk_path(self) -> str | None: - """Filesystem path of the cache when enabled, else ``None``.""" - return self._disk_path + def _key(self, channel_id: str) -> str: + return f"{self.KEY_PREFIX}{channel_id}" @property - def disk_max_bytes(self) -> int | None: - """Configured byte cap on disk usage, or ``None`` when disabled.""" - return self._disk_max_bytes + def store(self) -> DiskCache: + """The shared underlying store. Tests reach in for store-level state.""" + return self._store def __contains__(self, channel_id: str) -> bool: - """True if the channel is cached on disk. + """True if the channel is cached. False when the store is disabled.""" + return self._key(channel_id) in self._store - Used by ``_filter_cached_channels`` to decide whether ``get_data`` - needs to hit the wire. A warm disk lets a fresh session avoid - re-fetching previously-served windows. 
- """ - if self._disk is None: - return False - return channel_id in self._disk - - def enable_disk( - self, - *, - path: str | os.PathLike[str] | None = None, - max_bytes: int | None = None, - ) -> None: - """Enable (or reconfigure) the disk-backed cache. - - If a previous disk handle was open at a different path or with a - different size cap, it's closed first. Disk contents at the new - path are NOT recreated from anywhere — only future writes land in - the new location. + def get(self, channel_id: str) -> ChannelCacheEntry | None: + """Return the entry for ``channel_id`` if cached, otherwise None. - Args: - path: Directory to persist to. ``None`` uses - :attr:`DEFAULT_DISK_PATH`. The directory is created if - missing; an existing one is opened in place and its - contents become available to ``get``. - max_bytes: Byte cap on disk usage (``None`` → - :attr:`DEFAULT_DISK_MAX_BYTES`). + Type-checks the raw value before returning so a row written by a + different adapter (or a corrupt entry that survived) reads as a + miss instead of being handed back as the wrong type. """ - target_path = str(path) if path is not None else self.DEFAULT_DISK_PATH - target_max = max_bytes if max_bytes is not None else self.DEFAULT_DISK_MAX_BYTES - if ( - self._disk is not None - and self._disk_path == target_path - and self._disk_max_bytes == target_max - ): - return - self._close_disk() - self._open_disk(target_path, target_max) - - def disable_disk(self) -> None: - """Close the disk handle (if open). Does not touch the disk contents. - - Use ``client.channels.clear_data_cache_on_disk(path)`` to remove a - directory from disk. 
- """ - self._close_disk() - - def get(self, channel_id: str) -> ChannelCacheEntry | None: - """Return the entry for ``channel_id`` if cached, otherwise None.""" - if self._disk is None: - return None - try: - disk_entry = self._disk.get(channel_id, default=None, retry=True) - except Exception: - # diskcache surfaces ``sqlite3.DatabaseError`` (and friends) for - # corrupt or partially-written entries from a prior session. - # Treat as a miss; force-drop the bad row so we don't repeatedly - # trip the same path. - logger.warning("disk cache read failed for %s; invalidating", channel_id) - try: - del self._disk[channel_id] - except Exception: - pass - return None - if disk_entry is None or not isinstance(disk_entry, ChannelCacheEntry): + raw = self._store.get(self._key(channel_id)) + if not isinstance(raw, ChannelCacheEntry): return None - return disk_entry + return raw def put(self, channel_id: str, entry: ChannelCacheEntry) -> None: """Insert or replace ``channel_id`` on disk. - No-op when the disk tier is disabled. Entries larger than - ``disk_max_bytes`` are skipped (with a one-shot warning per - channel) instead of being inserted, since diskcache's eviction - loop would otherwise drain every other row trying — and failing — - to fit them. + Forwards :attr:`ChannelCacheEntry.size_bytes` to the store so its + oversize guard can decide whether to write or skip+warn. No-op + when the underlying store is disabled. """ - if self._disk is None: - return - if self._disk_max_bytes is not None and entry.size_bytes > self._disk_max_bytes: - if channel_id not in self._oversized_disk_warned: - logger.warning( - "Channel %s data (%d bytes) is larger than the disk " - "cache cap (%d bytes); skipping disk cache for this " - "channel so other entries aren't evicted. 
Raise the " - "cap via ``client.channels.enable_data_cache_disk(" - "max_bytes=...)`` to cache this channel on disk.", - channel_id, - entry.size_bytes, - self._disk_max_bytes, - ) - self._oversized_disk_warned.add(channel_id) - try: - self._disk.delete(channel_id, retry=True) - except Exception: - pass - return - try: - self._disk.set(channel_id, entry, retry=True) - self._oversized_disk_warned.discard(channel_id) - except Exception: - # Best-effort persistence: keep going on disk errors so the - # user's ``get_data`` call still succeeds. Drop the (possibly - # partial) disk row. - logger.warning("disk cache write failed for %s; invalidating", channel_id) - try: - self._disk.delete(channel_id, retry=True) - except Exception: - pass + self._store.put(self._key(channel_id), entry, size_bytes=entry.size_bytes) def invalidate(self, channel_id: str) -> None: - """Remove ``channel_id`` from the cache. Safe to call when absent.""" - # Invalidation is a fresh start for this channel; if it was warned - # about as oversized previously, the next put should re-evaluate - # against the current cap and re-warn if still too big. - self._oversized_disk_warned.discard(channel_id) - if self._disk is not None: - try: - self._disk.delete(channel_id, retry=True) - except Exception: - pass + """Remove ``channel_id`` from the cache. Safe when absent.""" + self._store.invalidate(self._key(channel_id)) def clear(self) -> None: - """Wipe all entries from disk. The directory itself remains.""" - self._oversized_disk_warned.clear() - if self._disk is not None: - self._disk.clear() - - def close(self) -> None: - """Release the disk file handle. 
Safe to call without disk enabled.""" - self._close_disk() - - def _open_disk(self, path: str, max_bytes: int) -> None: - import diskcache - - os.makedirs(path, exist_ok=True) - self._disk = diskcache.Cache( - directory=path, - size_limit=max_bytes, - eviction_policy="least-recently-used", - statistics=0, - tag_index=0, - ) - self._disk_path = path - self._disk_max_bytes = max_bytes + """Wipe every channel entry. Other adapters' entries are preserved. - def _close_disk(self) -> None: - if self._disk is None: - return - try: - self._disk.close() - except Exception: - pass - self._disk = None - self._disk_path = None - self._disk_max_bytes = None + Walks the shared store's keyspace once and drops anything under + :attr:`KEY_PREFIX`. ``list(...)`` snapshots the iterator since + we mutate during iteration. + """ + for key in list(self._store): + if key.startswith(self.KEY_PREFIX): + self._store.invalidate(key) class DataLowLevelClient(LowLevelClientBase, WithGrpcClient): @@ -349,24 +161,25 @@ def __init__( self, grpc_client: GrpcClient, *, - disk_cache_path: str | os.PathLike[str] | None = None, - disk_cache_max_bytes: int | None = None, + channel_cache: ChannelDataCache | None = None, ): """Initialize the DataLowLevelClient. Args: grpc_client: The gRPC client to use for making API calls. - disk_cache_path: Directory for the disk-backed channel-data cache. - ``None`` disables caching entirely. See ``ChannelCache``. - disk_cache_max_bytes: Byte cap for disk usage. ``None`` uses - ``ChannelCache.DEFAULT_DISK_MAX_BYTES``. Ignored when - ``disk_cache_path`` is ``None``. + channel_cache: Adapter wrapping the shared :class:`DiskCache` the + :class:`SiftClient` owns. When ``None`` (only the unit-test + construction path), the wrapper falls back to a no-op store + so cache reads/writes are silent. Production callers always + pass an adapter built from ``client._get_disk_cache()``. 
""" super().__init__(grpc_client) - self.channel_cache = ChannelCache( - disk_path=disk_cache_path, - disk_max_bytes=disk_cache_max_bytes, - ) + # Production wires the shared store in via the resource. The fallback + # here lets a bare ``DataLowLevelClient(MagicMock())`` keep working + # in unit tests without forcing every site to plumb a store. + if channel_cache is None: + channel_cache = ChannelDataCache(DiskCache()) + self.channel_cache = channel_cache def _update_name_id_map(self, channels: list[Channel]): """Update the name id map with the new channels.""" diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py index 4fc094440..6e28bd2bd 100644 --- a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py +++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_data.py @@ -2,11 +2,9 @@ Four classes, narrowest scope first: -* :class:`TestChannelCache` — disk-backed :class:`ChannelCache` unit tests - (fresh open, cross-session reload, invalidate/clear, oversized guards, - disable/reconfigure). -* :class:`TestChannelCacheClearDisk` — ``ChannelCache.clear_disk`` - classmethod (default path, custom path, safety guard). +* :class:`TestChannelDataCache` — the typed adapter over the shared + :class:`DiskCache`. Covers key namespacing, the isinstance guard on + ``get``, and the prefix-scoped ``clear``. * :class:`TestMergePages` — ``DataLowLevelClient._merge_pages``, the per-channel concat helper. * :class:`TestDataLowLevelClient` — constructor wiring and per-instance @@ -14,14 +12,18 @@ * :class:`TestGetChannelData` — end-to-end on the public ``get_channel_data`` API against a mocked ``_get_data_impl``. -The OOM regression that motivated this code happened because the cache was -a class attribute that grew without bound. ``test_per_instance_isolation`` -is the canary that catches anyone re-introducing that pattern. 
+Storage-layer behaviour (oversize guards, marker-checked clear, +cross-session reload) lives in ``_tests/_internal/test_disk_cache.py``; +this file stays focused on the channel-data path. + +The OOM regression that motivated this code happened because the cache +was a class attribute that grew without bound. ``test_per_instance_isolation`` +is the canary that catches anyone re-introducing that pattern, even though +ownership has since moved to the client. """ from __future__ import annotations -import logging from contextlib import contextmanager from datetime import datetime, timedelta, timezone from typing import Any, Iterator @@ -30,9 +32,10 @@ import pandas as pd import pytest +from sift_client._internal.disk_cache import DiskCache from sift_client._internal.low_level_wrappers.data import ( - ChannelCache, ChannelCacheEntry, + ChannelDataCache, DataLowLevelClient, _new_cache_entry, ) @@ -41,13 +44,6 @@ _NOW = datetime(2025, 1, 1, tzinfo=timezone.utc) _WINDOW_END = _NOW + timedelta(days=1) -# Snapshot of the real ``DEFAULT_DISK_PATH`` constant captured at module import. -# The autouse ``_isolate_default_disk_cache_path`` fixture in ``conftest.py`` -# overrides the class attribute on every test for isolation; the -# ``TestChannelCacheClearDisk::test_default_path_constant_under_tmp`` test still -# needs to see the production value to verify its shape. -_PRODUCTION_DEFAULT_DISK_PATH = ChannelCache.DEFAULT_DISK_PATH - # ---------- shared helpers ----------- @@ -97,39 +93,15 @@ def _channel(cid: str) -> Channel: def _client_with_cache(tmp_path, subdir: str = "cache") -> DataLowLevelClient: - """Build a ``DataLowLevelClient`` whose ``ChannelCache`` points at ``tmp_path``. + """Build a ``DataLowLevelClient`` whose adapter points at ``tmp_path``. - Tests that exercise cache behaviour (hits/misses/eviction) need an - actual disk-backed cache, so ``disk_cache_path`` must be supplied. 
A - plain ``DataLowLevelClient(MagicMock())`` defaults to no-cache mode + Tests that exercise cache behaviour (hits/misses) need an actual + disk-backed adapter, so the store has to be opened explicitly. A + plain ``DataLowLevelClient(MagicMock())`` defaults to a no-op store and would silently turn every cache test into a wire-path test. """ - return DataLowLevelClient(MagicMock(), disk_cache_path=tmp_path / subdir) - - -@contextmanager -def _capture_data_warnings() -> Iterator[list[logging.LogRecord]]: - """Capture warnings emitted by the ``data`` module's logger directly. - - Pytest's ``caplog`` reads from the root logger, but the Sift pytest plugin - sets ``propagate=False`` on the ``sift_client`` logger when audit logging - is active, so records emitted from any descendant don't reach the root. - Attaching a list-backed handler at the leaf logger bypasses that and - surfaces exactly the records we emit. - """ - target = logging.getLogger("sift_client._internal.low_level_wrappers.data") - records: list[logging.LogRecord] = [] - - class _ListHandler(logging.Handler): - def emit(self, record: logging.LogRecord) -> None: - records.append(record) - - handler = _ListHandler(level=logging.WARNING) - target.addHandler(handler) - try: - yield records - finally: - target.removeHandler(handler) + store = DiskCache(disk_path=tmp_path / subdir) + return DataLowLevelClient(MagicMock(), channel_cache=ChannelDataCache(store)) def _patch_deserializer(sentinel_to_frames: dict[str, dict[str, pd.DataFrame]]) -> Any: @@ -201,348 +173,140 @@ async def fake_impl( # ---------- tests ----------- -class TestChannelCache: - """Disk-backed :class:`ChannelCache` behaviour. +class TestChannelDataCache: + """The typed adapter over the shared :class:`DiskCache`. - Five invariants must hold across these tests: + Three invariants get pinned: - 1. Constructing without a ``disk_path`` yields a no-op cache (every - operation is silent; ``__contains__`` returns ``False``). - 2. 
A fresh disk directory starts empty and accepts new writes. - 3. Closing a populated cache and reopening at the same path surfaces - the previous entries on read (the "previous session" requirement - that powers cold-start reuse). - 4. Oversized entries are skipped with a deduped warning rather than - being inserted and triggering an eviction storm. - 5. ``invalidate``/``clear`` reset the oversized-warning dedup state - so a future regression re-warns. + 1. Every operation routes through the namespaced key + (``channel:``), so two adapters sharing one store don't + collide on bare resource ids. + 2. :meth:`ChannelDataCache.get` returns ``None`` on a type-mismatch + hit (e.g. a row another adapter wrote) instead of handing + arbitrary objects to downstream pandas code. + 3. :meth:`ChannelDataCache.clear` wipes only the adapter's namespace + — entries belonging to other adapters survive. - All tests confine writes to ``tmp_path`` so nothing leaks into the - real ``/tmp/sift-channel-data-cache``. + Store-level behaviour (oversized guards, cross-session reload, + marker-checked clear_disk) is exercised in ``test_disk_cache.py``. """ - def test_disabled_when_no_path(self) -> None: - """``ChannelCache()`` with no ``disk_path`` is a silent no-op.""" - cache = ChannelCache() - assert cache.disk_enabled is False - assert cache.disk_path is None - assert cache.disk_max_bytes is None - # Operations don't raise; the cache just stays empty. 
- cache.put("chan-1", _entry(rows=4)) - assert "chan-1" not in cache - assert cache.get("chan-1") is None - cache.invalidate("chan-1") - cache.clear() - cache.close() - - def test_fresh_cache_writes_and_reads(self, tmp_path) -> None: - """A fresh disk directory accepts writes and serves them back.""" - path = tmp_path / "fresh" - cache = ChannelCache(disk_path=path) + def test_get_miss_returns_none(self, tmp_path): + adapter = ChannelDataCache(DiskCache(disk_path=tmp_path / "miss")) + try: + assert "c1" not in adapter + assert adapter.get("c1") is None + finally: + adapter.store.close() + + def test_round_trip(self, tmp_path): + """Put then get returns an equivalent entry.""" + adapter = ChannelDataCache(DiskCache(disk_path=tmp_path / "rt")) try: - assert cache.disk_enabled - assert cache.disk_path == str(path) - assert cache.disk_max_bytes == ChannelCache.DEFAULT_DISK_MAX_BYTES entry = _entry(rows=8) - cache.put("chan-1", entry) - assert "chan-1" in cache - got = cache.get("chan-1") + adapter.put("c1", entry) + assert "c1" in adapter + got = adapter.get("c1") assert got is not None pd.testing.assert_frame_equal(got.data, entry.data) assert got.start_time == entry.start_time assert got.end_time == entry.end_time finally: - cache.close() - - def test_reopen_existing_dir_sees_prior_session_entries(self, tmp_path) -> None: - """Closing then reopening at the same path makes prior entries hit. - - This is the "look for existing caches from previous sessions" - guarantee: a new ``ChannelCache`` at a populated directory finds - entries on disk and returns them on the next read. - """ - path = tmp_path / "prev-session" - df = _frame("chan-1", rows=12, freq="s") - original_entry = _new_cache_entry( - data=df, - start_time=df.index[0].to_pydatetime(), - end_time=df.index[-1].to_pydatetime(), - ) - # Session 1: populate and close. 
- session1 = ChannelCache(disk_path=path) - session1.put("chan-1", original_entry) - session1.close() - - # Session 2: fresh process simulated by a brand-new ChannelCache - # at the same directory. - session2 = ChannelCache(disk_path=path) - try: - assert "chan-1" in session2 - got = session2.get("chan-1") - assert got is not None - pd.testing.assert_frame_equal(got.data, original_entry.data) - assert got.start_time == original_entry.start_time - assert got.end_time == original_entry.end_time - finally: - session2.close() - - def test_repeated_put_overwrites(self, tmp_path) -> None: - """A second ``put`` on the same key replaces the prior entry.""" - cache = ChannelCache(disk_path=tmp_path / "overwrite") - try: - small = _entry(rows=10) - bigger = _entry(rows=100) - cache.put("chan", small) - cache.put("chan", bigger) - got = cache.get("chan") - assert got is not None - pd.testing.assert_frame_equal(got.data, bigger.data) - finally: - cache.close() - - def test_invalidate_removes_entry(self, tmp_path) -> None: - """``invalidate`` drops the entry; safe to call when absent.""" - cache = ChannelCache(disk_path=tmp_path / "inval") - try: - cache.invalidate("never_added") # safe before any puts - cache.put("chan-1", _entry(rows=4)) - cache.invalidate("chan-1") - assert "chan-1" not in cache - assert cache.get("chan-1") is None - finally: - cache.close() - - def test_clear_wipes_disk(self, tmp_path) -> None: - cache = ChannelCache(disk_path=tmp_path / "clear") - try: - cache.put("chan-1", _entry(rows=4)) - cache.put("chan-2", _entry(rows=4)) - cache.clear() - assert "chan-1" not in cache - assert "chan-2" not in cache - finally: - cache.close() - - def test_disable_disk_closes_handle(self, tmp_path) -> None: - """Turning off disk closes the handle and silences subsequent ops.""" - cache = ChannelCache(disk_path=tmp_path / "disable") - try: - cache.put("chan-1", _entry(rows=4)) - cache.disable_disk() - assert not cache.disk_enabled - assert cache.disk_path is None - 
assert "chan-1" not in cache # no handle → no hits - assert cache.get("chan-1") is None - # Subsequent puts are silently dropped. - cache.put("chan-2", _entry(rows=4)) - assert "chan-2" not in cache - finally: - cache.close() + adapter.store.close() - def test_enable_disk_reconfigures_path(self, tmp_path) -> None: - """Reconfiguring to a different path closes the old handle. + def test_writes_use_namespaced_key(self, tmp_path): + """The raw store sees ``channel:``, not the bare id. - The new directory starts empty: ``chan-1`` lived in the old - directory's diskcache, so the lookup at the new path misses. + Pins the key-shape contract two adapters share. Without it, a + second adapter that happens to share an id with the channel + adapter would clobber the channel row. """ - cache = ChannelCache(disk_path=tmp_path / "a") + store = DiskCache(disk_path=tmp_path / "ns") + adapter = ChannelDataCache(store) try: - cache.put("chan-1", _entry(rows=4)) - cache.enable_disk(path=tmp_path / "b") - assert cache.disk_path == str(tmp_path / "b") - assert "chan-1" not in cache # fresh directory + adapter.put("c1", _entry(rows=4)) + assert "channel:c1" in store + assert "c1" not in store finally: - cache.close() + store.close() - def test_enable_disk_noop_when_same_settings(self, tmp_path) -> None: - """Re-enabling with identical settings doesn't churn the disk handle.""" - cache = ChannelCache(disk_path=tmp_path / "noop") - try: - handle_before = cache._disk - cache.enable_disk(path=tmp_path / "noop", max_bytes=ChannelCache.DEFAULT_DISK_MAX_BYTES) - assert cache._disk is handle_before - finally: - cache.close() - - def test_oversized_entry_skips_cache_preserves_neighbours(self, tmp_path) -> None: - """An entry larger than the cap is skipped without evicting peers. + def test_get_isinstance_check_filters_foreign_rows(self, tmp_path): + """A row whose value isn't a ChannelCacheEntry reads as a miss. 
- Without this guard, ``diskcache``'s cull would evict every other - row trying to fit an unfittable entry, then drop the entry itself - — the wipe-everything failure mode the bounded-cache work - originally fixed. The disk-tier guard mirrors that fix. - - Memory is sized to accept small entries but reject the oversized one - so memory-tier writes don't compete with disk-tier writes. We - assert on the disk ``_disk`` mapping directly because that's where - the contested behavior lives. - - ``disk_max_bytes`` has to leave room for ``diskcache``'s pickle - envelope around each small entry (a few KB) AND be small enough - that the oversized entry trips the guard. Half the oversized - DataFrame's raw byte size hits both constraints comfortably. + Models a corrupt entry or a key collision from another writer. + ``ChannelDataCache.get`` must isinstance-check the raw value so + callers downstream never receive the wrong shape. """ - small = _entry(rows=4) - oversized = _entry(rows=10_000) - cache = ChannelCache( - disk_path=tmp_path / "disk-oversize", - disk_max_bytes=oversized.size_bytes // 2, - ) + store = DiskCache(disk_path=tmp_path / "foreign") + adapter = ChannelDataCache(store) try: - cache.put("small-1", small) - cache.put("small-2", small) - assert cache._disk is not None - with _capture_data_warnings() as records: - cache.put("huge", oversized) - # Prior entries survive; oversized one was not written. - assert "small-1" in cache - assert "small-2" in cache - assert "huge" not in cache - assert any("larger than the disk cache cap" in r.getMessage() for r in records) + store.put("channel:c1", {"not": "an entry"}, size_bytes=64) + assert adapter.get("c1") is None finally: - cache.close() - - def test_oversized_put_drops_prior_entry(self, tmp_path) -> None: - """An oversized re-insert must drop the prior slice, not silently keep it. 
+ store.close() - Otherwise a stale subrange would masquerade as a hit on the next - ``get`` even though the caller's intent was to refresh the entry. - """ - small = _entry(rows=4) - oversized = _entry(rows=10_000) - cache = ChannelCache( - disk_path=tmp_path / "drop-prior", - disk_max_bytes=oversized.size_bytes // 2, - ) + def test_invalidate_removes_entry(self, tmp_path): + adapter = ChannelDataCache(DiskCache(disk_path=tmp_path / "inval")) try: - cache.put("chan", small) - assert "chan" in cache - cache.put("chan", oversized) - assert "chan" not in cache + adapter.invalidate("never_added") # safe before any puts + adapter.put("c1", _entry(rows=4)) + adapter.invalidate("c1") + assert "c1" not in adapter + assert adapter.get("c1") is None finally: - cache.close() + adapter.store.close() - def test_oversized_put_warns_once_per_channel(self, tmp_path) -> None: - """Repeated oversized puts for the same channel log once, not on every call. + def test_clear_is_prefix_scoped(self, tmp_path): + """``clear`` drops channel rows but leaves other adapters' rows alone. - Without dedup, every ``get_data`` for an oversized channel would - write a fresh WARNING line — quickly drowning out other signal in - the logs. + Simulates a second resource writing to the same store with a + different prefix; the channel adapter's clear must not be a + whole-store wipe. """ - oversized = _entry(rows=10_000) - cache = ChannelCache( - disk_path=tmp_path / "dedup", - disk_max_bytes=oversized.size_bytes // 2, - ) + store = DiskCache(disk_path=tmp_path / "scoped") + adapter = ChannelDataCache(store) try: - with _capture_data_warnings() as records: - for _ in range(5): - cache.put("chan", oversized) - warnings = [r for r in records if "larger than the disk cache cap" in r.getMessage()] - assert len(warnings) == 1 + adapter.put("c1", _entry(rows=4)) + adapter.put("c2", _entry(rows=4)) + # Simulate a row written by a different adapter. 
+ store.put("other:1", "foreign-value", size_bytes=64) + adapter.clear() + assert "c1" not in adapter + assert "c2" not in adapter + assert "other:1" in store finally: - cache.close() + store.close() - def test_oversized_warning_resets_after_normal_put(self, tmp_path) -> None: - """A successful normal-sized put clears the dedup bit. + def test_size_bytes_propagates_to_store(self, tmp_path): + """The adapter forwards the entry's ``size_bytes`` to the store guard. - Used by callers who narrow a time window after seeing the warning: - the next oversized regression should re-warn rather than stay silent. + Sized below the entry's actual ``size_bytes`` so the store's + oversize guard kicks in. The adapter never measures size itself; + it relies on ``_new_cache_entry`` having stamped the value. """ - small = _entry(rows=4) - oversized = _entry(rows=10_000) - cache = ChannelCache( - disk_path=tmp_path / "reset-after-normal", - disk_max_bytes=oversized.size_bytes // 2, - ) - try: - with _capture_data_warnings() as records: - cache.put("chan", oversized) # 1st warning - cache.put("chan", small) # resets state - cache.put("chan", oversized) # 2nd warning - warnings = [r for r in records if "larger than the disk cache cap" in r.getMessage()] - assert len(warnings) == 2 - finally: - cache.close() - - def test_invalidate_resets_oversized_warning(self, tmp_path) -> None: - """``invalidate`` is a fresh start; the next oversized put re-warns.""" - oversized = _entry(rows=10_000) - cache = ChannelCache( - disk_path=tmp_path / "reset-invalidate", - disk_max_bytes=oversized.size_bytes // 2, - ) + entry = _entry(rows=10_000) + store = DiskCache(disk_path=tmp_path / "size", disk_max_bytes=entry.size_bytes // 2) + adapter = ChannelDataCache(store) try: - with _capture_data_warnings() as records: - cache.put("chan", oversized) - cache.invalidate("chan") - cache.put("chan", oversized) - warnings = [r for r in records if "larger than the disk cache cap" in r.getMessage()] - assert 
len(warnings) == 2 + adapter.put("c1", entry) + assert "c1" not in adapter # oversize skipped by the store finally: - cache.close() - - def test_clear_resets_oversized_warning(self, tmp_path) -> None: - """``clear`` resets dedup state across channels.""" - oversized = _entry(rows=10_000) - cache = ChannelCache( - disk_path=tmp_path / "reset-clear", - disk_max_bytes=oversized.size_bytes // 2, - ) - try: - with _capture_data_warnings() as records: - cache.put("chan-a", oversized) - cache.put("chan-b", oversized) - cache.clear() - cache.put("chan-a", oversized) - cache.put("chan-b", oversized) - warnings = [r for r in records if "larger than the disk cache cap" in r.getMessage()] - assert len(warnings) == 4 - finally: - cache.close() - + store.close() -class TestChannelCacheClearDisk: - """``ChannelCache.clear_disk`` removes a cache dir, refuses other dirs. + def test_no_op_store_keeps_adapter_silent(self): + """An adapter on a disabled store behaves like a cold cache. - The classmethod is the source of truth that the resource-level - ``ChannelsAPIAsync.clear_data_cache_on_disk`` proxies through, so it - must be defensive against pointing at the wrong directory. 
- """ - - def test_clear_removes_directory(self, tmp_path) -> None: - path = tmp_path / "victim" - cache = ChannelCache(disk_path=path) - cache.put("chan-1", _entry(rows=4)) - cache.close() - assert path.exists() - ChannelCache.clear_disk(path) - assert not path.exists() - - def test_clear_missing_path_is_noop(self, tmp_path) -> None: - ChannelCache.clear_disk(tmp_path / "never-existed") # no raise - - def test_clear_refuses_non_diskcache_directory(self, tmp_path) -> None: - """A typo'd path with unrelated contents must not be wiped.""" - target = tmp_path / "user-stuff" - target.mkdir() - (target / "important.txt").write_text("don't delete me") - with pytest.raises(ValueError, match="does not look like a sift channel data cache"): - ChannelCache.clear_disk(target) - assert (target / "important.txt").read_text() == "don't delete me" - - def test_default_path_constant_under_tmp(self) -> None: - """Default lives under the OS tmp dir, not a user directory. - - Reads the module-level snapshot captured at import time rather than - ``ChannelCache.DEFAULT_DISK_PATH`` directly, because the autouse - ``_isolate_default_disk_cache_path`` fixture monkeypatches that - attribute for every test to keep ``/tmp`` clean. + Disabling the store is the path ``client.cache.disable_disk()`` + exercises; resources can keep their adapter reference and every + operation just no-ops. """ - import tempfile - - assert _PRODUCTION_DEFAULT_DISK_PATH.startswith(tempfile.gettempdir()) - assert _PRODUCTION_DEFAULT_DISK_PATH.endswith("sift-channel-data-cache") + adapter = ChannelDataCache(DiskCache()) + assert not adapter.store.disk_enabled + adapter.put("c1", _entry(rows=4)) + assert "c1" not in adapter + assert adapter.get("c1") is None + adapter.invalidate("c1") + adapter.clear() class TestMergePages: @@ -648,24 +412,24 @@ class TestDataLowLevelClient: :class:`TestGetChannelData`. 
""" - def test_no_cache_when_disk_path_omitted(self) -> None: - """Default construction leaves the cache in no-op mode. + def test_default_construction_uses_no_op_store(self) -> None: + """Default construction leaves the adapter wrapping a disabled store. - The ``ChannelsAPIAsync`` resource is the public surface for - opting into disk persistence; the bare ``DataLowLevelClient`` - keeps caching off so unit tests don't accidentally write to - ``/tmp`` just by instantiating the wrapper. + Resources wire the shared store in via the keyword arg; the + ``MagicMock()``-only path here keeps unit tests free of disk I/O. """ client = DataLowLevelClient(MagicMock()) - assert not client.channel_cache.disk_enabled + assert isinstance(client.channel_cache, ChannelDataCache) + assert not client.channel_cache.store.disk_enabled def test_per_instance_isolation(self, tmp_path) -> None: - """Two clients with separate disk paths must not share cache state. + """Two clients with distinct stores must not share cache state. Regression test for the original OOM bug: ``channel_cache`` was a class attribute, so every ``SiftClient`` in the process appended - to the same dict. Two fresh clients with distinct directories must - have independent caches. + to the same dict. Two fresh adapters over independent stores must + stay independent — even now that store ownership has moved to the + client, the contract is the same. 
""" client_a = _client_with_cache(tmp_path, "a") client_b = _client_with_cache(tmp_path, "b") @@ -674,23 +438,24 @@ class attribute, so every ``SiftClient`` in the process appended assert "c1" in client_a.channel_cache assert "c1" not in client_b.channel_cache finally: - client_a.channel_cache.close() - client_b.channel_cache.close() - - def test_disk_cache_kwargs_propagate(self, tmp_path) -> None: - """Constructor kwargs land on the underlying ``ChannelCache``.""" - path = tmp_path / "kwargs" - client = DataLowLevelClient( - MagicMock(), - disk_cache_path=path, - disk_cache_max_bytes=8_192, - ) + client_a.channel_cache.store.close() + client_b.channel_cache.store.close() + + def test_adapter_kwarg_propagates(self, tmp_path) -> None: + """The constructor honours an externally-constructed adapter. + + Mirrors the production wiring where ``ChannelsAPIAsync`` builds + the adapter from ``client._get_disk_cache()`` and hands it in. + """ + store = DiskCache(disk_path=tmp_path / "external", disk_max_bytes=8_192) + adapter = ChannelDataCache(store) + client = DataLowLevelClient(MagicMock(), channel_cache=adapter) try: - assert client.channel_cache.disk_enabled - assert client.channel_cache.disk_path == str(path) - assert client.channel_cache.disk_max_bytes == 8_192 + assert client.channel_cache is adapter + assert client.channel_cache.store is store + assert client.channel_cache.store.disk_max_bytes == 8_192 finally: - client.channel_cache.close() + store.close() class TestGetChannelData: @@ -766,7 +531,7 @@ async def test_cache_hit_short_circuits_grpc(self, tmp_path) -> None: ) pd.testing.assert_frame_equal(first["c1"].sort_index(), second["c1"].sort_index()) finally: - client.channel_cache.close() + client.channel_cache.store.close() @pytest.mark.asyncio async def test_partial_cache_hit_merges_cached_and_fresh(self, tmp_path) -> None: @@ -799,7 +564,7 @@ async def test_partial_cache_hit_merges_cached_and_fresh(self, tmp_path) -> None 
pd.testing.assert_frame_equal(result["c1"].sort_index(), c1_df.sort_index()) pd.testing.assert_frame_equal(result["c2"].sort_index(), c2_df.sort_index()) finally: - client.channel_cache.close() + client.channel_cache.store.close() @pytest.mark.asyncio async def test_ignore_cache_true_returns_fresh_and_skips_write(self, tmp_path) -> None: @@ -822,4 +587,4 @@ async def test_ignore_cache_true_returns_fresh_and_skips_write(self, tmp_path) - pd.testing.assert_frame_equal(result["c1"], df) assert "c1" not in client.channel_cache finally: - client.channel_cache.close() + client.channel_cache.store.close() diff --git a/python/lib/sift_client/_tests/_internal/test_disk_cache.py b/python/lib/sift_client/_tests/_internal/test_disk_cache.py new file mode 100644 index 000000000..66e57a9e6 --- /dev/null +++ b/python/lib/sift_client/_tests/_internal/test_disk_cache.py @@ -0,0 +1,343 @@ +"""Tests for :mod:`sift_client._internal.disk_cache`. + +Two classes, narrowest scope first: + +* :class:`TestDiskCache` — direct unit tests on :class:`DiskCache`: + the disabled-when-no-path no-op, fresh writes/reads, cross-session + reload, oversize guard + dedup keyed on the full namespaced key, and + the marker-guarded :meth:`DiskCache.clear_disk` classmethod. +* :class:`TestClearDisk` — the classmethod's defensive guards. + +The store is intentionally key/value-agnostic — every test treats it as +a plain ``str``-keyed dict that happens to persist across handles, with +``size_bytes`` supplied by the caller. The channel-specific adapter +(:class:`ChannelDataCache`) is exercised separately in ``test_data.py``. +""" + +from __future__ import annotations + +import logging +from contextlib import contextmanager +from typing import Iterator + +import pytest + +from sift_client._internal.disk_cache import DiskCache + +# Snapshot of the production constant captured at import time. 
The autouse +# ``_isolate_default_disk_cache_path`` fixture in ``conftest.py`` overrides +# the class attribute per test; the constant-shape test still needs the +# real value to assert against. +_PRODUCTION_DEFAULT_DISK_PATH = DiskCache.DEFAULT_DISK_PATH + + +@contextmanager +def _capture_disk_cache_warnings() -> Iterator[list[logging.LogRecord]]: + """Capture warnings emitted by the disk-cache logger directly. + + Pytest's ``caplog`` reads from the root logger, but the Sift pytest + plugin sets ``propagate=False`` on the ``sift_client`` logger when + audit logging is active, so records emitted from any descendant don't + reach the root. Attaching a list-backed handler at the leaf logger + bypasses that. + """ + target = logging.getLogger("sift_client._internal.disk_cache") + records: list[logging.LogRecord] = [] + + class _ListHandler(logging.Handler): + def emit(self, record: logging.LogRecord) -> None: + records.append(record) + + handler = _ListHandler(level=logging.WARNING) + target.addHandler(handler) + try: + yield records + finally: + target.removeHandler(handler) + + +class TestDiskCache: + """End-to-end behaviour of the shared on-disk store.""" + + def test_disabled_when_no_path(self) -> None: + """``DiskCache()`` with no ``disk_path`` is a silent no-op.""" + cache = DiskCache() + assert cache.disk_enabled is False + assert cache.disk_path is None + assert cache.disk_max_bytes is None + # Every operation no-ops; no AttributeError, no warning. 
+ cache.put("k", "v", size_bytes=4) + assert "k" not in cache + assert cache.get("k") is None + assert list(iter(cache)) == [] + cache.invalidate("k") + cache.clear() + cache.close() + + def test_fresh_cache_writes_and_reads(self, tmp_path) -> None: + """A fresh disk directory accepts writes and serves them back.""" + cache = DiskCache(disk_path=tmp_path / "fresh") + try: + assert cache.disk_enabled + assert cache.disk_path == str(tmp_path / "fresh") + assert cache.disk_max_bytes == DiskCache.DEFAULT_DISK_MAX_BYTES + cache.put("k", {"hello": "world"}, size_bytes=64) + assert "k" in cache + assert cache.get("k") == {"hello": "world"} + finally: + cache.close() + + def test_reopen_existing_dir_sees_prior_session_entries(self, tmp_path) -> None: + """Closing then reopening at the same path surfaces prior entries. + + This is the cold-start reuse guarantee: a fresh process pointing + at a populated directory reads back what an earlier process wrote. + """ + path = tmp_path / "prev-session" + session1 = DiskCache(disk_path=path) + session1.put("k", [1, 2, 3], size_bytes=24) + session1.close() + + session2 = DiskCache(disk_path=path) + try: + assert "k" in session2 + assert session2.get("k") == [1, 2, 3] + finally: + session2.close() + + def test_repeated_put_overwrites(self, tmp_path) -> None: + cache = DiskCache(disk_path=tmp_path / "overwrite") + try: + cache.put("k", "first", size_bytes=8) + cache.put("k", "second", size_bytes=8) + assert cache.get("k") == "second" + finally: + cache.close() + + def test_invalidate_removes_entry(self, tmp_path) -> None: + cache = DiskCache(disk_path=tmp_path / "inval") + try: + cache.invalidate("never_added") # safe before any puts + cache.put("k", "v", size_bytes=4) + cache.invalidate("k") + assert "k" not in cache + assert cache.get("k") is None + finally: + cache.close() + + def test_clear_wipes_store(self, tmp_path) -> None: + cache = DiskCache(disk_path=tmp_path / "clear") + try: + cache.put("a", 1, size_bytes=8) + 
cache.put("b", 2, size_bytes=8) + cache.clear() + assert "a" not in cache + assert "b" not in cache + finally: + cache.close() + + def test_iter_yields_keys(self, tmp_path) -> None: + """``__iter__`` exposes the keyspace so adapters can prefix-clear.""" + cache = DiskCache(disk_path=tmp_path / "iter") + try: + cache.put("alpha:1", 1, size_bytes=8) + cache.put("beta:1", 2, size_bytes=8) + cache.put("alpha:2", 3, size_bytes=8) + assert set(cache) == {"alpha:1", "alpha:2", "beta:1"} + finally: + cache.close() + + def test_disable_disk_closes_handle(self, tmp_path) -> None: + """Turning off disk closes the handle and silences subsequent ops.""" + cache = DiskCache(disk_path=tmp_path / "disable") + try: + cache.put("k", "v", size_bytes=4) + cache.disable_disk() + assert not cache.disk_enabled + assert cache.disk_path is None + assert "k" not in cache + assert cache.get("k") is None + cache.put("new", "x", size_bytes=4) # silently dropped + assert "new" not in cache + finally: + cache.close() + + def test_enable_disk_reconfigures_path(self, tmp_path) -> None: + """Reconfiguring to a different path closes the old handle. + + The new directory starts empty: ``k`` lived in the old directory + so the lookup at the new path misses. 
+ """ + cache = DiskCache(disk_path=tmp_path / "a") + try: + cache.put("k", "v", size_bytes=4) + cache.enable_disk(path=tmp_path / "b") + assert cache.disk_path == str(tmp_path / "b") + assert "k" not in cache + finally: + cache.close() + + def test_enable_disk_noop_when_same_settings(self, tmp_path) -> None: + """Re-enabling with identical settings doesn't churn the disk handle.""" + cache = DiskCache(disk_path=tmp_path / "noop") + try: + handle_before = cache._disk + cache.enable_disk( + path=tmp_path / "noop", max_bytes=DiskCache.DEFAULT_DISK_MAX_BYTES + ) + assert cache._disk is handle_before + finally: + cache.close() + + def test_oversized_entry_skipped_and_preserves_neighbours(self, tmp_path) -> None: + """An entry larger than the cap is skipped without evicting peers. + + Without this guard, ``diskcache``'s cull would evict every other + row trying to fit an unfittable entry, then drop the entry itself + — the wipe-everything failure mode the cache work originally fixed. + + Cap is sized to leave plenty of room for diskcache's pickle + envelope around the small entries while still being small enough + that the declared oversized ``size_bytes`` (10 MB) trips the + guard. ``size_bytes`` is the caller's contract — the store + compares that, not the actual on-disk size. 
+ """ + cap = 1 * 1024 * 1024 # 1 MiB + cache = DiskCache(disk_path=tmp_path / "oversize", disk_max_bytes=cap) + try: + cache.put("small-1", "value", size_bytes=64) + cache.put("small-2", "value", size_bytes=64) + with _capture_disk_cache_warnings() as records: + cache.put("huge", "value", size_bytes=10 * 1024 * 1024) + assert "small-1" in cache + assert "small-2" in cache + assert "huge" not in cache + assert any("larger than the disk cache cap" in r.getMessage() for r in records) + finally: + cache.close() + + def test_oversized_put_drops_prior_entry(self, tmp_path) -> None: + """An oversized re-insert must drop the prior value, not silently keep it.""" + cap = 1 * 1024 * 1024 + cache = DiskCache(disk_path=tmp_path / "drop-prior", disk_max_bytes=cap) + try: + cache.put("k", "small", size_bytes=64) + assert "k" in cache + cache.put("k", "big", size_bytes=10 * 1024 * 1024) + assert "k" not in cache + finally: + cache.close() + + def test_oversized_put_warns_once_per_key(self, tmp_path) -> None: + """Repeated oversized puts for the same key log once, not every call.""" + cap = 1 * 1024 * 1024 + cache = DiskCache(disk_path=tmp_path / "dedup", disk_max_bytes=cap) + try: + with _capture_disk_cache_warnings() as records: + for _ in range(5): + cache.put("k", "v", size_bytes=10 * 1024 * 1024) + warnings = [r for r in records if "larger than the disk cache cap" in r.getMessage()] + assert len(warnings) == 1 + finally: + cache.close() + + def test_oversized_warning_resets_after_normal_put(self, tmp_path) -> None: + """A successful normal-sized put clears the dedup bit for that key.""" + cap = 1 * 1024 * 1024 + cache = DiskCache(disk_path=tmp_path / "reset-normal", disk_max_bytes=cap) + try: + with _capture_disk_cache_warnings() as records: + cache.put("k", "v", size_bytes=10 * 1024 * 1024) # 1st warning + cache.put("k", "v", size_bytes=64) # resets state + cache.put("k", "v", size_bytes=10 * 1024 * 1024) # 2nd warning + warnings = [r for r in records if "larger than the 
disk cache cap" in r.getMessage()] + assert len(warnings) == 2 + finally: + cache.close() + + def test_dedup_keys_on_full_namespaced_key(self, tmp_path) -> None: + """Dedup is per-key, so two adapters' colliding bare ids don't share state. + + Pins the design choice that the oversize warning dedup tracks the + full namespaced key handed to ``put`` (e.g. ``channel:foo`` vs + ``calc:foo``) rather than collapsing on the bare id. Two different + prefixes for the same suffix each get their own one-shot warning. + """ + cap = 1 * 1024 * 1024 + cache = DiskCache(disk_path=tmp_path / "two-prefixes", disk_max_bytes=cap) + try: + with _capture_disk_cache_warnings() as records: + cache.put("alpha:foo", "v", size_bytes=10 * 1024 * 1024) + cache.put("beta:foo", "v", size_bytes=10 * 1024 * 1024) + warnings = [r for r in records if "larger than the disk cache cap" in r.getMessage()] + assert len(warnings) == 2 + messages = [r.getMessage() for r in warnings] + assert any("alpha:foo" in m for m in messages) + assert any("beta:foo" in m for m in messages) + finally: + cache.close() + + def test_invalidate_resets_oversized_warning(self, tmp_path) -> None: + cap = 1 * 1024 * 1024 + cache = DiskCache(disk_path=tmp_path / "reset-inval", disk_max_bytes=cap) + try: + with _capture_disk_cache_warnings() as records: + cache.put("k", "v", size_bytes=10 * 1024 * 1024) + cache.invalidate("k") + cache.put("k", "v", size_bytes=10 * 1024 * 1024) + warnings = [r for r in records if "larger than the disk cache cap" in r.getMessage()] + assert len(warnings) == 2 + finally: + cache.close() + + def test_clear_resets_oversized_warning(self, tmp_path) -> None: + cap = 1 * 1024 * 1024 + cache = DiskCache(disk_path=tmp_path / "reset-clear", disk_max_bytes=cap) + try: + with _capture_disk_cache_warnings() as records: + cache.put("a", "v", size_bytes=10 * 1024 * 1024) + cache.put("b", "v", size_bytes=10 * 1024 * 1024) + cache.clear() + cache.put("a", "v", size_bytes=10 * 1024 * 1024) + cache.put("b", "v", 
size_bytes=10 * 1024 * 1024) + warnings = [r for r in records if "larger than the disk cache cap" in r.getMessage()] + assert len(warnings) == 4 + finally: + cache.close() + + +class TestClearDisk: + """:meth:`DiskCache.clear_disk` removes a cache dir, refuses other dirs.""" + + def test_clear_removes_directory(self, tmp_path) -> None: + path = tmp_path / "victim" + cache = DiskCache(disk_path=path) + cache.put("k", "v", size_bytes=4) + cache.close() + assert path.exists() + DiskCache.clear_disk(path) + assert not path.exists() + + def test_clear_missing_path_is_noop(self, tmp_path) -> None: + DiskCache.clear_disk(tmp_path / "never-existed") # no raise + + def test_clear_refuses_non_diskcache_directory(self, tmp_path) -> None: + """A typo'd path with unrelated contents must not be wiped.""" + target = tmp_path / "user-stuff" + target.mkdir() + (target / "important.txt").write_text("don't delete me") + with pytest.raises(ValueError, match="does not look like a sift data cache"): + DiskCache.clear_disk(target) + assert (target / "important.txt").read_text() == "don't delete me" + + def test_default_path_constant_under_tmp(self) -> None: + """Default lives under the OS tmp dir, not a user directory. + + Reads the module-level snapshot rather than ``DEFAULT_DISK_PATH`` + directly because the autouse fixture monkeypatches that attribute + for every test. + """ + import tempfile + + assert _PRODUCTION_DEFAULT_DISK_PATH.startswith(tempfile.gettempdir()) + assert _PRODUCTION_DEFAULT_DISK_PATH.endswith("sift-data-cache") diff --git a/python/lib/sift_client/_tests/conftest.py b/python/lib/sift_client/_tests/conftest.py index 31aebf03a..5790a2f7a 100644 --- a/python/lib/sift_client/_tests/conftest.py +++ b/python/lib/sift_client/_tests/conftest.py @@ -11,28 +11,25 @@ @pytest.fixture(autouse=True) def _isolate_default_disk_cache_path(monkeypatch, tmp_path): - """Redirect ``ChannelCache.DEFAULT_DISK_PATH`` to a per-test tmp dir. 
- - The channel data disk cache is **opt-out** — any test that triggers the - lazy ``DataLowLevelClient`` init through ``ChannelsAPIAsync`` would - otherwise create the real ``/tmp/sift-channel-data-cache`` directory and - leak state across runs. Redirecting the default to ``tmp_path`` keeps - every test self-contained without each test having to know that the disk - tier is on by default. - - The override deliberately preserves the ``sift-channel-data-cache`` - suffix so ``TestChannelCacheClearDisk::test_default_path_constant_under_tmp`` - keeps validating the real shape of the constant. - - Importing ``ChannelCache`` here pulls in pandas, but only once per - session — fixture body still runs per-test, just the monkeypatch. + """Redirect ``DiskCache.DEFAULT_DISK_PATH`` to a per-test tmp dir. + + On-disk caching is **opt-out** — any test that triggers the lazy + ``DiskCache`` init through ``SiftClient._get_disk_cache`` would + otherwise create the real ``/tmp/sift-data-cache`` directory and leak + state across runs. Redirecting the default to ``tmp_path`` keeps every + test self-contained without each test having to know the cache is on + by default. + + The override preserves the ``sift-data-cache`` suffix so + ``TestClearDisk::test_default_path_constant_under_tmp`` keeps + validating the real shape of the constant. 
""" - from sift_client._internal.low_level_wrappers.data import ChannelCache + from sift_client._internal.disk_cache import DiskCache monkeypatch.setattr( - ChannelCache, + DiskCache, "DEFAULT_DISK_PATH", - str(tmp_path / "sift-channel-data-cache"), + str(tmp_path / "sift-data-cache"), ) diff --git a/python/lib/sift_client/_tests/resources/test_channels.py b/python/lib/sift_client/_tests/resources/test_channels.py index ceee9ddef..f337bd3f5 100644 --- a/python/lib/sift_client/_tests/resources/test_channels.py +++ b/python/lib/sift_client/_tests/resources/test_channels.py @@ -501,145 +501,3 @@ async def fake_update_channel(update): api._units_low_level_client.create_unit.assert_not_awaited() assert captured["update"].unit == "" - - -class TestEnableDataCacheDisk: - """``enable_data_cache_disk`` / ``disable_data_cache_disk`` plumb the disk - cache settings to the underlying ``ChannelCache``, both pre- and post-init. - - The cache itself is exercised directly in - ``test_data.py::TestChannelCache``; the tests here just verify the - resource-level wiring around it. - """ - - def test_enabled_by_default(self): - """Disk caching is opt-out: the default-constructed resource lands - at ``ChannelCache.DEFAULT_DISK_PATH`` on first ``get_data``. - - The autouse ``_isolate_default_disk_cache_path`` fixture in - ``conftest.py`` redirects the constant to a per-test tmp dir so this - doesn't litter the real ``/tmp``. 
- """ - from sift_client._internal.low_level_wrappers.data import ChannelCache - - api = _make_api() - api._ensure_data_low_level_client() - cache = api._data_low_level_client.channel_cache - try: - assert cache.disk_enabled - assert cache.disk_path == ChannelCache.DEFAULT_DISK_PATH - finally: - cache.close() - - def test_enable_before_lazy_init_propagates(self, tmp_path): - api = _make_api() - api.enable_data_cache_disk(path=str(tmp_path / "pre-init"), max_bytes=4096) - api._ensure_data_low_level_client() - cache = api._data_low_level_client.channel_cache - try: - assert cache.disk_enabled - assert cache.disk_path == str(tmp_path / "pre-init") - assert cache.disk_max_bytes == 4096 - finally: - cache.close() - - def test_enable_after_lazy_init_updates_live_cache(self, tmp_path): - """``disable_data_cache_disk`` → ``enable_data_cache_disk`` round-trip - on a live cache swaps the disk handle without recreating the wrapper. - """ - api = _make_api() - # Start from the disk-off state so the test exercises the "off → on" - # transition rather than "default-on → reconfigured-on". - api.disable_data_cache_disk() - api._ensure_data_low_level_client() - cache = api._data_low_level_client.channel_cache - try: - assert not cache.disk_enabled - api.enable_data_cache_disk(path=str(tmp_path / "post-init")) - assert cache.disk_enabled - assert cache.disk_path == str(tmp_path / "post-init") - finally: - cache.close() - - def test_enable_with_default_path_lands_on_default(self, monkeypatch, tmp_path): - """Calling ``enable_data_cache_disk()`` with no args uses the default path. - - Redirects ``ChannelCache.DEFAULT_DISK_PATH`` to ``tmp_path`` so the - test doesn't create the real ``/tmp/sift-channel-data-cache`` - directory. 
- """ - from sift_client._internal.low_level_wrappers.data import ChannelCache - - fake_default = str(tmp_path / "fake-default") - monkeypatch.setattr(ChannelCache, "DEFAULT_DISK_PATH", fake_default) - - api = _make_api() - api.enable_data_cache_disk() - api._ensure_data_low_level_client() - cache = api._data_low_level_client.channel_cache - try: - assert cache.disk_path == fake_default - finally: - cache.close() - - def test_disable_closes_live_disk_handle(self, tmp_path): - api = _make_api() - api.enable_data_cache_disk(path=str(tmp_path / "to-close")) - api._ensure_data_low_level_client() - cache = api._data_low_level_client.channel_cache - try: - assert cache.disk_enabled - api.disable_data_cache_disk() - assert not cache.disk_enabled - assert cache.disk_path is None - finally: - cache.close() - - def test_clear_data_cache_on_disk_proxies_to_cache(self, tmp_path): - """The resource method removes the directory by proxying to ChannelCache.""" - from sift_client._internal.low_level_wrappers.data import ChannelCache - - path = tmp_path / "to-clear" - # Populate a real disk-cache directory so the marker check passes. - cache = ChannelCache(disk_path=path) - cache.close() - assert path.exists() - - api = _make_api() - api.clear_data_cache_on_disk(path) - assert not path.exists() - - def test_default_path_failure_falls_back_to_no_cache(self, monkeypatch, tmp_path): - """If the opt-out default cache path can't be opened, the wrapper - logs a warning and continues with caching disabled. - - Simulated by pointing ``DEFAULT_DISK_PATH`` at a path that already - exists as a regular file — ``os.makedirs(..., exist_ok=True)`` raises - ``FileExistsError`` for non-directory targets. 
- """ - from sift_client._internal.low_level_wrappers.data import ChannelCache - - blocker = tmp_path / "not-a-dir" - blocker.write_text("i am a file, not a directory") - monkeypatch.setattr(ChannelCache, "DEFAULT_DISK_PATH", str(blocker)) - - api = _make_api() - api._ensure_data_low_level_client() # must not raise - cache = api._data_low_level_client.channel_cache - try: - # Cache silently dropped; ``get_data`` will go straight to the wire. - assert not cache.disk_enabled - finally: - cache.close() - - def test_explicit_path_failure_propagates(self, tmp_path): - """An explicit ``enable_data_cache_disk(path=...)`` that can't open - propagates the OSError — silent fallback would hide a user mistake. - """ - blocker = tmp_path / "not-a-dir" - blocker.write_text("i am a file, not a directory") - - api = _make_api() - api.enable_data_cache_disk(path=str(blocker)) - with pytest.raises(FileExistsError): - api._ensure_data_low_level_client() diff --git a/python/lib/sift_client/_tests/test_client_cache.py b/python/lib/sift_client/_tests/test_client_cache.py new file mode 100644 index 000000000..bb7e85279 --- /dev/null +++ b/python/lib/sift_client/_tests/test_client_cache.py @@ -0,0 +1,261 @@ +"""Tests for :mod:`sift_client._internal.cache_namespace`. + +The namespace is the user-facing surface for the shared on-disk store +that lives on the :class:`SiftClient`. Three concerns get pinned here: + +1. Default policy (opt-out: caching on at the default path) lands on + the live store on first use. +2. Pre-init configuration (``client.cache.disable_disk()`` / + ``enable_disk(path=..., max_bytes=...)`` before any resource has + touched the cache) takes effect on the lazy build. +3. Post-init reconfiguration mutates the live :class:`DiskCache` in + place rather than swapping it out — every resource adapter holds a + reference to the same store. 
+ +The single-instance-shared-across-resources invariant is the architectural +linchpin: a future second adapter must see the *same* handle as the channel +adapter so a global byte budget and LRU still apply. +""" + +from __future__ import annotations + +import pytest + +from sift_client._internal.disk_cache import DiskCache + + +def _make_client(): + """Build a SiftClient-like object with the bits the namespace needs. + + Reaching into ``sift_client.SiftClient.__init__`` requires a live gRPC + config; the namespace only touches ``_disk_cache_config`` and + ``_disk_cache``, so a tiny stand-in keeps these tests independent of + transport setup. + """ + from sift_client._internal.cache_namespace import CacheNamespace + from sift_client._internal.disk_cache_config import DiskCacheConfig + + class _StandinClient: + def __init__(self) -> None: + self._disk_cache_config = DiskCacheConfig(enabled=True) + self._disk_cache: DiskCache | None = None + self.cache = CacheNamespace(self) # type: ignore[arg-type] + + return _StandinClient() + + +# Pull the same lazy-init helper the real client uses so we exercise the +# default-path-fallback path against the live code rather than a mock. 
+def _get_disk_cache(client) -> DiskCache: + if client._disk_cache is None: + config = client._disk_cache_config + if not config.enabled: + client._disk_cache = DiskCache() + return client._disk_cache + target_path = config.path or DiskCache.DEFAULT_DISK_PATH + try: + client._disk_cache = DiskCache( + disk_path=target_path, + disk_max_bytes=config.max_bytes, + ) + except Exception: + if not config.using_default_path: + raise + client._disk_cache = DiskCache() + return client._disk_cache + + +class TestCacheNamespaceDefaults: + """Opt-out default: the namespace is on, default path, fresh start.""" + + def test_enabled_by_default(self): + """First lazy access lands at ``DiskCache.DEFAULT_DISK_PATH``.""" + client = _make_client() + store = _get_disk_cache(client) + try: + assert store.disk_enabled + assert store.disk_path == DiskCache.DEFAULT_DISK_PATH + finally: + store.close() + + def test_one_store_shared_across_lazy_calls(self): + """Re-entering ``_get_disk_cache`` returns the same handle.""" + client = _make_client() + first = _get_disk_cache(client) + second = _get_disk_cache(client) + try: + assert first is second + finally: + first.close() + + +class TestEnableDisk: + """``client.cache.enable_disk`` configures the store, pre- and post-init.""" + + def test_pre_init_path_lands_on_store(self, tmp_path): + client = _make_client() + client.cache.enable_disk(path=str(tmp_path / "pre"), max_bytes=4096) + store = _get_disk_cache(client) + try: + assert store.disk_enabled + assert store.disk_path == str(tmp_path / "pre") + assert store.disk_max_bytes == 4096 + finally: + store.close() + + def test_post_init_swap_uses_same_store_instance(self, tmp_path): + """Reconfiguring after first use mutates in place rather than re-creating. + + Every resource adapter holds a reference to ``client._disk_cache``; + if a reconfig replaced the handle, those adapters would still see + the stale one. ``DiskCache.enable_disk`` swaps the *contents* on + the same instance. 
+ """ + client = _make_client() + client.cache.disable_disk() # start from off so this is a real on transition + store = _get_disk_cache(client) + try: + assert not store.disk_enabled + client.cache.enable_disk(path=str(tmp_path / "post")) + assert client._disk_cache is store # same instance + assert store.disk_enabled + assert store.disk_path == str(tmp_path / "post") + finally: + store.close() + + def test_enable_with_default_path_lands_on_default(self, monkeypatch, tmp_path): + """``enable_disk()`` with no args uses :attr:`DEFAULT_DISK_PATH`. + + Redirects the constant so the test doesn't create the real + ``/tmp/sift-data-cache`` directory. + """ + fake_default = str(tmp_path / "fake-default") + monkeypatch.setattr(DiskCache, "DEFAULT_DISK_PATH", fake_default) + + client = _make_client() + client.cache.enable_disk() + store = _get_disk_cache(client) + try: + assert store.disk_path == fake_default + finally: + store.close() + + +class TestDisableDisk: + """``client.cache.disable_disk`` turns the live cache off.""" + + def test_disable_closes_live_handle(self, tmp_path): + client = _make_client() + client.cache.enable_disk(path=str(tmp_path / "to-close")) + store = _get_disk_cache(client) + try: + assert store.disk_enabled + client.cache.disable_disk() + assert not store.disk_enabled + assert store.disk_path is None + finally: + store.close() + + def test_disable_before_lazy_init_keeps_store_off(self, tmp_path): + """Calling disable before first use means the lazy build skips the open.""" + client = _make_client() + client.cache.disable_disk() + store = _get_disk_cache(client) + try: + assert not store.disk_enabled + finally: + store.close() + + +class TestClearDiskProxy: + """``client.cache.clear_disk`` proxies through to :meth:`DiskCache.clear_disk`.""" + + def test_clear_removes_directory(self, tmp_path): + path = tmp_path / "to-clear" + # Populate a real cache directory so the marker check passes. 
+ cache = DiskCache(disk_path=path) + cache.close() + assert path.exists() + + client = _make_client() + client.cache.clear_disk(path) + assert not path.exists() + + +class TestLazyInitFallback: + """The default-path-failure fallback used by ``SiftClient._get_disk_cache``.""" + + def test_default_path_failure_falls_back_to_no_cache(self, monkeypatch, tmp_path): + """If the default cache path can't be opened, the lazy init produces + a disabled :class:`DiskCache` rather than raising. + + Simulated by pointing ``DEFAULT_DISK_PATH`` at a path that already + exists as a regular file — ``os.makedirs(..., exist_ok=True)`` + raises ``FileExistsError`` for non-directory targets. + """ + blocker = tmp_path / "not-a-dir" + blocker.write_text("i am a file, not a directory") + monkeypatch.setattr(DiskCache, "DEFAULT_DISK_PATH", str(blocker)) + + client = _make_client() + store = _get_disk_cache(client) # must not raise + try: + assert not store.disk_enabled + finally: + store.close() + + def test_explicit_path_failure_propagates(self, tmp_path): + """An explicit path that can't be opened propagates the OSError. + + Silent fallback would hide a user mistake. + """ + blocker = tmp_path / "not-a-dir" + blocker.write_text("i am a file, not a directory") + + client = _make_client() + client.cache.enable_disk(path=str(blocker)) + with pytest.raises(FileExistsError): + _get_disk_cache(client) + + +class TestSiftClientIntegration: + """End-to-end through the real :class:`SiftClient.__init__` entry point. + + Asserts the wire-up: the namespace really lives at ``client.cache``, + the config is mutable through it, and the lazy ``_get_disk_cache`` + returns the configured store. 
+ """ + + def _make_real_client(self): + from sift_client import SiftClient, SiftConnectionConfig + + return SiftClient( + connection_config=SiftConnectionConfig( + api_key="x", + grpc_url="disabled.invalid:0", + rest_url="https://disabled.invalid", + use_ssl=False, + ) + ) + + def test_attribute_present_and_uses_real_lazy_init(self, monkeypatch, tmp_path): + fake_default = str(tmp_path / "real-client-default") + monkeypatch.setattr(DiskCache, "DEFAULT_DISK_PATH", fake_default) + + client = self._make_real_client() + store = client._get_disk_cache() + try: + assert client.cache is not None + assert store.disk_enabled + assert store.disk_path == fake_default + finally: + store.close() + + def test_disable_before_first_get_data_keeps_store_off(self): + client = self._make_real_client() + client.cache.disable_disk() + store = client._get_disk_cache() + try: + assert not store.disk_enabled + finally: + store.close() diff --git a/python/lib/sift_client/client.py b/python/lib/sift_client/client.py index 5db5bf473..2cda463f1 100644 --- a/python/lib/sift_client/client.py +++ b/python/lib/sift_client/client.py @@ -1,5 +1,10 @@ from __future__ import annotations +import logging +from typing import TYPE_CHECKING + +from sift_client._internal.cache_namespace import CacheNamespace +from sift_client._internal.disk_cache_config import DiskCacheConfig from sift_client._internal.urls import frontend_origin_for_api from sift_client.resources import ( AssetsAPI, @@ -45,6 +50,11 @@ ) from sift_client.util.util import AsyncAPIs +if TYPE_CHECKING: + from sift_client._internal.disk_cache import DiskCache + +logger = logging.getLogger(__name__) + class SiftClient( WithGrpcClient, @@ -126,6 +136,9 @@ class SiftClient( data_import: DataImportAPI """Instance of the Data Import API for making synchronous requests.""" + cache: CacheNamespace + """Surface for the shared on-disk cache used by every cache-aware resource.""" + async_: AsyncAPIs """Accessor for the asynchronous APIs. 
All asynchronous APIs are available as attributes on this accessor.""" @@ -180,6 +193,14 @@ def __init__( # pytest plugin's ``--sift-disabled`` mode. self._simulate: bool = False + # Shared on-disk cache: user intent in ``_disk_cache_config`` (opt-out + # default), live handle in ``_disk_cache`` (lazy so importing this + # module doesn't pay the diskcache cost up front). The + # ``client.cache`` namespace mutates both. + self._disk_cache_config = DiskCacheConfig(enabled=True) + self._disk_cache: DiskCache | None = None + self.cache = CacheNamespace(self) + self.ping = PingAPI(self) self.assets = AssetsAPI(self) self.calculated_channels = CalculatedChannelsAPI(self) @@ -231,6 +252,52 @@ def rest_client(self) -> RestClient: """The REST client used by the SiftClient for making REST API calls.""" return self._rest_client + def _get_disk_cache(self) -> DiskCache: + """Lazy accessor for the shared on-disk cache. Internal to resources. + + The cache is built on first use so that importing ``sift_client`` + doesn't pay the ``diskcache``/``sqlite`` cost up front. The opt-out + default ("disk caching on at the temp-dir path") is applied here, + along with the silent-fallback-on-default-path failure: if the + user left :class:`DiskCacheConfig` at its defaults and opening + fails (read-only ``/tmp``, restricted container, ...), we log a + warning and return a disabled :class:`DiskCache` so resources can + still serve requests by going to the wire. An explicit user- + supplied path that can't be opened propagates so the caller knows + their request didn't take. + + After the first call this just returns the memoized handle. + Subsequent ``client.cache.enable_disk(...)`` calls mutate the + existing handle in place; this method is not re-entered. 
+ """ + if self._disk_cache is None: + from sift_client._internal.disk_cache import DiskCache + + config = self._disk_cache_config + if not config.enabled: + self._disk_cache = DiskCache() + return self._disk_cache + target_path = config.path or DiskCache.DEFAULT_DISK_PATH + try: + self._disk_cache = DiskCache( + disk_path=target_path, + disk_max_bytes=config.max_bytes, + ) + except Exception: + if not config.using_default_path: + raise + logger.warning( + "Could not open the default sift data cache at %r; " + "falling back to no caching. Call " + "``client.cache.disable_disk()`` to silence this " + "warning, or pass an explicit path via " + "``client.cache.enable_disk(path=...)``.", + target_path, + exc_info=True, + ) + self._disk_cache = DiskCache() + return self._disk_cache + @property def app_url(self) -> str | None: """The Sift web-app origin for this client, or None if it can't be determined. diff --git a/python/lib/sift_client/resources/channels.py b/python/lib/sift_client/resources/channels.py index 6be88f84e..df5d218a9 100644 --- a/python/lib/sift_client/resources/channels.py +++ b/python/lib/sift_client/resources/channels.py @@ -1,9 +1,7 @@ from __future__ import annotations -import logging from typing import TYPE_CHECKING -from sift_client._internal.disk_cache_config import DiskCacheConfig from sift_client._internal.low_level_wrappers.channels import ChannelsLowLevelClient from sift_client._internal.low_level_wrappers.units import UnitsLowLevelClient from sift_client.resources._base import ResourceBase @@ -13,7 +11,6 @@ from sift_client.util import cel_utils as cel if TYPE_CHECKING: - import os import re from datetime import datetime @@ -22,8 +19,6 @@ from sift_client.client import SiftClient -logger = logging.getLogger(__name__) - def _channel_ids_from_list(items: list[str | Channel]) -> list[str]: """Resolve a list of channel IDs or Channel objects to a list of channel IDs. 
@@ -69,89 +64,6 @@ def __init__(self, sift_client: SiftClient): self._low_level_client = ChannelsLowLevelClient(grpc_client=self.client.grpc_client) self._units_low_level_client = UnitsLowLevelClient(grpc_client=self.client.grpc_client) self._data_low_level_client = None - self._disk_cache_config = DiskCacheConfig(enabled=True) - - def enable_data_cache_disk( - self, - *, - path: str | os.PathLike[str] | None = None, - max_bytes: int | None = None, - ) -> None: - """Configure (or re-enable after ``disable_data_cache_disk``) the disk cache. - - Disk persistence is **on by default** at ``ChannelCache.DEFAULT_DISK_PATH``; - use this method when you want to override the path or size, or to turn - the cache back on after a prior ``disable_data_cache_disk`` call. - - Each entry that ``get_data`` returns is written to the cache and read - back on subsequent calls, even after process restart. The default - path lives under ``tempfile.gettempdir()`` and is shared across - sessions, so a re-run of the same workload picks up previously-cached - windows without a fetch. - - Safe to call before or after the first ``get_data``. Reconfiguring - (different ``path`` or ``max_bytes``) closes the previous handle and - opens a new one. - - An explicit ``path`` that can't be opened (e.g. permission denied, - read-only filesystem) raises so the caller knows the request didn't - take. The default-path open does *not* raise — see - ``_ensure_data_low_level_client`` for the silent fall-back behaviour. - - Args: - path: Directory to persist the cache to. ``None`` (the default) - uses ``ChannelCache.DEFAULT_DISK_PATH``. Existing entries at - the path become available as cache hits. - max_bytes: Byte cap on disk usage. ``None`` uses - ``ChannelCache.DEFAULT_DISK_MAX_BYTES`` (4 GiB). When the - bound is reached, ``diskcache``'s LRU eviction takes over. 
- - Example: - client.channels.enable_data_cache_disk(path="/data/sift-cache") - client.channels.enable_data_cache_disk(max_bytes=1024 ** 3) # 1 GiB - """ - self._disk_cache_config.enable(path=path, max_bytes=max_bytes) - if self._data_low_level_client is not None: - self._data_low_level_client.channel_cache.enable_disk(path=path, max_bytes=max_bytes) - - def disable_data_cache_disk(self) -> None: - """Opt out of caching for ``get_data`` (no reads or writes). - - Caching is on by default; call this when you don't want any cached - data written to or read from disk. Closes any open cache file - handle. The on-disk directory is NOT deleted — use - :meth:`clear_data_cache_on_disk` to wipe it. - """ - self._disk_cache_config.disable() - if self._data_low_level_client is not None: - self._data_low_level_client.channel_cache.disable_disk() - - def clear_data_cache_on_disk(self, path: str | os.PathLike[str] | None = None) -> None: - """Delete a previously-persisted on-disk channel data cache directory. - - Drops stale caches from previous sessions, recovers from a corrupt - cache, or reclaims disk space. Removes the directory entirely; if disk - persistence is on, the next ``get_data`` re-opens an empty cache at - the same path. - - This is a thin proxy around - :meth:`ChannelCache.clear_disk ` - — exposed on the resource so callers don't need to reach into - ``_internal`` modules. The underlying classmethod is also reachable - directly (``ChannelCache.clear_disk(...)``) if the caller doesn't have - a ``SiftClient`` handy. - - Args: - path: Directory of the cache to clear. ``None`` (the default) - targets ``ChannelCache.DEFAULT_DISK_PATH``. - - Raises: - ValueError: If ``path`` exists but does not look like a sift - channel data cache directory. 
- """ - from sift_client._internal.low_level_wrappers.data import ChannelCache - - ChannelCache.clear_disk(path) async def get( self, @@ -331,45 +243,19 @@ def _ensure_data_low_level_client(self): """Ensure that the data low level client is initialized. Separated out like this to not require large dependencies (pandas/pyarrow) for the client if not fetching data.""" if self._data_low_level_client is None: from sift_client._internal.low_level_wrappers.data import ( - ChannelCache, + ChannelDataCache, DataLowLevelClient, ) - kwargs: dict = {} - disk_config = self._disk_cache_config - if disk_config.enabled: - # ``disk_path=None`` means "no cache" to ChannelCache; substitute - # the default explicitly so the opt-out default still opens - # the cache. ``DEFAULT_DISK_PATH`` is read here (not at - # config construction) so test fixtures that monkeypatch the - # class attribute see the override. - kwargs["disk_cache_path"] = disk_config.path or ChannelCache.DEFAULT_DISK_PATH - if disk_config.max_bytes is not None: - kwargs["disk_cache_max_bytes"] = disk_config.max_bytes - try: - self._data_low_level_client = DataLowLevelClient( - grpc_client=self.client.grpc_client, - **kwargs, - ) - except Exception: - # Explicit user-supplied paths failures propagate so the - # caller knows their request didn't take. Default-path failures - # (read-only ``/tmp``, restricted containers, etc.) degrade - # silently to no-cache mode so ``get_data`` still works. - if not disk_config.using_default_path: - raise - logger.warning( - "Could not open the default channel data cache at %r; " - "falling back to no caching for ``get_data``. 
Call " - "``client.channels.disable_data_cache_disk()`` to silence " - "this warning, or pass an explicit path via " - "``enable_data_cache_disk(path=...)``.", - kwargs.get("disk_cache_path"), - exc_info=True, - ) - self._data_low_level_client = DataLowLevelClient( - grpc_client=self.client.grpc_client, - ) + # The shared on-disk store lives on the client; we just wrap it + # in the channel-side adapter. Cache configuration (enable / + # disable / clear / path / max_bytes) is owned by + # ``client.cache`` — there's no resource-level knob anymore. + store = self.client._get_disk_cache() + self._data_low_level_client = DataLowLevelClient( + grpc_client=self.client.grpc_client, + channel_cache=ChannelDataCache(store), + ) async def get_data( self, diff --git a/python/lib/sift_client/resources/sync_stubs/__init__.pyi b/python/lib/sift_client/resources/sync_stubs/__init__.pyi index cc3ec914f..c37c3aed3 100644 --- a/python/lib/sift_client/resources/sync_stubs/__init__.pyi +++ b/python/lib/sift_client/resources/sync_stubs/__init__.pyi @@ -5,7 +5,6 @@ from __future__ import annotations from typing import TYPE_CHECKING if TYPE_CHECKING: - import os import re from datetime import datetime, timedelta from pathlib import Path @@ -453,79 +452,6 @@ class ChannelsAPI: """ ... - def clear_data_cache_on_disk(self, path: str | os.PathLike[str] | None = None) -> None: - """Delete a previously-persisted on-disk channel data cache directory. - - Drops stale caches from previous sessions, recovers from a corrupt - cache, or reclaims disk space. Removes the directory entirely; if disk - persistence is on, the next ``get_data`` re-opens an empty cache at - the same path. - - This is a thin proxy around - :meth:`ChannelCache.clear_disk ` - — exposed on the resource so callers don't need to reach into - ``_internal`` modules. The underlying classmethod is also reachable - directly (``ChannelCache.clear_disk(...)``) if the caller doesn't have - a ``SiftClient`` handy. 
- - Args: - path: Directory of the cache to clear. ``None`` (the default) - targets ``ChannelCache.DEFAULT_DISK_PATH``. - - Raises: - ValueError: If ``path`` exists but does not look like a sift - channel data cache directory. - """ - ... - - def disable_data_cache_disk(self) -> None: - """Opt out of caching for ``get_data`` (no reads or writes). - - Caching is on by default; call this when you don't want any cached - data written to or read from disk. Closes any open cache file - handle. The on-disk directory is NOT deleted — use - :meth:`clear_data_cache_on_disk` to wipe it. - """ - ... - - def enable_data_cache_disk( - self, *, path: str | os.PathLike[str] | None = None, max_bytes: int | None = None - ) -> None: - """Configure (or re-enable after ``disable_data_cache_disk``) the disk cache. - - Disk persistence is **on by default** at ``ChannelCache.DEFAULT_DISK_PATH``; - use this method when you want to override the path or size, or to turn - the cache back on after a prior ``disable_data_cache_disk`` call. - - Each entry that ``get_data`` returns is written to the cache and read - back on subsequent calls, even after process restart. The default - path lives under ``tempfile.gettempdir()`` and is shared across - sessions, so a re-run of the same workload picks up previously-cached - windows without a fetch. - - Safe to call before or after the first ``get_data``. Reconfiguring - (different ``path`` or ``max_bytes``) closes the previous handle and - opens a new one. - - An explicit ``path`` that can't be opened (e.g. permission denied, - read-only filesystem) raises so the caller knows the request didn't - take. The default-path open does *not* raise — see - ``_ensure_data_low_level_client`` for the silent fall-back behaviour. - - Args: - path: Directory to persist the cache to. ``None`` (the default) - uses ``ChannelCache.DEFAULT_DISK_PATH``. Existing entries at - the path become available as cache hits. - max_bytes: Byte cap on disk usage. 
``None`` uses - ``ChannelCache.DEFAULT_DISK_MAX_BYTES`` (4 GiB). When the - bound is reached, ``diskcache``'s LRU eviction takes over. - - Example: - client.channels.enable_data_cache_disk(path="/data/sift-cache") - client.channels.enable_data_cache_disk(max_bytes=1024 ** 3) # 1 GiB - """ - ... - def find(self, **kwargs) -> Channel | None: """Find a single channel matching the given query. Takes the same arguments as `list`. If more than one channel is found, raises an error. From 16b62a15564975288aea9faa04758ea186307e42 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 25 Jun 2026 23:53:24 -0700 Subject: [PATCH 14/14] fmt --- python/lib/sift_client/_tests/_internal/test_disk_cache.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/lib/sift_client/_tests/_internal/test_disk_cache.py b/python/lib/sift_client/_tests/_internal/test_disk_cache.py index 66e57a9e6..5711580a6 100644 --- a/python/lib/sift_client/_tests/_internal/test_disk_cache.py +++ b/python/lib/sift_client/_tests/_internal/test_disk_cache.py @@ -182,9 +182,7 @@ def test_enable_disk_noop_when_same_settings(self, tmp_path) -> None: cache = DiskCache(disk_path=tmp_path / "noop") try: handle_before = cache._disk - cache.enable_disk( - path=tmp_path / "noop", max_bytes=DiskCache.DEFAULT_DISK_MAX_BYTES - ) + cache.enable_disk(path=tmp_path / "noop", max_bytes=DiskCache.DEFAULT_DISK_MAX_BYTES) assert cache._disk is handle_before finally: cache.close()