From a80b556c15109945849d3013ba62798c48cf3252 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Tue, 28 Apr 2026 18:45:37 +0200 Subject: [PATCH 01/45] docs(specs): spec files for adding scheduled provider indexing and normalization strategy --- .../00-scheduled-provider-indexing.md | 195 ++++++++++++++ .../01-normalization-and-id-mapping.md | 237 ++++++++++++++++++ .../017-provider-catalog-index/README.md | 4 + 3 files changed, 436 insertions(+) create mode 100644 internal/specs/017-provider-catalog-index/00-scheduled-provider-indexing.md create mode 100644 internal/specs/017-provider-catalog-index/01-normalization-and-id-mapping.md create mode 100644 internal/specs/017-provider-catalog-index/README.md diff --git a/internal/specs/017-provider-catalog-index/00-scheduled-provider-indexing.md b/internal/specs/017-provider-catalog-index/00-scheduled-provider-indexing.md new file mode 100644 index 00000000..78c235e3 --- /dev/null +++ b/internal/specs/017-provider-catalog-index/00-scheduled-provider-indexing.md @@ -0,0 +1,195 @@ +# 017 Provider Catalog Index - Scheduled Provider Indexing + +## Goal + +Move provider discovery work out of the request path and into a scheduled indexing pipeline that builds and refreshes a local SQLite-backed catalog. + +The implementation must remain modular and provider-scoped so that changing one provider does not create cross-provider regressions. + +## Problem Statement + +Current search and `tvsearch` behavior still performs meaningful live work on demand: + +- provider title lookup may need a live or warm index refresh +- season discovery can fall back to sequential live probing +- episode language discovery can require provider page fetches +- availability and quality cache entries are populated reactively per request + +This keeps request latency variable and makes cold or broad searches expensive. 
+ +## Proposed Direction + +Instead of discovering provider data only when Sonarr or Radarr asks for it, AniBridge should periodically walk each supported provider catalog and persist a local index of: + +- titles available on that provider +- provider-specific slugs and URL suffixes +- provider-specific season and episode structure +- provider-side language availability per episode +- provider-side movie entries for providers that expose them +- lightweight provider or host hints when they are available without deep resolution + +The request path should query SQLite first and should not perform request-time targeted refresh of missing titles, seasons, or episodes. + +## Version One Scope + +Version one should index the full provider catalog surface that AniBridge can serve: + +- all enabled providers +- all series on those providers +- all episodes of those series +- all provider-exposed movies + +Each provider should have its own indexing implementation and refresh handling behind a shared orchestration contract. + +## Refresh Model + +Use a background refresh pipeline, not a blocking full recrawl during normal operation. 
+ +### Startup behavior + +- app startup should initialize the refresh scheduler +- app startup should check whether the local provider index is missing or stale +- if no usable index exists yet, catalog-dependent interactions should fail until the first complete index build for all enabled providers completes +- blocked catalog-dependent requests should return a suitable HTTP error, include a short explanation, and log the reason clearly +- if an older index already exists, AniBridge should continue serving it while a replacement index is built in parallel + +### Recurring behavior + +- run a scheduled refresh every 24 hours by default +- make refresh cadence configurable through environment variables +- support provider-specific refresh intervals through environment variables +- do not perform request-time targeted refresh for one title, season, or episode + +## Scope of the Indexed Data + +Persist data that is relatively stable and expensive to recompute: + +- provider title lists +- provider slug mappings +- provider URL suffixes or paths +- provider season and episode existence +- provider language availability +- provider-to-canonical entity mappings +- last indexed timestamps and refresh status + +Do not persist data that is short-lived or operationally volatile as part of the main index: + +- final direct media URLs +- temporary redirect URLs +- host redirect chains +- full probe payloads +- large raw HTML responses +- response headers from resolved stream URLs + +These should remain short-TTL operational caches, not permanent catalog records. 
+ +## Provider Crawl Strategy + +### Title discovery + +For each enabled provider: + +- load the provider-wide title index or search endpoint equivalent +- persist one row per provider title entry +- persist stable provider identifiers such as slug, URL suffix or path, and media type hints + +### Episode discovery + +For each provider title: + +- enumerate seasons and episodes from the provider structure +- persist provider-native season and episode coordinates +- persist provider language availability +- persist only lightweight host or provider hints when they are available without deep resolution +- do not resolve deeper into ephemeral redirect or direct stream URLs + +### Refresh checkpoints + +Track per-provider and per-title progress so the crawler can resume safely after restart: + +- last successful provider refresh timestamp +- last successful title refresh timestamp +- cursor or page state where applicable +- failure count and last error summary + +## Request Path Implications + +After this feature, the request path should prefer: + +1. local provider catalog index +2. local canonical mapping tables +3. short-TTL operational caches + +Expected effects: + +- lower `tvsearch` latency +- fewer provider page fetches on search +- fewer sequential live probes for season discovery +- more deterministic behavior under repeated Sonarr scans + +Misses caused by absent or unresolved index data should not trigger ad-hoc live re-indexing. + +## Anti-Bot and Rate-Limit Considerations + +Full indexing increases background traffic and must be bounded carefully. + +Required controls: + +- per-provider concurrency limits +- global crawler concurrency limits +- retry with backoff on `429`, `403`, and transient upstream failures +- refresh checkpoints to avoid restarting whole crawls + +The system should assume that provider anti-bot behavior may change over time. + +## Storage Strategy + +SQLite growth is acceptable only if the data is normalized aggressively. 
+ +Safe principles: + +- store canonical entities once +- store provider mappings separately +- store availability as compact, current-state rows +- avoid duplicating long text or JSON across episode rows +- store current state only, not history, unless a later feature requires it + +Optimize for performance and functionality first while keeping the database ideally under 1 GB and at most around 5 GB when justified. + +Potentially expensive data should be excluded from the permanent catalog unless there is a clear request-path need. + +## Refresh Semantics + +- refreshed index data should overwrite the previous current-state rows +- if a provider refresh fails, AniBridge may continue serving the older indexed data +- if a provider base URL changes, persisted provider-relative suffixes or paths should remain valid when reassembled against the current base URL +- if a title, episode, or movie was present in the previous successful index but is absent from the next successful refresh, AniBridge should delete that stale row during the refresh replacement process + +## Suggested Deliverables + +- new refresh scheduler surface for provider indexing +- persistent provider catalog tables and migrations +- refresh status and checkpoint tables +- operational metrics for index freshness, crawl duration, and fallback rate +- clear provider-specific indexing boundaries in the codebase + +## Non-Goals + +- replacing SQLite at this stage +- storing every transient stream resolution artifact permanently +- request-time targeted re-indexing of cache misses +- tightly coupling provider implementations to each other + +## Selected Decisions + +- all enabled providers should be indexed +- version one should index full provider coverage for series, episodes, and provider-exposed movies +- refresh cadence should default to 24 hours and remain configurable through environment variables, including provider-specific overrides +- if no index exists yet, catalog-dependent routes should fail 
until bootstrap indexing completes +- first bootstrap indexing should be complete and fully blocking before catalog-dependent routes become usable +- if an index exists but is stale, AniBridge should keep serving the old index while a new one is built in parallel +- request-time targeted refresh is explicitly out of scope +- indexing should include only non-ephemeral, non-probe-intensive data +- provider URLs should be stored as provider-relative suffixes or paths, not as full base URLs +- current-state data should be overwritten on refresh +- entries missing from the next successful refresh should be deleted at refresh time diff --git a/internal/specs/017-provider-catalog-index/01-normalization-and-id-mapping.md b/internal/specs/017-provider-catalog-index/01-normalization-and-id-mapping.md new file mode 100644 index 00000000..644cc746 --- /dev/null +++ b/internal/specs/017-provider-catalog-index/01-normalization-and-id-mapping.md @@ -0,0 +1,237 @@ +# 017 Provider Catalog Index - Normalization and ID Mapping + +## Goal + +Define a canonical mapping layer so AniBridge can match provider-specific series, seasons, episodes, and movies to the metadata models expected by Sonarr and Radarr with a higher hit rate than the current on-the-fly approach. + +## Problem Statement + +Provider catalogs do not align cleanly with Sonarr and Radarr expectations: + +- provider titles may differ from canonical metadata titles +- aliases and localized names vary +- specials and extras often use provider-specific numbering +- provider season and episode structures may not match TVDB ordering +- some providers expose only partial identifiers or no canonical IDs at all + +Today this mismatch is handled by a mix of live title matching, special-case mapping, and request-time probing. That limits correctness and repeatability. + +## Canonical Metadata Targets + +### Sonarr-facing canonical model + +Use TVDB-style series and episode numbering as the primary canonical model for TV content. 
+ +Persist at least: + +- `tvdb_id` +- canonical series title +- canonical season number +- canonical episode number + +Secondary IDs are optional and should be stored only when they materially improve matching: + +- `tmdb_id` +- `imdb_id` +- `tvmaze_id` +- `anilist_id` +- `mal_id` + +The schema should be designed so AniBridge can support additional non-provider-native canonical orderings later when they improve Sonarr compatibility. +When these secondary TV identifiers are cheap to obtain and do not add disproportionate engineering complexity, AniBridge should persist them. + +### Radarr-facing canonical model + +Use TMDb as the primary canonical model for movies. + +Persist at least: + +- `tmdb_id` +- canonical movie title +- release year + +Optional secondary IDs: + +- `imdb_id` +- `tvdb_id` + +For version one, TMDb should be treated as the authoritative movie identity. Secondary IDs may be null or omitted when they do not materially help. + +## Mapping Layers + +Split the model into explicit layers. + +### Canonical entity layer + +Stores metadata-system identity independent of any provider: + +- canonical series +- canonical seasons +- canonical episodes +- canonical movies + +### Provider entity layer + +Stores how a provider represents the same title: + +- provider title row +- provider slug +- provider URL suffix or path +- provider media type hint +- provider-native season and episode coordinates +- provider-native language availability + +### Mapping layer + +Stores the relationship between provider entities and canonical entities: + +- provider title -> canonical series or movie +- provider episode -> canonical episode +- provider special or film entry -> canonical special episode or movie +- mapping confidence +- mapping source and last verification time + +## Matching Strategy + +Use a staged mapping strategy rather than a single fuzzy-title match. + +### Preferred match order for series + +1. 
explicit canonical ID from provider metadata, if available +2. existing confirmed local mapping +3. exact title or alias match against canonical metadata +4. constrained fuzzy title match +5. unresolved or low-confidence automatic mapping + +### Preferred match order for episodes + +1. existing confirmed provider episode mapping +2. direct canonical numbering alignment +3. provider special or extra mapping rules +4. alias-based remap to canonical special or extra episode +5. unresolved best-effort automatic mapping + +### Preferred match order for movies + +1. explicit `tmdb_id` +2. explicit `imdb_id` +3. title plus year exact match +4. constrained fuzzy match + +## Specials, Extras, and Films + +Specials should be first-class mapping records, not ad-hoc request-time exceptions. + +Store: + +- provider source season and episode +- canonical alias season and episode when TV-mapped +- mapping rationale or source +- confidence flag + +This allows AniBridge to answer Sonarr requests consistently even when the provider exposes the content under `film-N`, season `0`, or another non-canonical structure. + +If provider content is clearly represented as a film or movie rather than a TV special, AniBridge should preserve that distinction and map it into the movie model for Radarr instead of force-normalizing it into TV season `0`. + +## Confidence and Verification + +Not all mappings are equally trustworthy. + +Suggested states: + +- `confirmed` +- `high_confidence` +- `low_confidence` +- `unresolved` +- `conflict` + +Why this matters: + +- request path can trust confirmed mappings immediately +- low-confidence mappings can be eligible for background re-check +- conflicting mappings can be excluded from automatic response emission + +Version one may still emit low-confidence matches on a best-effort basis when no better candidate exists. 
+If one provider episode plausibly maps to multiple canonical episodes, AniBridge should emit all plausible matches rather than suppressing output. + +## Request Path Usage + +When Sonarr or Radarr requests content: + +1. resolve the request to the canonical ID model expected by the client +2. query the local mapping tables for matching provider entities +3. return provider-backed results derived from confirmed or sufficiently strong mappings +4. do not perform request-time targeted metadata enrichment just because the local mapping is absent + +For generic AniBridge search that is not explicitly Sonarr or Radarr ID-driven, provider titles may still be returned even when canonical enrichment is not yet complete. + +This should raise both: + +- probability of finding the correct title +- probability of returning the correct season and episode numbering + +## Suggested Data to Persist + +Persist: + +- canonical IDs +- canonical titles and aliases +- provider slugs and URL suffixes or paths +- provider-native season and episode coordinates +- mapping confidence and timestamps +- language availability per mapped episode + +Prefer compact persistence: + +- provider-relative URL suffixes or paths instead of full base URLs +- current-state records instead of history tables +- only those secondary IDs that materially improve Sonarr or Radarr matching + +Avoid persisting in the canonical mapping tables: + +- raw HTML +- large API payloads +- direct stream URLs +- large probe artifacts + +## Failure Modes to Plan For + +- one provider title maps to multiple plausible canonical series +- one provider episode maps to multiple plausible canonical episodes +- one canonical episode is split or merged differently by a provider +- specials are exposed as films instead of episodes +- provider title aliases drift over time +- canonical metadata source updates numbering after the initial mapping + +The design should support re-mapping without rebuilding the entire catalog from 
scratch. + +Canonical metadata changes should remap automatically on the next successful refresh. + +## Recommended Outcome + +The indexing feature and this mapping layer should ship together conceptually: + +- indexing without canonical mapping improves speed but not match quality enough +- canonical mapping without scheduled indexing improves logic but still leaves too much live work + +Combined, they create a local database that is both faster and more compatible with Sonarr and Radarr than the current request-driven approach. + +Low-confidence or conflicting mappings do not need to be surfaced to operators in version one, but the persistence model should not prevent a later web-UI override layer. + +Provider-specific alias tables should be captured during provider crawl whenever the provider exposes usable alias data directly. This keeps provider-specific naming close to the source and reduces later enrichment work. + +## Selected Decisions + +- canonical TV mapping should prioritize Sonarr-friendly orderings and remain extensible beyond only TVDB ordering +- canonical movie mapping should prioritize TMDb and may omit or null secondary IDs when they do not materially help +- secondary TV identifiers such as `tvmaze_id`, `anilist_id`, and `mal_id` should be persisted when they are cheap to obtain and do not add significant engineering complexity +- ambiguous anime sequel, remake, split-cour, and similar cases should use best-effort automatic mapping in version one +- low-confidence mappings may still be used best-effort +- if one provider episode has multiple plausible canonical matches, AniBridge should return all plausible matches +- content clearly represented as films should stay in the movie domain rather than being forced into TV specials +- operator review flows for low-confidence mappings can be deferred +- the persistence model should leave room for future manual override support through the planned web UI +- automatic re-mapping should occur on later 
refreshes when canonical metadata changes +- stale mapped data may continue to be served if refresh is temporarily blocked upstream +- provider titles may still be returned for generic search even when canonical enrichment is incomplete +- provider-specific alias tables should be captured during provider crawl when feasible diff --git a/internal/specs/017-provider-catalog-index/README.md b/internal/specs/017-provider-catalog-index/README.md new file mode 100644 index 00000000..771ce377 --- /dev/null +++ b/internal/specs/017-provider-catalog-index/README.md @@ -0,0 +1,4 @@ +# 017 Provider Catalog Index - Spec Index + +- `00-scheduled-provider-indexing.md` - scheduled full-catalog indexing and refresh strategy +- `01-normalization-and-id-mapping.md` - canonical ID mapping and provider-to-Sonarr/Radarr normalization strategy From 9c084aca761f3eba9cc45d452544388501a182b3 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Tue, 28 Apr 2026 20:33:13 +0200 Subject: [PATCH 02/45] feat(api): add provider catalog index and bootstrap progress Implement the scheduled provider catalog index and canonical mapping layer for AniBridge. Move provider catalog discovery out of the request path and into a persistent SQLite-backed index with bootstrap gating, provider refresh status tracking, and canonical TV episode mappings. Update Torznab search and tvsearch to serve indexed provider and mapping data instead of triggering request-time title refresh or episode probing. Expose bootstrap visibility through health endpoints and terminal progress output so first-time catalog builds are easier to monitor. 
--- apps/api/.env.example | 29 + apps/api/app/api/health.py | 12 +- apps/api/app/api/torznab/api.py | 449 ++++++--- apps/api/app/catalog/__init__.py | 11 + apps/api/app/catalog/exceptions.py | 5 + apps/api/app/catalog/indexer.py | 532 ++++++++++ apps/api/app/catalog/metadata.py | 139 +++ apps/api/app/catalog/providers.py | 385 ++++++++ apps/api/app/config.py | 62 ++ apps/api/app/core/lifespan.py | 9 + .../20260428_0004_provider_catalog_index.py | 605 ++++++++++++ apps/api/app/db/models.py | 916 ++++++++++++++++++ apps/api/app/utils/terminal.py | 24 +- apps/api/app/utils/title_resolver.py | 45 + 14 files changed, 3105 insertions(+), 118 deletions(-) create mode 100644 apps/api/app/catalog/__init__.py create mode 100644 apps/api/app/catalog/exceptions.py create mode 100644 apps/api/app/catalog/indexer.py create mode 100644 apps/api/app/catalog/metadata.py create mode 100644 apps/api/app/catalog/providers.py create mode 100644 apps/api/app/db/migrations/versions/20260428_0004_provider_catalog_index.py diff --git a/apps/api/.env.example b/apps/api/.env.example index 37f298ab..3ea36d86 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -78,6 +78,35 @@ MEGAKINO_BASE_URL=https://megakino1.to # Default: 12 MEGAKINO_TITLES_REFRESH_HOURS=12 +## Scheduled Provider Catalog Index +# What: Default refresh cadence (hours) for the persistent provider catalog index +# Default: 24 +PROVIDER_INDEX_REFRESH_HOURS=24 +# What: Provider-specific refresh cadence override for AniWorld +# Default: PROVIDER_INDEX_REFRESH_HOURS +PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD=24 +# What: Provider-specific refresh cadence override for s.to +# Default: PROVIDER_INDEX_REFRESH_HOURS +PROVIDER_INDEX_REFRESH_HOURS_STO=24 +# What: Provider-specific refresh cadence override for megakino +# Default: PROVIDER_INDEX_REFRESH_HOURS +PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO=24 +# What: Scheduler poll interval in seconds for checking due provider refreshes +# Default: 60 
+PROVIDER_INDEX_SCHEDULER_POLL_SECONDS=60 +# What: Maximum number of provider refreshes allowed to run in parallel +# Default: 1 +PROVIDER_INDEX_GLOBAL_CONCURRENCY=1 +# What: Per-provider crawl worker count for AniWorld title refreshes +# Default: 1 +PROVIDER_INDEX_CONCURRENCY_ANIWORLD=1 +# What: Per-provider crawl worker count for s.to title refreshes +# Default: 1 +PROVIDER_INDEX_CONCURRENCY_STO=1 +# What: Per-provider crawl worker count for megakino title refreshes +# Default: 1 +PROVIDER_INDEX_CONCURRENCY_MEGAKINO=1 + # What: Domain check interval in minutes (0 disables background checks) # Default: 100 MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN=100 diff --git a/apps/api/app/api/health.py b/apps/api/app/api/health.py index 29526a0a..49f996bb 100644 --- a/apps/api/app/api/health.py +++ b/apps/api/app/api/health.py @@ -2,9 +2,19 @@ from fastapi import APIRouter +from app.catalog import get_catalog_indexer + router = APIRouter() @router.get("/health") async def healthcheck(): - return {"status": "ok"} + return { + "status": "ok", + "catalog": get_catalog_indexer().get_progress_snapshot(), + } + + +@router.get("/health/catalog") +async def catalog_healthcheck(): + return get_catalog_indexer().get_progress_snapshot() diff --git a/apps/api/app/api/torznab/api.py b/apps/api/app/api/torznab/api.py index 2e6e5337..eb74d8c8 100644 --- a/apps/api/app/api/torznab/api.py +++ b/apps/api/app/api/torznab/api.py @@ -9,8 +9,11 @@ from loguru import logger from sqlmodel import Session +from app.catalog import require_catalog_ready +from app.catalog.exceptions import CatalogNotReadyError from app.config import ( ANIBRIDGE_TEST_MODE, + CATALOG_SITES_LIST, SPECIALS_METADATA_ENABLED, STRM_FILES_MODE, TORZNAB_CAT_ANIME, @@ -18,14 +21,23 @@ TORZNAB_RETURN_TEST_RESULT, TORZNAB_SEASON_SEARCH_MAX_CONSECUTIVE_MISSES, TORZNAB_SEASON_SEARCH_MAX_EPISODES, - TORZNAB_SEASON_SEARCH_MODE, TORZNAB_TEST_EPISODE, TORZNAB_TEST_LANGUAGE, TORZNAB_TEST_SEASON, TORZNAB_TEST_SLUG, TORZNAB_TEST_TITLE, ) -from 
app.db import get_session +from app.db import ( + find_canonical_series_by_ids_or_title, + find_provider_episode_mapping, + find_provider_episode_mappings_for_canonical_episode, + find_provider_episode_mappings_for_canonical_season, + get_indexed_episode_languages, + get_session, + list_indexed_provider_episodes, + resolve_indexed_title, + search_indexed_provider_titles, +) from app.providers.aniworld.specials import ( SpecialIds, resolve_special_mapping_from_episode_request, @@ -355,6 +367,212 @@ def _rss_response(rss: ET.Element) -> Response: return Response(content=xml, media_type="application/rss+xml; charset=utf-8") +def _indexed_preview_results( + *, + tn_module, + session: Session, + q_str: str, + channel: ET.Element, + cat_id: int, + providers: list[str], + limit: int, + strm_suffix: str, +) -> int: + rows = search_indexed_provider_titles( + session, + query=q_str, + providers=providers, + limit=max(1, limit), + ) + now = datetime.now(timezone.utc) + count = 0 + for row in rows: + provider = row.provider + title = row.title + episodes = list_indexed_provider_episodes( + session, + provider=provider, + slug=row.slug, + ) + if episodes: + target = sorted(episodes, key=lambda item: (item.season, item.episode))[0] + mapping = find_provider_episode_mapping( + session, + provider=provider, + slug=row.slug, + provider_season=target.season, + provider_episode=target.episode, + ) + languages = get_indexed_episode_languages( + session, + provider=provider, + slug=row.slug, + season=target.season, + episode=target.episode, + ) + language_values = [item.language for item in languages] or _default_languages_for_site(provider) + season_i = mapping.canonical_season if mapping is not None else target.season + episode_i = mapping.canonical_episode if mapping is not None else target.episode + provider_season_i = target.season + provider_episode_i = target.episode + else: + language_values = _default_languages_for_site(provider) + season_i = 1 + episode_i = 1 + 
provider_season_i = 1 + provider_episode_i = 1 + for language in language_values: + release_title = tn_module.build_release_name( + series_title=title, + season=None if cat_id == TORZNAB_CAT_MOVIE else season_i, + episode=None if cat_id == TORZNAB_CAT_MOVIE else episode_i, + height=None, + vcodec=None, + language=language, + site=provider, + ) + magnet = tn_module.build_magnet( + title=release_title, + slug=row.slug, + season=provider_season_i, + episode=provider_episode_i, + language=language, + provider=None, + site=provider, + ) + _build_item( + channel=channel, + title=release_title, + magnet=magnet, + pubdate=now, + cat_id=cat_id, + guid_str=f"{provider}:{row.slug}:{season_i}:{episode_i}:{language}", + language=language, + ) + count += 1 + if count >= max(1, limit): + return count + if STRM_FILES_MODE in ("only", "both"): + magnet_strm = tn_module.build_magnet( + title=release_title + strm_suffix, + slug=row.slug, + season=provider_season_i, + episode=provider_episode_i, + language=language, + provider=None, + site=provider, + mode="strm", + ) + _build_item( + channel=channel, + title=release_title + strm_suffix, + magnet=magnet_strm, + pubdate=now, + cat_id=cat_id, + guid_str=f"{provider}:{row.slug}:{season_i}:{episode_i}:{language}:strm", + language=language, + ) + count += 1 + if count >= max(1, limit): + return count + return count + + +def _emit_indexed_mapped_episode( + *, + tn_module, + session: Session, + channel: ET.Element, + provider: str, + slug: str, + title: str, + canonical_season: int, + canonical_episode: int, + provider_season: int, + provider_episode: int, + cat_id: int, + now: datetime, + strm_suffix: str, + max_items: int, +) -> int: + languages = get_indexed_episode_languages( + session, + provider=provider, + slug=slug, + season=provider_season, + episode=provider_episode, + ) + emitted = 0 + for language_row in languages or []: + release_title = tn_module.build_release_name( + series_title=title, + season=canonical_season, + 
episode=canonical_episode, + height=None, + vcodec=None, + language=language_row.language, + site=provider, + ) + magnet = tn_module.build_magnet( + title=release_title, + slug=slug, + season=provider_season, + episode=provider_episode, + language=language_row.language, + provider=None, + site=provider, + ) + _build_item( + channel=channel, + title=release_title, + magnet=magnet, + pubdate=now, + cat_id=cat_id, + guid_str=f"{provider}:{slug}:S{canonical_season}E{canonical_episode}:{language_row.language}", + language=language_row.language, + ) + emitted += 1 + if emitted >= max_items: + return emitted + if STRM_FILES_MODE in ("only", "both"): + magnet_strm = tn_module.build_magnet( + title=release_title + strm_suffix, + slug=slug, + season=provider_season, + episode=provider_episode, + language=language_row.language, + provider=None, + site=provider, + mode="strm", + ) + _build_item( + channel=channel, + title=release_title + strm_suffix, + magnet=magnet_strm, + pubdate=now, + cat_id=cat_id, + guid_str=f"{provider}:{slug}:S{canonical_season}E{canonical_episode}:{language_row.language}:strm", + language=language_row.language, + ) + emitted += 1 + if emitted >= max_items: + return emitted + return emitted + + +def _indexed_display_title( + *, + session: Session, + provider: str, + slug: str, + fallback_title: str, +) -> str: + title = resolve_indexed_title(session, provider=provider, slug=slug) + if title: + return title + return fallback_title + + @router.get("/api", response_class=FastAPIResponse) def torznab_api( request: Request, @@ -406,6 +624,13 @@ def torznab_api( if t == "search": import app.api.torznab as tn + try: + require_catalog_ready() + except CatalogNotReadyError as exc: + from fastapi import HTTPException + + raise HTTPException(status_code=503, detail=str(exc)) from exc + rss, channel = _rss_root() q_str = (q or "").strip() strm_suffix = " [STRM]" @@ -430,55 +655,51 @@ def torznab_api( return _rss_response(rss) if movie_preferred: - count = 
_handle_preview_search( - session, - q_str, - channel, - TORZNAB_CAT_MOVIE, - site="megakino", + count = _indexed_preview_results( + tn_module=tn, + session=session, + q_str=q_str, + channel=channel, + cat_id=TORZNAB_CAT_MOVIE, + providers=["megakino"], limit=limit, strm_suffix=strm_suffix, ) if count == 0: - _handle_preview_search( - session, - q_str, - channel, - TORZNAB_CAT_ANIME, + _indexed_preview_results( + tn_module=tn, + session=session, + q_str=q_str, + channel=channel, + cat_id=TORZNAB_CAT_ANIME, + providers=[site for site in CATALOG_SITES_LIST if site != "megakino"], limit=limit, strm_suffix=strm_suffix, ) return _rss_response(rss) - special_count = _handle_special_search( - session, - q_str, - channel, - cat_id, - ids=SpecialIds( - tvdbid=tvdbid, - tmdbid=tmdbid, - imdbid=imdbid, - rid=rid, - tvmazeid=tvmazeid, - ), + _indexed_preview_results( + tn_module=tn, + session=session, + q_str=q_str, + channel=channel, + cat_id=cat_id, + providers=[site for site in CATALOG_SITES_LIST if site != "megakino"], limit=limit, strm_suffix=strm_suffix, ) - if special_count == 0: - _handle_preview_search( - session, - q_str, - channel, - cat_id, - limit=limit, - strm_suffix=strm_suffix, - ) return _rss_response(rss) if t in ("movie", "movie-search"): import app.api.torznab as tn + try: + require_catalog_ready() + except CatalogNotReadyError as exc: + from fastapi import HTTPException + + raise HTTPException(status_code=503, detail=str(exc)) from exc + rss, channel = _rss_root() q_str = (q or "").strip() strm_suffix = " [STRM]" @@ -491,12 +712,13 @@ def torznab_api( ) return _rss_response(rss) if q_str: - _handle_preview_search( - session, - q_str, - channel, - TORZNAB_CAT_MOVIE, - site="megakino", + _indexed_preview_results( + tn_module=tn, + session=session, + q_str=q_str, + channel=channel, + cat_id=TORZNAB_CAT_MOVIE, + providers=["megakino"], limit=limit, strm_suffix=strm_suffix, ) @@ -508,6 +730,12 @@ def torznab_api( raise HTTPException(status_code=400, 
detail="invalid t") import app.api.torznab as tn + try: + require_catalog_ready() + except CatalogNotReadyError as exc: + from fastapi import HTTPException + + raise HTTPException(status_code=503, detail=str(exc)) from exc if season is None: logger.debug("Returning empty RSS feed due to missing season.") @@ -530,100 +758,97 @@ def torznab_api( logger.debug("Returning empty RSS feed due to unresolved tvsearch query.") return _empty_rss_response() - result = tn._slug_from_query(q_str) - if not result: - logger.warning("No slug found for query '{}'. Returning empty RSS feed.", q_str) + canonical_series = find_canonical_series_by_ids_or_title( + session, + tvdb_id=tvdbid, + tmdb_id=tmdbid, + imdb_id=imdbid, + query=q_str, + ) + if canonical_series is None: + logger.warning( + "No canonical series found for query '{}'. Returning empty RSS feed.", + q_str, + ) return _empty_rss_response() - - site_found, slug = result - display_title = tn.resolve_series_title(slug, site_found) or q_str rss, channel = _rss_root() - count = 0 limit_i = max(1, int(limit)) now = datetime.now(timezone.utc) strm_suffix = " [STRM]" - ids = SpecialIds( - tvdbid=tvdbid, - tmdbid=tmdbid, - imdbid=imdbid, - rid=rid, - tvmazeid=tvmazeid, - ) if search_mode == "episode-search": assert ep_i is not None - emitted, limit_hit = emit_tvsearch_episode_items( - tn_module=tn, - session=session, - channel=channel, - slug=slug, - site_found=site_found, - display_title=display_title, - q_str=q_str, - request_season=season_i, - request_episode=ep_i, - ids=ids, - now=now, - strm_suffix=strm_suffix, - max_items=limit_i, - allow_live_probe=True, - ) - count += emitted - if limit_hit: - logger.info( - "tvsearch episode-search terminated due to limit hit (limit={})", - limit_i, + count = 0 + for mapping in find_provider_episode_mappings_for_canonical_episode( + session, + tvdb_id=canonical_series.tvdb_id, + canonical_season=season_i, + canonical_episode=ep_i, + providers=CATALOG_SITES_LIST, + ): + display_title = 
_indexed_display_title( + session=session, + provider=mapping.provider, + slug=mapping.slug, + fallback_title=canonical_series.title, + ) + remaining = limit_i - count + if remaining <= 0: + break + count += _emit_indexed_mapped_episode( + tn_module=tn, + session=session, + channel=channel, + provider=mapping.provider, + slug=mapping.slug, + title=display_title, + canonical_season=season_i, + canonical_episode=ep_i, + provider_season=mapping.provider_season, + provider_episode=mapping.provider_episode, + cat_id=TORZNAB_CAT_ANIME, + now=now, + strm_suffix=strm_suffix, + max_items=remaining, ) return _rss_response(rss) - fast_season_mode = TORZNAB_SEASON_SEARCH_MODE == "fast" - episode_numbers = resolve_season_episode_numbers( - tn_module=tn, - session=session, - slug=slug, - season_i=season_i, - site_found=site_found, - q_str=q_str, - ids=ids, - allow_fallback_probe=not fast_season_mode, + count = 0 + season_mappings = sorted( + find_provider_episode_mappings_for_canonical_season( + session, + tvdb_id=canonical_series.tvdb_id, + canonical_season=season_i, + providers=CATALOG_SITES_LIST, + ), + key=lambda item: (item.canonical_episode, item.provider, item.slug), ) - for episode_i in episode_numbers: + for mapping in season_mappings: remaining = limit_i - count if remaining <= 0: - logger.info( - "tvsearch season-search termination reason=limit hit limit={}", - limit_i, - ) break - - emitted, limit_hit = emit_tvsearch_episode_items( + display_title = _indexed_display_title( + session=session, + provider=mapping.provider, + slug=mapping.slug, + fallback_title=canonical_series.title, + ) + count += _emit_indexed_mapped_episode( tn_module=tn, session=session, channel=channel, - slug=slug, - site_found=site_found, - display_title=display_title, - q_str=q_str, - request_season=season_i, - request_episode=episode_i, - ids=ids, + provider=mapping.provider, + slug=mapping.slug, + title=display_title, + canonical_season=season_i, + canonical_episode=mapping.canonical_episode, 
+ provider_season=mapping.provider_season, + provider_episode=mapping.provider_episode, + cat_id=TORZNAB_CAT_ANIME, now=now, strm_suffix=strm_suffix, max_items=remaining, - allow_live_probe=not fast_season_mode, - fast_episode_languages=None, ) - count += emitted - if limit_hit: - logger.info( - ( - "tvsearch season-search termination reason=limit hit " - "limit={} emitted_items={}" - ), - limit_i, - count, - ) - break logger.info("Returning RSS feed with {} items.", count) return _rss_response(rss) diff --git a/apps/api/app/catalog/__init__.py b/apps/api/app/catalog/__init__.py new file mode 100644 index 00000000..13743dcf --- /dev/null +++ b/apps/api/app/catalog/__init__.py @@ -0,0 +1,11 @@ +from .indexer import ( + get_catalog_indexer, + get_catalog_readiness_error, + require_catalog_ready, +) + +__all__ = [ + "get_catalog_indexer", + "get_catalog_readiness_error", + "require_catalog_ready", +] diff --git a/apps/api/app/catalog/exceptions.py b/apps/api/app/catalog/exceptions.py new file mode 100644 index 00000000..c2218131 --- /dev/null +++ b/apps/api/app/catalog/exceptions.py @@ -0,0 +1,5 @@ +from __future__ import annotations + + +class CatalogNotReadyError(RuntimeError): + """Raised when catalog-dependent routes are hit before bootstrap completes.""" diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py new file mode 100644 index 00000000..99287c85 --- /dev/null +++ b/apps/api/app/catalog/indexer.py @@ -0,0 +1,532 @@ +from __future__ import annotations + +import threading +from dataclasses import dataclass +from datetime import timedelta +from uuid import uuid4 + +from loguru import logger +from sqlmodel import Session + +from app.catalog.exceptions import CatalogNotReadyError +from app.catalog.providers import crawl_provider_catalog +from app.config import ( + ANIBRIDGE_TEST_MODE, + CATALOG_SITES_LIST, + CATALOG_SITE_CONFIGS, + PROGRESS_STEP_PERCENT, + PROVIDER_INDEX_GLOBAL_CONCURRENCY, + PROVIDER_INDEX_SCHEDULER_POLL_SECONDS, +) 
+from app.db import ( + delete_provider_generation, + engine, + get_provider_index_status, + is_catalog_bootstrap_ready, + list_provider_index_statuses, + prune_provider_generation, + replace_canonical_episodes, + replace_provider_catalog_aliases, + replace_provider_catalog_episodes, + replace_provider_catalog_title, + replace_provider_episode_mappings, + replace_provider_movie_mappings, + replace_provider_series_mappings, + upsert_canonical_series, + upsert_provider_index_status, + upsert_provider_title_index_state, + utcnow, +) +from app.utils.terminal import ProgressReporter, ProgressSnapshot + +_INDEXER: "ProviderCatalogIndexer | None" = None +_INDEXER_LOCK = threading.Lock() +_UNSET = object() + + +@dataclass(slots=True) +class ProviderCatalogProgress: + provider: str + phase: str = "pending" + processed_titles: int = 0 + total_titles: int | None = None + current_slug: str = "" + last_logged_step: int = -1 + + @property + def progress_percent(self) -> float | None: + if not self.total_titles: + return None + if self.total_titles <= 0: + return 100.0 + return round( + max(0.0, min(100.0, self.processed_titles / self.total_titles * 100.0)), + 1, + ) + + +def get_catalog_indexer() -> "ProviderCatalogIndexer": + global _INDEXER + with _INDEXER_LOCK: + if _INDEXER is None: + _INDEXER = ProviderCatalogIndexer() + return _INDEXER + + +def get_catalog_readiness_error() -> str | None: + indexer = get_catalog_indexer() + with Session(engine) as session: + statuses = list_provider_index_statuses(session) + if ANIBRIDGE_TEST_MODE and not statuses: + return None + if is_catalog_bootstrap_ready(session, providers=CATALOG_SITES_LIST): + return None + pending: list[str] = [] + snapshot = indexer.get_progress_snapshot() + by_provider = { + item["provider"]: item for item in snapshot.get("providers", []) + } + for provider in CATALOG_SITES_LIST: + status = get_provider_index_status(session, provider=provider) + if status is None or not status.bootstrap_completed: + progress = 
by_provider.get(provider, {}) + processed = progress.get("processed_titles") + total = progress.get("total_titles") + percent = progress.get("progress_percent") + phase = progress.get("phase") or "pending" + if isinstance(processed, int) and isinstance(total, int) and total > 0: + pending.append( + f"{provider} ({processed}/{total}, {percent:.1f}%, {phase})" + ) + else: + pending.append(f"{provider} ({phase})") + if not pending: + return None + return ( + "Provider catalog bootstrap is still running. " + f"Pending providers: {', '.join(pending)}." + ) + + +def require_catalog_ready() -> None: + message = get_catalog_readiness_error() + if message: + logger.warning("Catalog-dependent request blocked: {}", message) + raise CatalogNotReadyError(message) + + +class ProviderCatalogIndexer: + def __init__(self) -> None: + self._stop_event = threading.Event() + self._thread: threading.Thread | None = None + self._active = threading.Semaphore(PROVIDER_INDEX_GLOBAL_CONCURRENCY) + self._progress_lock = threading.Lock() + self._progress: dict[str, ProviderCatalogProgress] = {} + + def start(self) -> None: + self._ensure_status_rows() + if ANIBRIDGE_TEST_MODE: + return + if self._thread is not None and self._thread.is_alive(): + return + self._thread = threading.Thread( + target=self._run_loop, + name="provider-catalog-indexer", + daemon=True, + ) + self._thread.start() + + def stop(self) -> None: + self._stop_event.set() + if self._thread is not None: + self._thread.join(timeout=5) + + def run_due_once(self) -> None: + with Session(engine) as session: + statuses = list_provider_index_statuses(session) + for status in statuses: + if self._is_due(status): + self.refresh_provider(status.provider) + + def refresh_provider(self, provider: str) -> None: + if not self._active.acquire(blocking=False): + return + try: + self._refresh_provider(provider) + finally: + self._active.release() + + def get_progress_snapshot(self) -> dict[str, object]: + with Session(engine) as session: + 
statuses = { + status.provider: status + for status in list_provider_index_statuses(session) + } + bootstrap_ready = is_catalog_bootstrap_ready( + session, providers=CATALOG_SITES_LIST + ) + with self._progress_lock: + runtime = { + provider: ProviderCatalogProgress( + provider=snapshot.provider, + phase=snapshot.phase, + processed_titles=snapshot.processed_titles, + total_titles=snapshot.total_titles, + current_slug=snapshot.current_slug, + last_logged_step=snapshot.last_logged_step, + ) + for provider, snapshot in self._progress.items() + } + providers: list[dict[str, object]] = [] + for provider in CATALOG_SITES_LIST: + status = statuses.get(provider) + progress = runtime.get(provider, ProviderCatalogProgress(provider=provider)) + phase = progress.phase + if phase == "pending" and status is not None: + phase = status.status + providers.append( + { + "provider": provider, + "status": status.status if status is not None else "pending", + "bootstrap_completed": ( + bool(status.bootstrap_completed) if status is not None else False + ), + "phase": phase, + "processed_titles": progress.processed_titles, + "total_titles": progress.total_titles, + "progress_percent": progress.progress_percent, + "current_slug": progress.current_slug or None, + "last_error_summary": ( + status.last_error_summary if status is not None else "" + ), + "latest_started_at": ( + status.latest_started_at.isoformat() + if status is not None and status.latest_started_at is not None + else None + ), + "latest_completed_at": ( + status.latest_completed_at.isoformat() + if status is not None and status.latest_completed_at is not None + else None + ), + } + ) + return { + "bootstrap_ready": bootstrap_ready, + "bootstrapping": not bootstrap_ready, + "providers": providers, + } + + def _run_loop(self) -> None: + while not self._stop_event.wait(PROVIDER_INDEX_SCHEDULER_POLL_SECONDS): + try: + self.run_due_once() + except Exception as exc: + logger.exception("Provider catalog scheduler loop failed: {}", 
exc) + + def _ensure_status_rows(self) -> None: + with Session(engine) as session: + now = None + for provider in CATALOG_SITES_LIST: + self._set_progress(provider, phase="pending") + hours = float( + CATALOG_SITE_CONFIGS.get(provider, {}).get( + "provider_index_refresh_hours", 24.0 + ) + ) + status = get_provider_index_status(session, provider=provider) + if status is None: + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=hours, + status="pending", + bootstrap_completed=False, + next_refresh_after=now, + ) + + def _is_due(self, status) -> bool: + if status.status == "running": + return False + if status.latest_success_at is None: + return True + if status.next_refresh_after is None: + return True + return status.next_refresh_after <= utcnow() + + def _refresh_provider(self, provider: str) -> None: + refresh_interval_hours = float( + CATALOG_SITE_CONFIGS.get(provider, {}).get("provider_index_refresh_hours", 24.0) + ) + generation = uuid4().hex + reporter: ProgressReporter | None = None + self._set_progress( + provider, + phase="discovering_titles", + processed_titles=0, + total_titles=None, + current_slug="", + reset_log_step=True, + ) + logger.info("Provider catalog {}: discovering titles", provider) + with Session(engine) as session: + current = get_provider_index_status(session, provider=provider) + failure_count = 0 if current is None else current.failure_count + _ = failure_count + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + status="running", + current_generation=generation, + latest_started_at=utcnow(), + ) + + try: + titles = crawl_provider_catalog(provider) + reporter = ProgressReporter( + label=f"Catalog {provider}", + unit="title", + unit_scale=False, + ) + reporter.update( + ProgressSnapshot( + downloaded=0, + total=len(titles), + status="indexing_titles", + ) + ) + self._set_progress( + provider, + phase="indexing_titles", + 
total_titles=len(titles), + processed_titles=0, + current_slug="", + reset_log_step=True, + ) + for title_record in titles: + with Session(engine) as session: + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + cursor_title_slug=title_record.slug, + ) + upsert_provider_title_index_state( + session, + provider=provider, + slug=title_record.slug, + attempted_at=utcnow(), + ) + replace_provider_catalog_title( + session, + provider=provider, + slug=title_record.slug, + title=title_record.title, + media_type_hint=title_record.media_type_hint, + relative_path=title_record.relative_path, + indexed_generation=generation, + ) + replace_provider_catalog_aliases( + session, + provider=provider, + slug=title_record.slug, + aliases=title_record.aliases, + indexed_generation=generation, + ) + replace_provider_catalog_episodes( + session, + provider=provider, + slug=title_record.slug, + episodes=[ + { + "season": episode.season, + "episode": episode.episode, + "relative_path": episode.relative_path, + "title_primary": episode.title_primary, + "title_secondary": episode.title_secondary, + "media_type_hint": episode.media_type_hint, + "languages": [ + { + "language": lang.language, + "host_hints": lang.host_hints, + } + for lang in episode.languages + ], + } + for episode in title_record.episodes + ], + indexed_generation=generation, + ) + if title_record.canonical.series is not None: + series = title_record.canonical.series + upsert_canonical_series( + session, + tvdb_id=int(series["tvdb_id"]), + title=str(series["title"]), + tmdb_id=series.get("tmdb_id"), + imdb_id=series.get("imdb_id"), + tvmaze_id=series.get("tvmaze_id"), + anilist_id=series.get("anilist_id"), + mal_id=series.get("mal_id"), + aliases=list(series.get("aliases") or []), + ) + replace_canonical_episodes( + session, + tvdb_id=int(series["tvdb_id"]), + episodes=title_record.canonical.episodes, + ) + replace_provider_series_mappings( + session, + 
provider=provider, + slug=title_record.slug, + mappings=title_record.canonical.series_mappings, + ) + replace_provider_episode_mappings( + session, + provider=provider, + slug=title_record.slug, + mappings=title_record.canonical.episode_mappings, + ) + replace_provider_movie_mappings( + session, + provider=provider, + slug=title_record.slug, + mappings=title_record.canonical.movie_mappings, + ) + upsert_provider_title_index_state( + session, + provider=provider, + slug=title_record.slug, + succeeded_at=utcnow(), + failure_count=0, + last_error_summary="", + ) + session.commit() + self._advance_progress(provider, current_slug=title_record.slug) + processed_titles = self._get_processed_titles(provider) + reporter.update( + ProgressSnapshot( + downloaded=min(len(titles), processed_titles), + total=len(titles), + status="indexing_titles", + ) + ) + completed_at = utcnow() + with Session(engine) as session: + prune_provider_generation( + session, + provider=provider, + keep_generation=generation, + ) + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + status="ready", + current_generation=generation, + latest_success_generation=generation, + latest_completed_at=completed_at, + latest_success_at=completed_at, + next_refresh_after=completed_at + + timedelta(hours=refresh_interval_hours), + bootstrap_completed=True, + failure_count=0, + last_error_summary="", + cursor_title_slug="", + ) + self._set_progress( + provider, + phase="ready", + current_slug="", + ) + if reporter is not None: + reporter.close() + except Exception as exc: + logger.exception("Provider catalog refresh failed for {}: {}", provider, exc) + if reporter is not None: + reporter.close() + completed_at = utcnow() + with Session(engine) as session: + delete_provider_generation( + session, + provider=provider, + generation=generation, + ) + current = get_provider_index_status(session, provider=provider) + failure_count = 1 if current is None else 
current.failure_count + 1 + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + status="failed", + latest_completed_at=completed_at, + next_refresh_after=completed_at + + timedelta(hours=refresh_interval_hours), + failure_count=failure_count, + last_error_summary=str(exc)[:500], + ) + self._set_progress( + provider, + phase="failed", + current_slug="", + ) + + def _set_progress( + self, + provider: str, + *, + phase: str | None = None, + processed_titles: int | None = None, + total_titles: int | None | object = _UNSET, + current_slug: str | None = None, + reset_log_step: bool = False, + ) -> None: + with self._progress_lock: + snapshot = self._progress.get(provider) + if snapshot is None: + snapshot = ProviderCatalogProgress(provider=provider) + self._progress[provider] = snapshot + if phase is not None: + snapshot.phase = phase + if processed_titles is not None: + snapshot.processed_titles = processed_titles + if total_titles is not _UNSET: + snapshot.total_titles = total_titles + if current_slug is not None: + snapshot.current_slug = current_slug + if reset_log_step: + snapshot.last_logged_step = -1 + + def _advance_progress(self, provider: str, *, current_slug: str) -> None: + with self._progress_lock: + snapshot = self._progress.get(provider) + if snapshot is None: + snapshot = ProviderCatalogProgress(provider=provider) + self._progress[provider] = snapshot + snapshot.processed_titles += 1 + snapshot.current_slug = current_slug + total = snapshot.total_titles + percent = snapshot.progress_percent + if total is None or percent is None: + return + step = max(1, int(PROGRESS_STEP_PERCENT)) + current_step = int(percent) // step + if percent < 100.0 and current_step <= snapshot.last_logged_step: + return + snapshot.last_logged_step = current_step + logger.info( + "Provider catalog {} progress: {}/{} ({}%) current={}", + provider, + snapshot.processed_titles, + total, + percent, + current_slug, + ) + + def 
_get_processed_titles(self, provider: str) -> int: + with self._progress_lock: + snapshot = self._progress.get(provider) + if snapshot is None: + return 0 + return snapshot.processed_titles diff --git a/apps/api/app/catalog/metadata.py b/apps/api/app/catalog/metadata.py new file mode 100644 index 00000000..76585f27 --- /dev/null +++ b/apps/api/app/catalog/metadata.py @@ -0,0 +1,139 @@ +from __future__ import annotations + +from dataclasses import dataclass +from difflib import SequenceMatcher +from typing import Any, Optional +from urllib.parse import urlencode + +from loguru import logger + +from app.db import normalize_catalog_text +from app.utils.http_client import get as http_get + +SKYHOOK_SEARCH_URL = "https://skyhook.sonarr.tv/v1/tvdb/search/en/" +SKYHOOK_SHOW_URL = "https://skyhook.sonarr.tv/v1/tvdb/shows/en/{tvdb_id}" + + +@dataclass(slots=True) +class TvCanonicalMatch: + tvdb_id: int + title: str + confidence: str + source: str + rationale: str + payload: dict[str, Any] + + +def _score_title(query: str, candidate: str) -> float: + left = normalize_catalog_text(query) + right = normalize_catalog_text(candidate) + if not left or not right: + return 0.0 + if left == right: + return 1.0 + return SequenceMatcher(None, left, right).ratio() + + +def _candidate_terms( + *, + title: str, + aliases: list[str], + imdb_id: Optional[str], + tmdb_id: Optional[int], +) -> list[tuple[str, str]]: + terms: list[tuple[str, str]] = [] + if imdb_id: + terms.append((f"imdb:{imdb_id}", "explicit_imdb")) + if tmdb_id: + terms.append((f"tmdb:{tmdb_id}", "explicit_tmdb")) + if title: + terms.append((title, "title")) + for alias in aliases: + alias_clean = (alias or "").strip() + if alias_clean and alias_clean != title: + terms.append((alias_clean, "alias")) + seen: set[str] = set() + deduped: list[tuple[str, str]] = [] + for term, source in terms: + if term in seen: + continue + seen.add(term) + deduped.append((term, source)) + return deduped + + +def resolve_tv_canonical_match( + 
*, + title: str, + aliases: list[str], + imdb_id: Optional[str], + tmdb_id: Optional[int], +) -> Optional[TvCanonicalMatch]: + candidates: list[dict[str, Any]] = [] + for term, source in _candidate_terms( + title=title, + aliases=aliases, + imdb_id=imdb_id, + tmdb_id=tmdb_id, + ): + try: + query = urlencode({"term": term}) + response = http_get(f"{SKYHOOK_SEARCH_URL}?{query}", timeout=8.0) + response.raise_for_status() + payload = response.json() + except Exception as exc: + logger.debug("SkyHook search failed for '{}': {}", term, exc) + continue + if not isinstance(payload, list): + continue + for item in payload: + if isinstance(item, dict): + item["_ab_source"] = source + item["_ab_term"] = term + candidates.append(item) + + best_match: Optional[tuple[float, dict[str, Any]]] = None + for item in candidates: + candidate_title = str(item.get("title") or "").strip() + candidate_tvdb = item.get("tvdbId") + if not candidate_title or not isinstance(candidate_tvdb, int): + continue + scores = [_score_title(title, candidate_title)] + scores.extend(_score_title(alias, candidate_title) for alias in aliases) + score = max(scores or [0.0]) + if item.get("_ab_source") in {"explicit_imdb", "explicit_tmdb"}: + score = max(score, 0.99) + current = (score, item) + if best_match is None or current[0] > best_match[0]: + best_match = current + + if best_match is None or best_match[0] < 0.45: + return None + + score, item = best_match + tvdb_id = int(item["tvdbId"]) + try: + response = http_get(SKYHOOK_SHOW_URL.format(tvdb_id=tvdb_id), timeout=8.0) + response.raise_for_status() + payload = response.json() + except Exception as exc: + logger.debug("SkyHook show fetch failed for tvdb {}: {}", tvdb_id, exc) + return None + if not isinstance(payload, dict): + return None + + if score >= 0.99: + confidence = "confirmed" + elif score >= 0.85: + confidence = "high_confidence" + else: + confidence = "low_confidence" + + return TvCanonicalMatch( + tvdb_id=tvdb_id, + 
title=str(payload.get("title") or item.get("title") or title).strip(), + confidence=confidence, + source=str(item.get("_ab_source") or "title"), + rationale=f"score={score:.2f} term={item.get('_ab_term')}", + payload=payload, + ) diff --git a/apps/api/app/catalog/providers.py b/apps/api/app/catalog/providers.py new file mode 100644 index 00000000..57db875c --- /dev/null +++ b/apps/api/app/catalog/providers.py @@ -0,0 +1,385 @@ +from __future__ import annotations + +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field +from difflib import SequenceMatcher +from typing import Any, Optional +from urllib.parse import urlparse + +from bs4 import BeautifulSoup # type: ignore +from loguru import logger + +from app.catalog.metadata import resolve_tv_canonical_match +from app.config import CATALOG_SITE_CONFIGS +from app.db import normalize_catalog_text +from app.providers import get_provider +from app.providers.megakino.client import ( + get_default_client as get_default_megakino_client, +) +from app.utils.domain_resolver import get_megakino_base_url +from app.utils.http_client import get as http_get + + +@dataclass(slots=True) +class EpisodeLanguageRecord: + language: str + host_hints: list[str] = field(default_factory=list) + + +@dataclass(slots=True) +class EpisodeRecord: + season: int + episode: int + relative_path: str + title_primary: Optional[str] + title_secondary: Optional[str] + media_type_hint: str + languages: list[EpisodeLanguageRecord] = field(default_factory=list) + + +@dataclass(slots=True) +class CanonicalPayload: + series: Optional[dict[str, Any]] = None + episodes: list[dict[str, Any]] = field(default_factory=list) + series_mappings: list[dict[str, Any]] = field(default_factory=list) + episode_mappings: list[dict[str, Any]] = field(default_factory=list) + movie_mappings: list[dict[str, Any]] = field(default_factory=list) + + +@dataclass(slots=True) +class TitleRecord: + provider: str + slug: str + title: 
str + aliases: list[str] + media_type_hint: str + relative_path: str + episodes: list[EpisodeRecord] = field(default_factory=list) + canonical: CanonicalPayload = field(default_factory=CanonicalPayload) + + +def _relative_path(url: str) -> str: + parsed = urlparse(url) + path = parsed.path or "/" + if parsed.query: + return f"{path}?{parsed.query}" + return path + + +def _normalize_provider_data(raw: Any, *, site: str) -> list[EpisodeLanguageRecord]: + if not isinstance(raw, dict): + return [] + languages: list[EpisodeLanguageRecord] = [] + for key, provider_map in raw.items(): + if site == "aniworld.to": + audio = getattr(key[0], "value", str(key[0])) if isinstance(key, tuple) else "" + subtitles = ( + getattr(key[1], "value", str(key[1])) if isinstance(key, tuple) and len(key) > 1 else "" + ) + if audio == "German" and subtitles == "None": + language = "German Dub" + elif audio == "Japanese" and subtitles == "German": + language = "German Sub" + elif audio == "Japanese" and subtitles == "English": + language = "English Sub" + else: + language = f"{audio} {subtitles}".strip() + else: + lang_id = int(key) if isinstance(key, int) or str(key).isdigit() else None + if lang_id == 1: + language = "German Dub" + elif lang_id == 2: + language = "English Dub" + elif lang_id == 3: + language = "German Sub" + else: + language = str(key) + host_hints = sorted(str(name) for name in (provider_map or {}).keys()) + languages.append(EpisodeLanguageRecord(language=language, host_hints=host_hints)) + languages.sort(key=lambda entry: entry.language) + return languages + + +def _score_episode_title(left: str, right: str) -> float: + a = normalize_catalog_text(left) + b = normalize_catalog_text(right) + if not a or not b: + return 0.0 + if a == b: + return 1.0 + return SequenceMatcher(None, a, b).ratio() + + +def _build_tv_canonical_payload( + *, + provider: str, + slug: str, + title: str, + aliases: list[str], + imdb_id: Optional[str], + mal_id: Optional[int], + episodes: 
list[EpisodeRecord], +) -> CanonicalPayload: + match = resolve_tv_canonical_match( + title=title, + aliases=aliases, + imdb_id=imdb_id, + tmdb_id=None, + ) + if match is None: + return CanonicalPayload() + + payload = match.payload + raw_episodes = payload.get("episodes") + if not isinstance(raw_episodes, list): + raw_episodes = [] + canonical_episodes: list[dict[str, Any]] = [] + for item in raw_episodes: + if not isinstance(item, dict): + continue + season_number = item.get("seasonNumber") + episode_number = item.get("episodeNumber") + episode_title = str(item.get("title") or "").strip() + if not isinstance(season_number, int) or not isinstance(episode_number, int) or not episode_title: + continue + canonical_episodes.append( + { + "season": season_number, + "episode": episode_number, + "title": episode_title, + } + ) + + series_payload = { + "tvdb_id": match.tvdb_id, + "title": match.title, + "tmdb_id": payload.get("tmdbId") if isinstance(payload.get("tmdbId"), int) else None, + "imdb_id": imdb_id or str(payload.get("imdbId") or "").strip() or None, + "tvmaze_id": payload.get("tvMazeId") if isinstance(payload.get("tvMazeId"), int) else None, + "anilist_id": None, + "mal_id": mal_id, + "aliases": aliases, + } + series_mappings = [ + { + "tvdb_id": match.tvdb_id, + "confidence": match.confidence, + "source": match.source, + "rationale": match.rationale, + } + ] + + by_number = { + (item["season"], item["episode"]): item for item in canonical_episodes + } + by_season: dict[int, list[dict[str, Any]]] = {} + for item in canonical_episodes: + by_season.setdefault(int(item["season"]), []).append(item) + + episode_mappings: list[dict[str, Any]] = [] + for provider_episode in episodes: + direct = by_number.get((provider_episode.season, provider_episode.episode)) + if direct is not None: + episode_mappings.append( + { + "provider_season": provider_episode.season, + "provider_episode": provider_episode.episode, + "tvdb_id": match.tvdb_id, + "canonical_season": 
direct["season"], + "canonical_episode": direct["episode"], + "confidence": "confirmed", + "source": "direct_numbering", + "rationale": "season+episode match", + } + ) + continue + + candidate_pool = by_season.get(provider_episode.season, canonical_episodes) + scored: list[tuple[float, dict[str, Any]]] = [] + search_titles = [ + value + for value in [provider_episode.title_primary, provider_episode.title_secondary] + if value + ] + for candidate in candidate_pool: + score = max( + (_score_episode_title(search_title, candidate["title"]) for search_title in search_titles), + default=0.0, + ) + if score >= 0.65: + scored.append((score, candidate)) + scored.sort(key=lambda item: item[0], reverse=True) + if not scored: + continue + top_score = scored[0][0] + plausible = [ + candidate + for score, candidate in scored + if score >= top_score - 0.05 + ] + confidence = "high_confidence" if top_score >= 0.85 else "low_confidence" + for candidate in plausible: + episode_mappings.append( + { + "provider_season": provider_episode.season, + "provider_episode": provider_episode.episode, + "tvdb_id": match.tvdb_id, + "canonical_season": int(candidate["season"]), + "canonical_episode": int(candidate["episode"]), + "confidence": confidence, + "source": "title_match", + "rationale": f"title score={top_score:.2f}", + } + ) + + return CanonicalPayload( + series=series_payload, + episodes=canonical_episodes, + series_mappings=series_mappings, + episode_mappings=episode_mappings, + ) + + +def _crawl_aniworld_like_title( + *, + provider_key: str, + slug: str, + title: str, + aliases: list[str], +) -> TitleRecord: + site_cfg = CATALOG_SITE_CONFIGS[provider_key] + base_url = str(site_cfg["base_url"]).rstrip("/") + relative_root = ( + f"/anime/stream/{slug}" if provider_key == "aniworld.to" else f"/serie/{slug}" + ) + url = f"{base_url}{relative_root}" + if provider_key == "aniworld.to": + from aniworld.models import AniworldSeries + + series = AniworldSeries(url) + imdb_id = series.imdb + 
mal_id = None + raw_mal = series.mal_id + if isinstance(raw_mal, list) and raw_mal: + try: + mal_id = int(raw_mal[0]) + except (TypeError, ValueError): + mal_id = None + else: + from aniworld.models import SerienstreamSeries + + series = SerienstreamSeries(url) + imdb_id = series.imdb + mal_id = None + + episodes: list[EpisodeRecord] = [] + for season in series.seasons: + for episode in season.episodes: + provider_data = getattr(episode.provider_data, "_data", None) + if provider_data is None: + provider_data = getattr(episode.provider_data, "data", None) + episodes.append( + EpisodeRecord( + season=int(getattr(season, "season_number", 0) or 0), + episode=int(getattr(episode, "episode_number", 0) or 0), + relative_path=_relative_path(episode.url), + title_primary=getattr(episode, "title_de", None), + title_secondary=getattr(episode, "title_en", None), + media_type_hint="movie" + if provider_key == "aniworld.to" and getattr(episode, "is_movie", False) + else "episode", + languages=_normalize_provider_data(provider_data, site=provider_key), + ) + ) + + canonical = _build_tv_canonical_payload( + provider=provider_key, + slug=slug, + title=series.title or title, + aliases=aliases, + imdb_id=imdb_id, + mal_id=mal_id, + episodes=episodes, + ) + return TitleRecord( + provider=provider_key, + slug=slug, + title=series.title or title, + aliases=aliases, + media_type_hint="series", + relative_path=relative_root, + episodes=episodes, + canonical=canonical, + ) + + +def _parse_megakino_page_metadata(url: str) -> tuple[str | None, int | None]: + base_url = get_megakino_base_url().rstrip("/") + response = http_get(url, timeout=20, headers={"Referer": base_url}) + response.raise_for_status() + soup = BeautifulSoup(response.text, "html.parser") + title = None + title_node = soup.find("h1") + if title_node: + title = title_node.get_text(" ", strip=True) + year = None + for node in soup.find_all(["a", "span", "div"]): + text = node.get_text(" ", strip=True) + if text and 
text.isdigit() and len(text) == 4 and 1900 <= int(text) <= 2100: + year = int(text) + break + return title, year + + +def crawl_provider_catalog(provider_key: str) -> list[TitleRecord]: + provider = get_provider(provider_key) + if provider is None: + return [] + + if provider_key == "megakino": + client = get_default_megakino_client() + entries = client.load_index() + titles: list[TitleRecord] = [] + for entry in entries.values(): + parsed_title = entry.slug.replace("-", " ").title() + try: + live_title, _year = _parse_megakino_page_metadata(entry.url) + if live_title: + parsed_title = live_title + except Exception as exc: + logger.debug("Megakino metadata fetch failed for {}: {}", entry.url, exc) + titles.append( + TitleRecord( + provider=provider_key, + slug=entry.slug, + title=parsed_title, + aliases=[], + media_type_hint="movie" if entry.kind == "film" else "series", + relative_path=_relative_path(entry.url), + episodes=[], + canonical=CanonicalPayload(), + ) + ) + return titles + + index = provider.load_or_refresh_index() + alternatives = provider.load_or_refresh_alternatives() + workers = int(CATALOG_SITE_CONFIGS[provider_key].get("provider_index_concurrency", 1)) + futures = [] + results: list[TitleRecord] = [] + with ThreadPoolExecutor(max_workers=max(1, workers)) as executor: + for slug, title in index.items(): + aliases = list(dict.fromkeys(alternatives.get(slug, []) or [title])) + futures.append( + executor.submit( + _crawl_aniworld_like_title, + provider_key=provider_key, + slug=slug, + title=title, + aliases=aliases, + ) + ) + for future in as_completed(futures): + results.append(future.result()) + results.sort(key=lambda item: item.slug) + return results diff --git a/apps/api/app/config.py b/apps/api/app/config.py index a79480d3..932d03cc 100644 --- a/apps/api/app/config.py +++ b/apps/api/app/config.py @@ -229,6 +229,47 @@ def _ensure_dir(candidates: list[Path], label: str) -> Path: MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN = int( 
os.getenv("MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN", "100") ) +PROVIDER_INDEX_REFRESH_HOURS = float( + os.getenv("PROVIDER_INDEX_REFRESH_HOURS", "24") +) +PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD = float( + os.getenv( + "PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD", str(PROVIDER_INDEX_REFRESH_HOURS) + ) +) +PROVIDER_INDEX_REFRESH_HOURS_STO = float( + os.getenv("PROVIDER_INDEX_REFRESH_HOURS_STO", str(PROVIDER_INDEX_REFRESH_HOURS)) +) +PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO = float( + os.getenv( + "PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO", str(PROVIDER_INDEX_REFRESH_HOURS) + ) +) +PROVIDER_INDEX_SCHEDULER_POLL_SECONDS = _as_non_negative_int( + os.getenv("PROVIDER_INDEX_SCHEDULER_POLL_SECONDS"), 60 +) +if PROVIDER_INDEX_SCHEDULER_POLL_SECONDS < 5: + PROVIDER_INDEX_SCHEDULER_POLL_SECONDS = 5 +PROVIDER_INDEX_GLOBAL_CONCURRENCY = _as_non_negative_int( + os.getenv("PROVIDER_INDEX_GLOBAL_CONCURRENCY"), 1 +) +if PROVIDER_INDEX_GLOBAL_CONCURRENCY < 1: + PROVIDER_INDEX_GLOBAL_CONCURRENCY = 1 +PROVIDER_INDEX_CONCURRENCY_ANIWORLD = _as_non_negative_int( + os.getenv("PROVIDER_INDEX_CONCURRENCY_ANIWORLD"), 1 +) +if PROVIDER_INDEX_CONCURRENCY_ANIWORLD < 1: + PROVIDER_INDEX_CONCURRENCY_ANIWORLD = 1 +PROVIDER_INDEX_CONCURRENCY_STO = _as_non_negative_int( + os.getenv("PROVIDER_INDEX_CONCURRENCY_STO"), 1 +) +if PROVIDER_INDEX_CONCURRENCY_STO < 1: + PROVIDER_INDEX_CONCURRENCY_STO = 1 +PROVIDER_INDEX_CONCURRENCY_MEGAKINO = _as_non_negative_int( + os.getenv("PROVIDER_INDEX_CONCURRENCY_MEGAKINO"), 1 +) +if PROVIDER_INDEX_CONCURRENCY_MEGAKINO < 1: + PROVIDER_INDEX_CONCURRENCY_MEGAKINO = 1 logger.debug( f"ANIWORLD_ALPHABET_HTML={ANIWORLD_ALPHABET_HTML}, ANIWORLD_ALPHABET_URL={ANIWORLD_ALPHABET_URL}" @@ -239,6 +280,21 @@ def _ensure_dir(candidates: list[Path], label: str) -> Path: logger.debug(f"MEGAKINO_BASE_URL={MEGAKINO_BASE_URL}") logger.debug(f"MEGAKINO_TITLES_REFRESH_HOURS={MEGAKINO_TITLES_REFRESH_HOURS}") logger.debug(f"MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN={MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN}") 
+logger.debug("PROVIDER_INDEX_REFRESH_HOURS={}", PROVIDER_INDEX_REFRESH_HOURS) +logger.debug( + "Provider index refresh overrides: aniworld={} sto={} megakino={}", + PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD, + PROVIDER_INDEX_REFRESH_HOURS_STO, + PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO, +) +logger.debug( + "Provider index scheduler: poll_seconds={} global_concurrency={} per_provider=({}, {}, {})", + PROVIDER_INDEX_SCHEDULER_POLL_SECONDS, + PROVIDER_INDEX_GLOBAL_CONCURRENCY, + PROVIDER_INDEX_CONCURRENCY_ANIWORLD, + PROVIDER_INDEX_CONCURRENCY_STO, + PROVIDER_INDEX_CONCURRENCY_MEGAKINO, +) # TTL (Stunden) für Live-Index; 0 = nie neu laden (nur einmal pro Prozess) ANIWORLD_TITLES_REFRESH_HOURS = float(os.getenv("ANIWORLD_TITLES_REFRESH_HOURS", "24")) @@ -266,6 +322,8 @@ def _ensure_dir(candidates: list[Path], label: str) -> Path: "alphabet_html": ANIWORLD_ALPHABET_HTML, "alphabet_url": ANIWORLD_ALPHABET_URL, "titles_refresh_hours": ANIWORLD_TITLES_REFRESH_HOURS, + "provider_index_refresh_hours": PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD, + "provider_index_concurrency": PROVIDER_INDEX_CONCURRENCY_ANIWORLD, "default_languages": ["German Dub", "German Sub", "English Sub"], "release_group": RELEASE_GROUP_ANIWORLD, }, @@ -274,6 +332,8 @@ def _ensure_dir(candidates: list[Path], label: str) -> Path: "alphabet_html": STO_ALPHABET_HTML, "alphabet_url": STO_ALPHABET_URL, "titles_refresh_hours": STO_TITLES_REFRESH_HOURS, + "provider_index_refresh_hours": PROVIDER_INDEX_REFRESH_HOURS_STO, + "provider_index_concurrency": PROVIDER_INDEX_CONCURRENCY_STO, "default_languages": ["German Dub", "English Dub"], "release_group": RELEASE_GROUP_STO, }, @@ -282,6 +342,8 @@ def _ensure_dir(candidates: list[Path], label: str) -> Path: "alphabet_html": None, "alphabet_url": None, "titles_refresh_hours": MEGAKINO_TITLES_REFRESH_HOURS, + "provider_index_refresh_hours": PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO, + "provider_index_concurrency": PROVIDER_INDEX_CONCURRENCY_MEGAKINO, "default_languages": ["Deutsch", 
"German Dub"], "release_group": "megakino", }, diff --git a/apps/api/app/core/lifespan.py b/apps/api/app/core/lifespan.py index 0c0c0631..9defbdf9 100644 --- a/apps/api/app/core/lifespan.py +++ b/apps/api/app/core/lifespan.py @@ -30,6 +30,7 @@ ) from app.core.scheduler import init_executor, shutdown_executor +from app.catalog import get_catalog_indexer from app.db import ( engine, dispose_engine, @@ -129,6 +130,10 @@ async def lifespan(app: FastAPI): if cleaned: logger.warning(f"Reset {cleaned} dangling jobs to 'failed'") init_executor() + try: + get_catalog_indexer().start() + except Exception as e: + logger.warning("provider catalog indexer start failed: {}", e) # Start background workers cleanup_stop = threading.Event() @@ -159,6 +164,10 @@ async def lifespan(app: FastAPI): finally: # Shutdown services shutdown_executor() + try: + get_catalog_indexer().stop() + except Exception as e: + logger.warning("provider catalog indexer stop failed: {}", e) try: from app.api import strm as strm_api diff --git a/apps/api/app/db/migrations/versions/20260428_0004_provider_catalog_index.py b/apps/api/app/db/migrations/versions/20260428_0004_provider_catalog_index.py new file mode 100644 index 00000000..536286c3 --- /dev/null +++ b/apps/api/app/db/migrations/versions/20260428_0004_provider_catalog_index.py @@ -0,0 +1,605 @@ +"""Add provider catalog index and canonical mapping tables + +Revision ID: 20260428_0004 +Revises: 20260204_0003 +Create Date: 2026-04-28 00:00:00.000000 +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260428_0004" +down_revision = "20260204_0003" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + conn = op.get_bind() + inspector = sa.inspect(conn) + + if not inspector.has_table("providerindexstatus"): + op.create_table( + "providerindexstatus", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("refresh_interval_hours", sa.Float(), nullable=False), + 
sa.Column("status", sa.String(), nullable=False), + sa.Column("current_generation", sa.String(), nullable=True), + sa.Column("latest_success_generation", sa.String(), nullable=True), + sa.Column("latest_started_at", sa.DateTime(), nullable=True), + sa.Column("latest_completed_at", sa.DateTime(), nullable=True), + sa.Column("latest_success_at", sa.DateTime(), nullable=True), + sa.Column("next_refresh_after", sa.DateTime(), nullable=True), + sa.Column("bootstrap_completed", sa.Boolean(), nullable=False), + sa.Column("failure_count", sa.Integer(), nullable=False), + sa.Column("last_error_summary", sa.String(), nullable=True), + sa.Column("cursor_title_slug", sa.String(), nullable=True), + sa.Column("updated_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint("provider", name="pk_providerindexstatus"), + ) + op.create_index( + "ix_providerindexstatus_status", + "providerindexstatus", + ["status"], + unique=False, + ) + op.create_index( + "ix_providerindexstatus_latest_started_at", + "providerindexstatus", + ["latest_started_at"], + unique=False, + ) + op.create_index( + "ix_providerindexstatus_latest_completed_at", + "providerindexstatus", + ["latest_completed_at"], + unique=False, + ) + op.create_index( + "ix_providerindexstatus_latest_success_at", + "providerindexstatus", + ["latest_success_at"], + unique=False, + ) + op.create_index( + "ix_providerindexstatus_next_refresh_after", + "providerindexstatus", + ["next_refresh_after"], + unique=False, + ) + op.create_index( + "ix_providerindexstatus_bootstrap_completed", + "providerindexstatus", + ["bootstrap_completed"], + unique=False, + ) + op.create_index( + "ix_providerindexstatus_updated_at", + "providerindexstatus", + ["updated_at"], + unique=False, + ) + + if not inspector.has_table("providertitleindexstate"): + op.create_table( + "providertitleindexstate", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("last_attempted_at", 
sa.DateTime(), nullable=True), + sa.Column("last_success_at", sa.DateTime(), nullable=True), + sa.Column("failure_count", sa.Integer(), nullable=False), + sa.Column("last_error_summary", sa.String(), nullable=True), + sa.Column("updated_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", "slug", name="pk_providertitleindexstate" + ), + ) + op.create_index( + "ix_providertitleindexstate_last_attempted_at", + "providertitleindexstate", + ["last_attempted_at"], + unique=False, + ) + op.create_index( + "ix_providertitleindexstate_last_success_at", + "providertitleindexstate", + ["last_success_at"], + unique=False, + ) + op.create_index( + "ix_providertitleindexstate_updated_at", + "providertitleindexstate", + ["updated_at"], + unique=False, + ) + + if not inspector.has_table("providercatalogtitle"): + op.create_table( + "providercatalogtitle", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("title", sa.String(), nullable=False), + sa.Column("normalized_title", sa.String(), nullable=False), + sa.Column("media_type_hint", sa.String(), nullable=False), + sa.Column("relative_path", sa.String(), nullable=False), + sa.Column("indexed_generation", sa.String(), nullable=False), + sa.Column("last_indexed_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint("provider", "slug", name="pk_providercatalogtitle"), + ) + op.create_index( + "ix_providercatalogtitle_title", + "providercatalogtitle", + ["title"], + unique=False, + ) + op.create_index( + "ix_providercatalogtitle_normalized_title", + "providercatalogtitle", + ["normalized_title"], + unique=False, + ) + op.create_index( + "ix_providercatalogtitle_media_type_hint", + "providercatalogtitle", + ["media_type_hint"], + unique=False, + ) + op.create_index( + "ix_providercatalogtitle_indexed_generation", + "providercatalogtitle", + ["indexed_generation"], + unique=False, + ) + op.create_index( + 
"ix_providercatalogtitle_last_indexed_at", + "providercatalogtitle", + ["last_indexed_at"], + unique=False, + ) + + if not inspector.has_table("providercatalogalias"): + op.create_table( + "providercatalogalias", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("alias", sa.String(), nullable=False), + sa.Column("normalized_alias", sa.String(), nullable=False), + sa.Column("indexed_generation", sa.String(), nullable=False), + sa.Column("last_indexed_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", "slug", "alias", name="pk_providercatalogalias" + ), + ) + op.create_index( + "ix_providercatalogalias_normalized_alias", + "providercatalogalias", + ["normalized_alias"], + unique=False, + ) + op.create_index( + "ix_providercatalogalias_indexed_generation", + "providercatalogalias", + ["indexed_generation"], + unique=False, + ) + op.create_index( + "ix_providercatalogalias_last_indexed_at", + "providercatalogalias", + ["last_indexed_at"], + unique=False, + ) + + if not inspector.has_table("providercatalogepisode"): + op.create_table( + "providercatalogepisode", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("season", sa.Integer(), nullable=False), + sa.Column("episode", sa.Integer(), nullable=False), + sa.Column("title_primary", sa.String(), nullable=True), + sa.Column("title_secondary", sa.String(), nullable=True), + sa.Column("relative_path", sa.String(), nullable=False), + sa.Column("media_type_hint", sa.String(), nullable=False), + sa.Column("indexed_generation", sa.String(), nullable=False), + sa.Column("last_indexed_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", "slug", "season", "episode", + name="pk_providercatalogepisode", + ), + ) + op.create_index( + "ix_providercatalogepisode_media_type_hint", + "providercatalogepisode", + ["media_type_hint"], + unique=False, + ) + 
op.create_index( + "ix_providercatalogepisode_indexed_generation", + "providercatalogepisode", + ["indexed_generation"], + unique=False, + ) + op.create_index( + "ix_providercatalogepisode_last_indexed_at", + "providercatalogepisode", + ["last_indexed_at"], + unique=False, + ) + + if not inspector.has_table("providerepisodelanguage"): + op.create_table( + "providerepisodelanguage", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("season", sa.Integer(), nullable=False), + sa.Column("episode", sa.Integer(), nullable=False), + sa.Column("language", sa.String(), nullable=False), + sa.Column("normalized_language", sa.String(), nullable=False), + sa.Column("host_hints", sa.JSON(), nullable=True), + sa.Column("indexed_generation", sa.String(), nullable=False), + sa.Column("last_indexed_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", "slug", "season", "episode", "language", + name="pk_providerepisodelanguage", + ), + ) + op.create_index( + "ix_providerepisodelanguage_normalized_language", + "providerepisodelanguage", + ["normalized_language"], + unique=False, + ) + op.create_index( + "ix_providerepisodelanguage_indexed_generation", + "providerepisodelanguage", + ["indexed_generation"], + unique=False, + ) + op.create_index( + "ix_providerepisodelanguage_last_indexed_at", + "providerepisodelanguage", + ["last_indexed_at"], + unique=False, + ) + + if not inspector.has_table("canonicalseries"): + op.create_table( + "canonicalseries", + sa.Column("tvdb_id", sa.Integer(), nullable=False), + sa.Column("title", sa.String(), nullable=False), + sa.Column("normalized_title", sa.String(), nullable=False), + sa.Column("tmdb_id", sa.Integer(), nullable=True), + sa.Column("imdb_id", sa.String(), nullable=True), + sa.Column("tvmaze_id", sa.Integer(), nullable=True), + sa.Column("anilist_id", sa.Integer(), nullable=True), + sa.Column("mal_id", sa.Integer(), nullable=True), + 
sa.Column("last_synced_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint("tvdb_id", name="pk_canonicalseries"), + ) + op.create_index( + "ix_canonicalseries_title", "canonicalseries", ["title"], unique=False + ) + op.create_index( + "ix_canonicalseries_normalized_title", + "canonicalseries", + ["normalized_title"], + unique=False, + ) + op.create_index( + "ix_canonicalseries_tmdb_id", "canonicalseries", ["tmdb_id"], unique=False + ) + op.create_index( + "ix_canonicalseries_imdb_id", "canonicalseries", ["imdb_id"], unique=False + ) + op.create_index( + "ix_canonicalseries_tvmaze_id", + "canonicalseries", + ["tvmaze_id"], + unique=False, + ) + op.create_index( + "ix_canonicalseries_anilist_id", + "canonicalseries", + ["anilist_id"], + unique=False, + ) + op.create_index( + "ix_canonicalseries_mal_id", "canonicalseries", ["mal_id"], unique=False + ) + op.create_index( + "ix_canonicalseries_last_synced_at", + "canonicalseries", + ["last_synced_at"], + unique=False, + ) + + if not inspector.has_table("canonicalseriesalias"): + op.create_table( + "canonicalseriesalias", + sa.Column("tvdb_id", sa.Integer(), nullable=False), + sa.Column("alias", sa.String(), nullable=False), + sa.Column("normalized_alias", sa.String(), nullable=False), + sa.PrimaryKeyConstraint( + "tvdb_id", "alias", name="pk_canonicalseriesalias" + ), + ) + op.create_index( + "ix_canonicalseriesalias_normalized_alias", + "canonicalseriesalias", + ["normalized_alias"], + unique=False, + ) + + if not inspector.has_table("canonicalepisode"): + op.create_table( + "canonicalepisode", + sa.Column("tvdb_id", sa.Integer(), nullable=False), + sa.Column("season", sa.Integer(), nullable=False), + sa.Column("episode", sa.Integer(), nullable=False), + sa.Column("title", sa.String(), nullable=False), + sa.Column("normalized_title", sa.String(), nullable=False), + sa.Column("last_synced_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "tvdb_id", "season", "episode", 
name="pk_canonicalepisode" + ), + ) + op.create_index( + "ix_canonicalepisode_title", + "canonicalepisode", + ["title"], + unique=False, + ) + op.create_index( + "ix_canonicalepisode_normalized_title", + "canonicalepisode", + ["normalized_title"], + unique=False, + ) + op.create_index( + "ix_canonicalepisode_last_synced_at", + "canonicalepisode", + ["last_synced_at"], + unique=False, + ) + + if not inspector.has_table("canonicalmovie"): + op.create_table( + "canonicalmovie", + sa.Column("tmdb_id", sa.Integer(), nullable=False), + sa.Column("title", sa.String(), nullable=False), + sa.Column("normalized_title", sa.String(), nullable=False), + sa.Column("release_year", sa.Integer(), nullable=False), + sa.Column("imdb_id", sa.String(), nullable=True), + sa.Column("tvdb_id", sa.Integer(), nullable=True), + sa.Column("last_synced_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint("tmdb_id", name="pk_canonicalmovie"), + ) + op.create_index( + "ix_canonicalmovie_title", "canonicalmovie", ["title"], unique=False + ) + op.create_index( + "ix_canonicalmovie_normalized_title", + "canonicalmovie", + ["normalized_title"], + unique=False, + ) + op.create_index( + "ix_canonicalmovie_release_year", + "canonicalmovie", + ["release_year"], + unique=False, + ) + op.create_index( + "ix_canonicalmovie_imdb_id", "canonicalmovie", ["imdb_id"], unique=False + ) + op.create_index( + "ix_canonicalmovie_tvdb_id", "canonicalmovie", ["tvdb_id"], unique=False + ) + op.create_index( + "ix_canonicalmovie_last_synced_at", + "canonicalmovie", + ["last_synced_at"], + unique=False, + ) + + if not inspector.has_table("providerseriesmapping"): + op.create_table( + "providerseriesmapping", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("tvdb_id", sa.Integer(), nullable=False), + sa.Column("confidence", sa.String(), nullable=False), + sa.Column("source", sa.String(), nullable=False), + sa.Column("rationale", sa.String(), 
nullable=True), + sa.Column("last_verified_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", "slug", "tvdb_id", name="pk_providerseriesmapping" + ), + ) + op.create_index( + "ix_providerseriesmapping_confidence", + "providerseriesmapping", + ["confidence"], + unique=False, + ) + op.create_index( + "ix_providerseriesmapping_source", + "providerseriesmapping", + ["source"], + unique=False, + ) + op.create_index( + "ix_providerseriesmapping_last_verified_at", + "providerseriesmapping", + ["last_verified_at"], + unique=False, + ) + + if not inspector.has_table("providerepisodemapping"): + op.create_table( + "providerepisodemapping", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("provider_season", sa.Integer(), nullable=False), + sa.Column("provider_episode", sa.Integer(), nullable=False), + sa.Column("tvdb_id", sa.Integer(), nullable=False), + sa.Column("canonical_season", sa.Integer(), nullable=False), + sa.Column("canonical_episode", sa.Integer(), nullable=False), + sa.Column("confidence", sa.String(), nullable=False), + sa.Column("source", sa.String(), nullable=False), + sa.Column("rationale", sa.String(), nullable=True), + sa.Column("last_verified_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", + "slug", + "provider_season", + "provider_episode", + "tvdb_id", + "canonical_season", + "canonical_episode", + name="pk_providerepisodemapping", + ), + ) + op.create_index( + "ix_providerepisodemapping_confidence", + "providerepisodemapping", + ["confidence"], + unique=False, + ) + op.create_index( + "ix_providerepisodemapping_source", + "providerepisodemapping", + ["source"], + unique=False, + ) + op.create_index( + "ix_providerepisodemapping_last_verified_at", + "providerepisodemapping", + ["last_verified_at"], + unique=False, + ) + + if not inspector.has_table("providermoviemapping"): + op.create_table( + "providermoviemapping", + 
sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("tmdb_id", sa.Integer(), nullable=False), + sa.Column("confidence", sa.String(), nullable=False), + sa.Column("source", sa.String(), nullable=False), + sa.Column("rationale", sa.String(), nullable=True), + sa.Column("last_verified_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", "slug", "tmdb_id", name="pk_providermoviemapping" + ), + ) + op.create_index( + "ix_providermoviemapping_confidence", + "providermoviemapping", + ["confidence"], + unique=False, + ) + op.create_index( + "ix_providermoviemapping_source", + "providermoviemapping", + ["source"], + unique=False, + ) + op.create_index( + "ix_providermoviemapping_last_verified_at", + "providermoviemapping", + ["last_verified_at"], + unique=False, + ) + + +def downgrade() -> None: + conn = op.get_bind() + inspector = sa.inspect(conn) + + table_indexes = { + "providermoviemapping": [ + "ix_providermoviemapping_last_verified_at", + "ix_providermoviemapping_source", + "ix_providermoviemapping_confidence", + ], + "providerepisodemapping": [ + "ix_providerepisodemapping_last_verified_at", + "ix_providerepisodemapping_source", + "ix_providerepisodemapping_confidence", + ], + "providerseriesmapping": [ + "ix_providerseriesmapping_last_verified_at", + "ix_providerseriesmapping_source", + "ix_providerseriesmapping_confidence", + ], + "canonicalmovie": [ + "ix_canonicalmovie_last_synced_at", + "ix_canonicalmovie_tvdb_id", + "ix_canonicalmovie_imdb_id", + "ix_canonicalmovie_release_year", + "ix_canonicalmovie_normalized_title", + "ix_canonicalmovie_title", + ], + "canonicalepisode": [ + "ix_canonicalepisode_last_synced_at", + "ix_canonicalepisode_normalized_title", + "ix_canonicalepisode_title", + ], + "canonicalseriesalias": ["ix_canonicalseriesalias_normalized_alias"], + "canonicalseries": [ + "ix_canonicalseries_last_synced_at", + "ix_canonicalseries_mal_id", + 
"ix_canonicalseries_anilist_id", + "ix_canonicalseries_tvmaze_id", + "ix_canonicalseries_imdb_id", + "ix_canonicalseries_tmdb_id", + "ix_canonicalseries_normalized_title", + "ix_canonicalseries_title", + ], + "providerepisodelanguage": [ + "ix_providerepisodelanguage_last_indexed_at", + "ix_providerepisodelanguage_indexed_generation", + "ix_providerepisodelanguage_normalized_language", + ], + "providercatalogepisode": [ + "ix_providercatalogepisode_last_indexed_at", + "ix_providercatalogepisode_indexed_generation", + "ix_providercatalogepisode_media_type_hint", + ], + "providercatalogalias": [ + "ix_providercatalogalias_last_indexed_at", + "ix_providercatalogalias_indexed_generation", + "ix_providercatalogalias_normalized_alias", + ], + "providercatalogtitle": [ + "ix_providercatalogtitle_last_indexed_at", + "ix_providercatalogtitle_indexed_generation", + "ix_providercatalogtitle_media_type_hint", + "ix_providercatalogtitle_normalized_title", + "ix_providercatalogtitle_title", + ], + "providertitleindexstate": [ + "ix_providertitleindexstate_updated_at", + "ix_providertitleindexstate_last_success_at", + "ix_providertitleindexstate_last_attempted_at", + ], + "providerindexstatus": [ + "ix_providerindexstatus_updated_at", + "ix_providerindexstatus_bootstrap_completed", + "ix_providerindexstatus_next_refresh_after", + "ix_providerindexstatus_latest_success_at", + "ix_providerindexstatus_latest_completed_at", + "ix_providerindexstatus_latest_started_at", + "ix_providerindexstatus_status", + ], + } + for table, indexes in table_indexes.items(): + if not inspector.has_table(table): + continue + for index in indexes: + op.drop_index(index, table_name=table) + op.drop_table(table) diff --git a/apps/api/app/db/models.py b/apps/api/app/db/models.py index ffcaabd4..3d7db2b7 100644 --- a/apps/api/app/db/models.py +++ b/apps/api/app/db/models.py @@ -3,6 +3,8 @@ from typing import Optional, Literal, Generator, Any, Dict, List, TYPE_CHECKING from datetime import datetime, 
timezone, timedelta from uuid import uuid4 +import re +import unicodedata from loguru import logger from fastapi import HTTPException @@ -18,6 +20,19 @@ from app.config import AVAILABILITY_TTL_HOURS, DATA_DIR JobStatus = Literal["queued", "downloading", "completed", "failed", "cancelled"] +CatalogRefreshStatus = Literal[ + "pending", + "running", + "ready", + "failed", +] +CatalogMappingConfidence = Literal[ + "confirmed", + "high_confidence", + "low_confidence", + "unresolved", + "conflict", +] # ---- Datetime Helpers @@ -164,6 +179,150 @@ class ClientTask(ModelBase, table=True): ) # queued/downloading/paused/completed/error +# ---------------- Provider Catalog Index +class ProviderIndexStatus(ModelBase, table=True): + provider: str = Field(primary_key=True) + refresh_interval_hours: float = 24.0 + status: str = Field(default="pending", index=True) + current_generation: Optional[str] = None + latest_success_generation: Optional[str] = None + latest_started_at: Optional[datetime] = Field(default=None, index=True) + latest_completed_at: Optional[datetime] = Field(default=None, index=True) + latest_success_at: Optional[datetime] = Field(default=None, index=True) + next_refresh_after: Optional[datetime] = Field(default=None, index=True) + bootstrap_completed: bool = Field(default=False, index=True) + failure_count: int = 0 + last_error_summary: Optional[str] = None + cursor_title_slug: Optional[str] = None + updated_at: datetime = Field(default_factory=utcnow, index=True) + + +class ProviderTitleIndexState(ModelBase, table=True): + provider: str = Field(primary_key=True) + slug: str = Field(primary_key=True) + last_attempted_at: Optional[datetime] = Field(default=None, index=True) + last_success_at: Optional[datetime] = Field(default=None, index=True) + failure_count: int = 0 + last_error_summary: Optional[str] = None + updated_at: datetime = Field(default_factory=utcnow, index=True) + + +class ProviderCatalogTitle(ModelBase, table=True): + provider: str = 
Field(primary_key=True) + slug: str = Field(primary_key=True) + title: str = Field(index=True) + normalized_title: str = Field(index=True) + media_type_hint: str = Field(default="series", index=True) + relative_path: str + indexed_generation: str = Field(index=True) + last_indexed_at: datetime = Field(default_factory=utcnow, index=True) + + +class ProviderCatalogAlias(ModelBase, table=True): + provider: str = Field(primary_key=True) + slug: str = Field(primary_key=True) + alias: str = Field(primary_key=True) + normalized_alias: str = Field(index=True) + indexed_generation: str = Field(index=True) + last_indexed_at: datetime = Field(default_factory=utcnow, index=True) + + +class ProviderCatalogEpisode(ModelBase, table=True): + provider: str = Field(primary_key=True) + slug: str = Field(primary_key=True) + season: int = Field(primary_key=True) + episode: int = Field(primary_key=True) + title_primary: Optional[str] = None + title_secondary: Optional[str] = None + relative_path: str + media_type_hint: str = Field(default="episode", index=True) + indexed_generation: str = Field(index=True) + last_indexed_at: datetime = Field(default_factory=utcnow, index=True) + + +class ProviderEpisodeLanguage(ModelBase, table=True): + provider: str = Field(primary_key=True) + slug: str = Field(primary_key=True) + season: int = Field(primary_key=True) + episode: int = Field(primary_key=True) + language: str = Field(primary_key=True) + normalized_language: str = Field(index=True) + host_hints: Optional[list[str]] = Field(sa_column=Column(JSON), default=None) + indexed_generation: str = Field(index=True) + last_indexed_at: datetime = Field(default_factory=utcnow, index=True) + + +class CanonicalSeries(ModelBase, table=True): + tvdb_id: int = Field(primary_key=True) + title: str = Field(index=True) + normalized_title: str = Field(index=True) + tmdb_id: Optional[int] = Field(default=None, index=True) + imdb_id: Optional[str] = Field(default=None, index=True) + tvmaze_id: Optional[int] = 
Field(default=None, index=True) + anilist_id: Optional[int] = Field(default=None, index=True) + mal_id: Optional[int] = Field(default=None, index=True) + last_synced_at: datetime = Field(default_factory=utcnow, index=True) + + +class CanonicalSeriesAlias(ModelBase, table=True): + tvdb_id: int = Field(primary_key=True) + alias: str = Field(primary_key=True) + normalized_alias: str = Field(index=True) + + +class CanonicalEpisode(ModelBase, table=True): + tvdb_id: int = Field(primary_key=True) + season: int = Field(primary_key=True) + episode: int = Field(primary_key=True) + title: str = Field(index=True) + normalized_title: str = Field(index=True) + last_synced_at: datetime = Field(default_factory=utcnow, index=True) + + +class CanonicalMovie(ModelBase, table=True): + tmdb_id: int = Field(primary_key=True) + title: str = Field(index=True) + normalized_title: str = Field(index=True) + release_year: int = Field(index=True) + imdb_id: Optional[str] = Field(default=None, index=True) + tvdb_id: Optional[int] = Field(default=None, index=True) + last_synced_at: datetime = Field(default_factory=utcnow, index=True) + + +class ProviderSeriesMapping(ModelBase, table=True): + provider: str = Field(primary_key=True) + slug: str = Field(primary_key=True) + tvdb_id: int = Field(primary_key=True) + confidence: str = Field(default="unresolved", index=True) + source: str = Field(default="title_match", index=True) + rationale: Optional[str] = None + last_verified_at: datetime = Field(default_factory=utcnow, index=True) + + +class ProviderEpisodeMapping(ModelBase, table=True): + provider: str = Field(primary_key=True) + slug: str = Field(primary_key=True) + provider_season: int = Field(primary_key=True) + provider_episode: int = Field(primary_key=True) + tvdb_id: int = Field(primary_key=True) + canonical_season: int = Field(primary_key=True) + canonical_episode: int = Field(primary_key=True) + confidence: str = Field(default="unresolved", index=True) + source: str = 
def normalize_catalog_text(value: str) -> str:
    """Return a lowercase, ASCII-only matching key for a title or alias.

    The text is case-folded, compatibility-decomposed (NFKD) and stripped of
    combining marks, so ``"Pokémon"`` becomes ``"pokemon"`` and ``"Straße"``
    becomes ``"strasse"``. Every run of characters outside ``a-z``/``0-9`` —
    punctuation, apostrophes, whitespace and letters with no ASCII
    decomposition — collapses into a single space, and leading/trailing
    spaces are removed.

    Args:
        value: Raw text; ``None`` or an empty string yields ``""``.

    Returns:
        The normalized key, possibly empty.
    """
    # casefold() rather than lower(): it expands letters such as "ß" to "ss",
    # which the ASCII filter below would otherwise turn into a word break.
    folded = (value or "").casefold()
    decomposed = unicodedata.normalize("NFKD", folded)
    without_marks = "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    return re.sub(r"[^a-z0-9]+", " ", without_marks).strip()
refresh_interval_hours + if status is not None: + rec.status = status + if current_generation is not None: + rec.current_generation = current_generation + if latest_success_generation is not None: + rec.latest_success_generation = latest_success_generation + if latest_started_at is not None: + rec.latest_started_at = latest_started_at + if latest_completed_at is not None: + rec.latest_completed_at = latest_completed_at + if latest_success_at is not None: + rec.latest_success_at = latest_success_at + if next_refresh_after is not None: + rec.next_refresh_after = next_refresh_after + if bootstrap_completed is not None: + rec.bootstrap_completed = bootstrap_completed + if failure_count is not None: + rec.failure_count = failure_count + if last_error_summary is not None: + rec.last_error_summary = last_error_summary + if cursor_title_slug is not None or status == "ready": + rec.cursor_title_slug = cursor_title_slug + rec.updated_at = utcnow() + session.add(rec) + session.commit() + session.refresh(rec) + return rec + + +def get_provider_index_status( + session: Session, + *, + provider: str, +) -> Optional[ProviderIndexStatus]: + return session.get(ProviderIndexStatus, provider) + + +def list_provider_index_statuses(session: Session) -> List[ProviderIndexStatus]: + return list(session.exec(select(ProviderIndexStatus)).all()) + + +def upsert_provider_title_index_state( + session: Session, + *, + provider: str, + slug: str, + attempted_at: Optional[datetime] = None, + succeeded_at: Optional[datetime] = None, + failure_count: Optional[int] = None, + last_error_summary: Optional[str] = None, +) -> ProviderTitleIndexState: + rec = session.get(ProviderTitleIndexState, (provider, slug)) + if rec is None: + rec = ProviderTitleIndexState(provider=provider, slug=slug) + if attempted_at is not None: + rec.last_attempted_at = attempted_at + if succeeded_at is not None: + rec.last_success_at = succeeded_at + if failure_count is not None: + rec.failure_count = failure_count + if 
last_error_summary is not None: + rec.last_error_summary = last_error_summary + rec.updated_at = utcnow() + session.add(rec) + session.commit() + session.refresh(rec) + return rec + + +def replace_provider_catalog_title( + session: Session, + *, + provider: str, + slug: str, + title: str, + media_type_hint: str, + relative_path: str, + indexed_generation: str, +) -> ProviderCatalogTitle: + rec = session.get(ProviderCatalogTitle, (provider, slug)) + if rec is None: + rec = ProviderCatalogTitle( + provider=provider, + slug=slug, + title=title, + normalized_title=normalize_catalog_text(title), + media_type_hint=media_type_hint, + relative_path=relative_path, + indexed_generation=indexed_generation, + last_indexed_at=utcnow(), + ) + else: + rec.title = title + rec.normalized_title = normalize_catalog_text(title) + rec.media_type_hint = media_type_hint + rec.relative_path = relative_path + rec.indexed_generation = indexed_generation + rec.last_indexed_at = utcnow() + session.add(rec) + return rec + + +def replace_provider_catalog_aliases( + session: Session, + *, + provider: str, + slug: str, + aliases: List[str], + indexed_generation: str, +) -> None: + session.exec( + select(ProviderCatalogAlias).where( + (ProviderCatalogAlias.provider == provider) + & (ProviderCatalogAlias.slug == slug) + ) + ).all() + session.exec( + ProviderCatalogAlias.__table__.delete().where( + (ProviderCatalogAlias.provider == provider) + & (ProviderCatalogAlias.slug == slug) + ) + ) + seen: set[str] = set() + for alias in aliases: + alias_clean = (alias or "").strip() + if not alias_clean or alias_clean in seen: + continue + seen.add(alias_clean) + session.add( + ProviderCatalogAlias( + provider=provider, + slug=slug, + alias=alias_clean, + normalized_alias=normalize_catalog_text(alias_clean), + indexed_generation=indexed_generation, + last_indexed_at=utcnow(), + ) + ) + + +def replace_provider_catalog_episodes( + session: Session, + *, + provider: str, + slug: str, + episodes: List[dict[str, 
Any]], + indexed_generation: str, +) -> None: + session.exec( + ProviderEpisodeLanguage.__table__.delete().where( + (ProviderEpisodeLanguage.provider == provider) + & (ProviderEpisodeLanguage.slug == slug) + ) + ) + session.exec( + ProviderCatalogEpisode.__table__.delete().where( + (ProviderCatalogEpisode.provider == provider) + & (ProviderCatalogEpisode.slug == slug) + ) + ) + for item in episodes: + session.add( + ProviderCatalogEpisode( + provider=provider, + slug=slug, + season=int(item["season"]), + episode=int(item["episode"]), + title_primary=item.get("title_primary"), + title_secondary=item.get("title_secondary"), + relative_path=item["relative_path"], + media_type_hint=item.get("media_type_hint", "episode"), + indexed_generation=indexed_generation, + last_indexed_at=utcnow(), + ) + ) + for language_payload in item.get("languages", []): + language = str(language_payload.get("language") or "").strip() + if not language: + continue + session.add( + ProviderEpisodeLanguage( + provider=provider, + slug=slug, + season=int(item["season"]), + episode=int(item["episode"]), + language=language, + normalized_language=normalize_catalog_text(language), + host_hints=list(language_payload.get("host_hints") or []), + indexed_generation=indexed_generation, + last_indexed_at=utcnow(), + ) + ) + + +def prune_provider_generation( + session: Session, + *, + provider: str, + keep_generation: str, +) -> None: + session.exec( + ProviderCatalogAlias.__table__.delete().where( + (ProviderCatalogAlias.provider == provider) + & (ProviderCatalogAlias.indexed_generation != keep_generation) + ) + ) + session.exec( + ProviderEpisodeLanguage.__table__.delete().where( + (ProviderEpisodeLanguage.provider == provider) + & (ProviderEpisodeLanguage.indexed_generation != keep_generation) + ) + ) + session.exec( + ProviderCatalogEpisode.__table__.delete().where( + (ProviderCatalogEpisode.provider == provider) + & (ProviderCatalogEpisode.indexed_generation != keep_generation) + ) + ) + 
session.exec( + ProviderCatalogTitle.__table__.delete().where( + (ProviderCatalogTitle.provider == provider) + & (ProviderCatalogTitle.indexed_generation != keep_generation) + ) + ) + session.commit() + + +def delete_provider_generation( + session: Session, + *, + provider: str, + generation: str, +) -> None: + session.exec( + ProviderCatalogAlias.__table__.delete().where( + (ProviderCatalogAlias.provider == provider) + & (ProviderCatalogAlias.indexed_generation == generation) + ) + ) + session.exec( + ProviderEpisodeLanguage.__table__.delete().where( + (ProviderEpisodeLanguage.provider == provider) + & (ProviderEpisodeLanguage.indexed_generation == generation) + ) + ) + session.exec( + ProviderCatalogEpisode.__table__.delete().where( + (ProviderCatalogEpisode.provider == provider) + & (ProviderCatalogEpisode.indexed_generation == generation) + ) + ) + session.exec( + ProviderCatalogTitle.__table__.delete().where( + (ProviderCatalogTitle.provider == provider) + & (ProviderCatalogTitle.indexed_generation == generation) + ) + ) + session.commit() + + +def _visible_generation_map( + session: Session, + *, + providers: List[str], +) -> dict[str, str]: + rows = session.exec( + select(ProviderIndexStatus).where(ProviderIndexStatus.provider.in_(providers)) + ).all() + return { + row.provider: row.latest_success_generation + for row in rows + if row.latest_success_generation + } + + +def replace_provider_series_mappings( + session: Session, + *, + provider: str, + slug: str, + mappings: List[dict[str, Any]], +) -> None: + session.exec( + ProviderSeriesMapping.__table__.delete().where( + (ProviderSeriesMapping.provider == provider) + & (ProviderSeriesMapping.slug == slug) + ) + ) + for mapping in mappings: + session.add( + ProviderSeriesMapping( + provider=provider, + slug=slug, + tvdb_id=int(mapping["tvdb_id"]), + confidence=str(mapping.get("confidence", "unresolved")), + source=str(mapping.get("source", "title_match")), + rationale=mapping.get("rationale"), + 
last_verified_at=utcnow(), + ) + ) + + +def replace_provider_episode_mappings( + session: Session, + *, + provider: str, + slug: str, + mappings: List[dict[str, Any]], +) -> None: + session.exec( + ProviderEpisodeMapping.__table__.delete().where( + (ProviderEpisodeMapping.provider == provider) + & (ProviderEpisodeMapping.slug == slug) + ) + ) + for mapping in mappings: + session.add( + ProviderEpisodeMapping( + provider=provider, + slug=slug, + provider_season=int(mapping["provider_season"]), + provider_episode=int(mapping["provider_episode"]), + tvdb_id=int(mapping["tvdb_id"]), + canonical_season=int(mapping["canonical_season"]), + canonical_episode=int(mapping["canonical_episode"]), + confidence=str(mapping.get("confidence", "unresolved")), + source=str(mapping.get("source", "numbering")), + rationale=mapping.get("rationale"), + last_verified_at=utcnow(), + ) + ) + + +def replace_provider_movie_mappings( + session: Session, + *, + provider: str, + slug: str, + mappings: List[dict[str, Any]], +) -> None: + session.exec( + ProviderMovieMapping.__table__.delete().where( + (ProviderMovieMapping.provider == provider) + & (ProviderMovieMapping.slug == slug) + ) + ) + for mapping in mappings: + session.add( + ProviderMovieMapping( + provider=provider, + slug=slug, + tmdb_id=int(mapping["tmdb_id"]), + confidence=str(mapping.get("confidence", "unresolved")), + source=str(mapping.get("source", "title_year")), + rationale=mapping.get("rationale"), + last_verified_at=utcnow(), + ) + ) + + +def upsert_canonical_series( + session: Session, + *, + tvdb_id: int, + title: str, + tmdb_id: Optional[int] = None, + imdb_id: Optional[str] = None, + tvmaze_id: Optional[int] = None, + anilist_id: Optional[int] = None, + mal_id: Optional[int] = None, + aliases: Optional[List[str]] = None, +) -> CanonicalSeries: + rec = session.get(CanonicalSeries, tvdb_id) + if rec is None: + rec = CanonicalSeries(tvdb_id=tvdb_id, title=title, normalized_title="") + rec.title = title + 
rec.normalized_title = normalize_catalog_text(title) + rec.tmdb_id = tmdb_id + rec.imdb_id = imdb_id + rec.tvmaze_id = tvmaze_id + rec.anilist_id = anilist_id + rec.mal_id = mal_id + rec.last_synced_at = utcnow() + session.add(rec) + session.exec(CanonicalSeriesAlias.__table__.delete().where(CanonicalSeriesAlias.tvdb_id == tvdb_id)) + for alias in aliases or []: + alias_clean = (alias or "").strip() + if not alias_clean: + continue + session.add( + CanonicalSeriesAlias( + tvdb_id=tvdb_id, + alias=alias_clean, + normalized_alias=normalize_catalog_text(alias_clean), + ) + ) + return rec + + +def replace_canonical_episodes( + session: Session, + *, + tvdb_id: int, + episodes: List[dict[str, Any]], +) -> None: + session.exec( + CanonicalEpisode.__table__.delete().where(CanonicalEpisode.tvdb_id == tvdb_id) + ) + for episode in episodes: + title = str(episode.get("title") or "").strip() + if not title: + continue + session.add( + CanonicalEpisode( + tvdb_id=tvdb_id, + season=int(episode["season"]), + episode=int(episode["episode"]), + title=title, + normalized_title=normalize_catalog_text(title), + last_synced_at=utcnow(), + ) + ) + + +def is_catalog_bootstrap_ready( + session: Session, + *, + providers: List[str], +) -> bool: + if not providers: + return True + statuses = { + row.provider: row + for row in session.exec( + select(ProviderIndexStatus).where(ProviderIndexStatus.provider.in_(providers)) + ).all() + } + return all( + status is not None + and status.bootstrap_completed + and bool(status.latest_success_generation) + for status in (statuses.get(provider) for provider in providers) + ) + + +def catalog_title_count(session: Session, *, provider: Optional[str] = None) -> int: + stmt = select(ProviderCatalogTitle) + if provider: + stmt = stmt.where(ProviderCatalogTitle.provider == provider) + return len(session.exec(stmt).all()) + + +def resolve_indexed_title( + session: Session, + *, + provider: str, + slug: str, +) -> Optional[str]: + status = 
session.get(ProviderIndexStatus, provider) + if status is None or not status.latest_success_generation: + return None + row = session.get(ProviderCatalogTitle, (provider, slug)) + if row is None or row.indexed_generation != status.latest_success_generation: + return None + return row.title if row else None + + +def search_indexed_provider_titles( + session: Session, + *, + query: str, + providers: List[str], + media_type_hint: Optional[str] = None, + limit: int = 20, +) -> List[ProviderCatalogTitle]: + q_norm = normalize_catalog_text(query) + if not q_norm: + return [] + tokens = [token for token in q_norm.split(" ") if token] + if not tokens: + return [] + visible_generations = _visible_generation_map(session, providers=providers) + if not visible_generations: + return [] + stmt = select(ProviderCatalogTitle).where(ProviderCatalogTitle.provider.in_(providers)) + if media_type_hint is not None: + stmt = stmt.where(ProviderCatalogTitle.media_type_hint == media_type_hint) + rows = [ + row + for row in session.exec(stmt).all() + if visible_generations.get(row.provider) == row.indexed_generation + ] + + def _score(row: ProviderCatalogTitle) -> tuple[int, int]: + names = [row.normalized_title] + alias_rows = session.exec( + select(ProviderCatalogAlias).where( + (ProviderCatalogAlias.provider == row.provider) + & (ProviderCatalogAlias.slug == row.slug) + ) + ).all() + names.extend(alias.normalized_alias for alias in alias_rows) + best = 0 + exact = 0 + for name in names: + name_tokens = set(token for token in name.split(" ") if token) + overlap = len(name_tokens & set(tokens)) + if name == q_norm: + exact = 1 + if overlap > best: + best = overlap + return (exact, best) + + ranked = sorted( + rows, + key=lambda row: _score(row), + reverse=True, + ) + filtered = [row for row in ranked if _score(row)[1] > 0 or _score(row)[0] > 0] + return filtered[: max(1, limit)] + + +def list_indexed_titles_for_provider( + session: Session, + *, + provider: str, +) -> 
List[ProviderCatalogTitle]: + status = session.get(ProviderIndexStatus, provider) + if status is None or not status.latest_success_generation: + return [] + return list( + session.exec( + select(ProviderCatalogTitle).where( + (ProviderCatalogTitle.provider == provider) + & ( + ProviderCatalogTitle.indexed_generation + == status.latest_success_generation + ) + ) + ).all() + ) + + +def get_indexed_episode_languages( + session: Session, + *, + provider: str, + slug: str, + season: int, + episode: int, +) -> List[ProviderEpisodeLanguage]: + status = session.get(ProviderIndexStatus, provider) + if status is None or not status.latest_success_generation: + return [] + return list( + session.exec( + select(ProviderEpisodeLanguage).where( + (ProviderEpisodeLanguage.provider == provider) + & (ProviderEpisodeLanguage.slug == slug) + & (ProviderEpisodeLanguage.season == season) + & (ProviderEpisodeLanguage.episode == episode) + & ( + ProviderEpisodeLanguage.indexed_generation + == status.latest_success_generation + ) + ) + ).all() + ) + + +def list_indexed_provider_episodes( + session: Session, + *, + provider: str, + slug: str, +) -> List[ProviderCatalogEpisode]: + status = session.get(ProviderIndexStatus, provider) + if status is None or not status.latest_success_generation: + return [] + return list( + session.exec( + select(ProviderCatalogEpisode).where( + (ProviderCatalogEpisode.provider == provider) + & (ProviderCatalogEpisode.slug == slug) + & ( + ProviderCatalogEpisode.indexed_generation + == status.latest_success_generation + ) + ) + ).all() + ) + + +def list_indexed_episode_numbers_for_season( + session: Session, + *, + provider: str, + slug: str, + season: int, +) -> List[int]: + status = session.get(ProviderIndexStatus, provider) + if status is None or not status.latest_success_generation: + return [] + episodes = [ + int(row.episode) + for row in session.exec( + select(ProviderCatalogEpisode).where( + (ProviderCatalogEpisode.provider == provider) + & 
(ProviderCatalogEpisode.slug == slug) + & (ProviderCatalogEpisode.season == season) + & ( + ProviderCatalogEpisode.indexed_generation + == status.latest_success_generation + ) + ) + ).all() + ] + return sorted(set(episodes)) + + +def find_canonical_series_by_ids_or_title( + session: Session, + *, + tvdb_id: Optional[int] = None, + tmdb_id: Optional[int] = None, + imdb_id: Optional[str] = None, + query: Optional[str] = None, +) -> Optional[CanonicalSeries]: + if tvdb_id: + row = session.get(CanonicalSeries, tvdb_id) + if row is not None: + return row + if tmdb_id: + row = session.exec( + select(CanonicalSeries).where(CanonicalSeries.tmdb_id == tmdb_id) + ).first() + if row is not None: + return row + if imdb_id: + row = session.exec( + select(CanonicalSeries).where(CanonicalSeries.imdb_id == imdb_id) + ).first() + if row is not None: + return row + q_norm = normalize_catalog_text(query or "") + if not q_norm: + return None + row = session.exec( + select(CanonicalSeries).where(CanonicalSeries.normalized_title == q_norm) + ).first() + if row is not None: + return row + alias = session.exec( + select(CanonicalSeriesAlias).where(CanonicalSeriesAlias.normalized_alias == q_norm) + ).first() + if alias is not None: + return session.get(CanonicalSeries, alias.tvdb_id) + return None + + +def find_provider_episode_mappings_for_canonical_episode( + session: Session, + *, + tvdb_id: int, + canonical_season: int, + canonical_episode: int, + providers: List[str], +) -> List[ProviderEpisodeMapping]: + return list( + session.exec( + select(ProviderEpisodeMapping).where( + (ProviderEpisodeMapping.tvdb_id == tvdb_id) + & (ProviderEpisodeMapping.canonical_season == canonical_season) + & (ProviderEpisodeMapping.canonical_episode == canonical_episode) + & (ProviderEpisodeMapping.provider.in_(providers)) + & ProviderEpisodeMapping.confidence.in_( + ["confirmed", "high_confidence", "low_confidence"] + ) + ) + ).all() + ) + + +def find_provider_episode_mappings_for_canonical_season( + 
session: Session, + *, + tvdb_id: int, + canonical_season: int, + providers: List[str], +) -> List[ProviderEpisodeMapping]: + return list( + session.exec( + select(ProviderEpisodeMapping).where( + (ProviderEpisodeMapping.tvdb_id == tvdb_id) + & (ProviderEpisodeMapping.canonical_season == canonical_season) + & (ProviderEpisodeMapping.provider.in_(providers)) + & ProviderEpisodeMapping.confidence.in_( + ["confirmed", "high_confidence", "low_confidence"] + ) + ) + ).all() + ) + + +def find_provider_episode_mapping( + session: Session, + *, + provider: str, + slug: str, + provider_season: int, + provider_episode: int, +) -> Optional[ProviderEpisodeMapping]: + return session.exec( + select(ProviderEpisodeMapping).where( + (ProviderEpisodeMapping.provider == provider) + & (ProviderEpisodeMapping.slug == slug) + & (ProviderEpisodeMapping.provider_season == provider_season) + & (ProviderEpisodeMapping.provider_episode == provider_episode) + & ProviderEpisodeMapping.confidence.in_( + ["confirmed", "high_confidence", "low_confidence"] + ) + ) + ).first() + + # --- STRM URL Mapping CRUD def get_strm_mapping( session: Session, diff --git a/apps/api/app/utils/terminal.py b/apps/api/app/utils/terminal.py index 8443ada9..065446d7 100644 --- a/apps/api/app/utils/terminal.py +++ b/apps/api/app/utils/terminal.py @@ -41,8 +41,16 @@ class ProgressReporter: - Non-interactive: print an info line every PROGRESS_STEP_PERCENT. 
""" - def __init__(self, label: str) -> None: + def __init__( + self, + label: str, + *, + unit: str = "B", + unit_scale: bool = True, + ) -> None: self.label = label + self.unit = unit + self.unit_scale = unit_scale self._bar = None self._last_step_pct = -1 # last printed step percentage (integer) self._interactive = is_interactive_terminal() @@ -64,8 +72,8 @@ def update(self, snap: ProgressSnapshot) -> None: self._bar = tqdm( total=int(total), desc=self.label, - unit="B", - unit_scale=True, + unit=self.unit, + unit_scale=self.unit_scale, leave=True, file=bar_file, ascii=False, # force unicode blocks (█▉▊▌ etc.) @@ -106,15 +114,21 @@ def update(self, snap: ProgressSnapshot) -> None: else "-" ) eta = f"{int(snap.eta)}s" if snap.eta is not None else "-" + progress_text = ( + f"{downloaded}/{total} bytes" + if self.unit == "B" + else f"{downloaded}/{total} {self.unit}" + ) logger.info( - f"{self.label}: {pct}% ({downloaded}/{total} bytes) speed={speed} eta={eta}" + f"{self.label}: {pct}% ({progress_text}) speed={speed} eta={eta}" ) else: # Total unknown: avoid spamming; print on large increments threshold = 8 * 1024 * 1024 # 8 MiB if downloaded // threshold > self._last_step_pct: self._last_step_pct = downloaded // threshold - logger.info(f"{self.label}: downloaded {downloaded} bytes...") + suffix = "bytes" if self.unit == "B" else self.unit + logger.info(f"{self.label}: downloaded {downloaded} {suffix}...") def close(self) -> None: if self._bar is not None: diff --git a/apps/api/app/utils/title_resolver.py b/apps/api/app/utils/title_resolver.py index 04a424c1..b69f5b12 100644 --- a/apps/api/app/utils/title_resolver.py +++ b/apps/api/app/utils/title_resolver.py @@ -11,9 +11,17 @@ import requests.exceptions from bs4 import BeautifulSoup # type: ignore from loguru import logger +from sqlmodel import Session from app.utils.logger import config as configure_logger from app.utils.http_client import get as http_get # type: ignore +from app.catalog import 
get_catalog_readiness_error +from app.db import ( + engine, + list_indexed_titles_for_provider, + resolve_indexed_title, + search_indexed_provider_titles, +) from app.config import ( CATALOG_SITES_LIST, @@ -490,6 +498,18 @@ def resolve_series_title( if not slug: logger.warning("No slug provided to resolve_series_title.") return None + with Session(engine) as session: + title = resolve_indexed_title(session, provider=site, slug=slug) + if title: + logger.info(f"Resolved title for slug '{slug}' on {site}: {title}") + return title + if get_catalog_readiness_error() is None: + logger.warning( + "No indexed title found for slug '{}' on {} after catalog bootstrap.", + slug, + site, + ) + return None index = load_or_refresh_index(site) title = index.get(slug) if title: @@ -508,6 +528,14 @@ def load_or_refresh_alternatives(site: str = "aniworld.to") -> Dict[str, List[st Dict[str, List[str]]: Mapping from slug to list of alternative titles (primary title first). """ global _cached_alts + readiness_error = get_catalog_readiness_error() + if readiness_error is None: + with Session(engine) as session: + rows = list_indexed_titles_for_provider(session, provider=site) + if rows: + # The indexed request path no longer needs a full alternatives dump. + # Keep a minimal compatibility shape for older helper call sites. 
+ return {row.slug: [row.title] for row in rows} now = time.time() site_cfg = _get_site_cfg(site) or CATALOG_SITE_CONFIGS.get("aniworld.to", {}) refresh_hours = float(site_cfg.get("titles_refresh_hours", 24.0)) @@ -692,6 +720,23 @@ def slug_from_query(q: str, site: Optional[str] = None) -> Optional[Tuple[str, s """ if not q: return None + readiness_error = get_catalog_readiness_error() + if readiness_error is None: + providers = [site] if site else list(CATALOG_SITES_LIST) + preferred = [provider for provider in providers if provider != "megakino"] + fallback = [provider for provider in providers if provider == "megakino"] + with Session(engine) as session: + for batch in (preferred, fallback): + if not batch: + continue + rows = search_indexed_provider_titles( + session, + query=q, + providers=batch, + limit=1, + ) + if rows: + return (rows[0].provider, rows[0].slug) def _search_sites(sites: List[str]) -> Optional[Tuple[str, str]]: """ From ff201822ad9be1e5cf72af8cb2c0e13bf733be5c Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Tue, 28 Apr 2026 20:33:18 +0200 Subject: [PATCH 03/45] test(api): cover indexed catalog request paths Update integration and migration tests for the catalog-indexed Torznab flow. Cover bootstrap blocking, indexed generic search, canonical tvsearch mapping, special mapping behavior, health progress reporting, and dynamic Alembic head verification. 
--- apps/api/tests/conftest.py | 5 + apps/api/tests/integration/api/test_health.py | 9 + .../tests/integration/api/torznab/test_api.py | 549 ++++++------------ .../api/torznab/test_indexed_catalog.py | 164 ++++++ .../api/torznab/test_specials_mapping.py | 400 +++++-------- apps/api/tests/unit/db/test_migrations.py | 26 +- 6 files changed, 512 insertions(+), 641 deletions(-) create mode 100644 apps/api/tests/integration/api/torznab/test_indexed_catalog.py diff --git a/apps/api/tests/conftest.py b/apps/api/tests/conftest.py index 39235f52..0a03f74d 100644 --- a/apps/api/tests/conftest.py +++ b/apps/api/tests/conftest.py @@ -91,6 +91,11 @@ def client(tmp_path, monkeypatch): "app.config", "app.db", "app.db.models", + "app.catalog", + "app.catalog.exceptions", + "app.catalog.indexer", + "app.catalog.metadata", + "app.catalog.providers", "app.core.strm_proxy", "app.core.strm_proxy.auth", "app.core.strm_proxy.cache", diff --git a/apps/api/tests/integration/api/test_health.py b/apps/api/tests/integration/api/test_health.py index b8a9dd0f..ed38224f 100644 --- a/apps/api/tests/integration/api/test_health.py +++ b/apps/api/tests/integration/api/test_health.py @@ -2,3 +2,12 @@ def test_health_endpoint(client): r = client.get("/health") assert r.status_code == 200 assert r.json().get("status") == "ok" + assert "catalog" in r.json() + + +def test_catalog_health_endpoint(client): + r = client.get("/health/catalog") + assert r.status_code == 200 + payload = r.json() + assert "bootstrap_ready" in payload + assert "providers" in payload diff --git a/apps/api/tests/integration/api/torznab/test_api.py b/apps/api/tests/integration/api/torznab/test_api.py index 98eb7f24..f8224ec4 100644 --- a/apps/api/tests/integration/api/torznab/test_api.py +++ b/apps/api/tests/integration/api/torznab/test_api.py @@ -1,5 +1,134 @@ +from __future__ import annotations + import xml.etree.ElementTree as ET +from sqlmodel import Session + + +def _seed_ready_tv_catalog( + *, + canonical_title: str, + 
query_aliases: list[str], + provider_title: str | None = None, + slug: str = "slug", + tvdb_id: int = 12345, + episode_mappings: list[tuple[int, int, int, int]] | None = None, +) -> None: + from app.db import ( + engine, + replace_canonical_episodes, + replace_provider_catalog_aliases, + replace_provider_catalog_episodes, + replace_provider_catalog_title, + replace_provider_episode_mappings, + replace_provider_series_mappings, + upsert_canonical_series, + upsert_provider_index_status, + ) + + generation = f"gen-{slug}" + mapped_episodes = episode_mappings or [(1, 1, 1, 1)] + provider_title = provider_title or canonical_title + + with Session(engine) as session: + for provider in ("aniworld.to", "s.to", "megakino"): + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=24.0, + status="ready", + current_generation=generation, + latest_success_generation=generation, + bootstrap_completed=True, + ) + replace_provider_catalog_title( + session, + provider="aniworld.to", + slug=slug, + title=provider_title, + media_type_hint="series", + relative_path=f"/anime/stream/{slug}", + indexed_generation=generation, + ) + replace_provider_catalog_aliases( + session, + provider="aniworld.to", + slug=slug, + aliases=[provider_title, *query_aliases], + indexed_generation=generation, + ) + replace_provider_catalog_episodes( + session, + provider="aniworld.to", + slug=slug, + indexed_generation=generation, + episodes=[ + { + "season": provider_season, + "episode": provider_episode, + "relative_path": f"/anime/stream/{slug}/staffel-{provider_season}/episode-{provider_episode}", + "title_primary": f"Episode {canonical_episode}", + "title_secondary": None, + "media_type_hint": "episode", + "languages": [ + {"language": "German Sub", "host_hints": ["VOE"]}, + ], + } + for provider_season, provider_episode, _canonical_season, canonical_episode in mapped_episodes + ], + ) + upsert_canonical_series( + session, + tvdb_id=tvdb_id, + title=canonical_title, + 
imdb_id=f"tt{tvdb_id:07d}", + aliases=query_aliases, + ) + replace_canonical_episodes( + session, + tvdb_id=tvdb_id, + episodes=[ + { + "season": canonical_season, + "episode": canonical_episode, + "title": f"Episode {canonical_episode}", + } + for _provider_season, _provider_episode, canonical_season, canonical_episode in mapped_episodes + ], + ) + replace_provider_series_mappings( + session, + provider="aniworld.to", + slug=slug, + mappings=[ + { + "tvdb_id": tvdb_id, + "confidence": "confirmed", + "source": "title", + "rationale": "test", + } + ], + ) + replace_provider_episode_mappings( + session, + provider="aniworld.to", + slug=slug, + mappings=[ + { + "provider_season": provider_season, + "provider_episode": provider_episode, + "tvdb_id": tvdb_id, + "canonical_season": canonical_season, + "canonical_episode": canonical_episode, + "confidence": "confirmed", + "source": "direct_numbering", + "rationale": "test", + } + for provider_season, provider_episode, canonical_season, canonical_episode in mapped_episodes + ], + ) + session.commit() + def test_caps(client): resp = client.get("/torznab/api", params={"t": "caps"}) @@ -17,43 +146,11 @@ def test_search(client): def test_tvsearch_happy_path(client, monkeypatch): import app.api.torznab as tn - class Rec: - available = True - is_fresh = True - height = 1080 - vcodec = "h264" - provider = "prov" - - # Return (site, slug) tuple for new multi-site API - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "slug") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) + _seed_ready_tv_catalog(canonical_title="Series", query_aliases=["foo"]) monkeypatch.setattr( tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": Rec(), - ) - monkeypatch.setattr( - 
tn, - "build_release_name", - lambda series_title, season, episode, height, vcodec, language, site="aniworld.to": ( - "Title" - ), - ) - monkeypatch.setattr( - tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - "magnet:?xt=urn:btih:test&dn=Title&aw_slug=slug&aw_s=1&aw_e=1&aw_lang=German+Sub&aw_site=aniworld.to" - ), + "probe_episode_quality", + lambda **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected live probe")), ) resp = client.get( @@ -86,64 +183,22 @@ def test_tvsearch_uses_id_resolved_query_when_q_missing(client, monkeypatch): import app.api.torznab as tn import app.api.torznab.api as torznab_api_mod - class Rec: - available = True - is_fresh = True - height = 1080 - vcodec = "h264" - provider = "prov" - - seen = {"query": None} - - def _slug_from_query(query, site=None): - """ - Record the provided query in the shared `seen` mapping and return a fixed (site, slug) pair. - - Parameters: - query (str): The query string to record. - site (str | None): Optional site hint (unused by this stub). - - Returns: - tuple: A two-element tuple (site, slug) where `site` is `"aniworld.to"` and `slug` is `"slug"`. - - Side effects: - Mutates the `seen` mapping by setting `seen["query"] = query`. 
- """ - seen["query"] = query - return ("aniworld.to", "slug") - + _seed_ready_tv_catalog( + canonical_title="The Rookie", + query_aliases=[], + provider_title="The Rookie", + slug="the-rookie", + tvdb_id=350665, + ) monkeypatch.setattr( torznab_api_mod, "_resolve_tvsearch_query_from_ids", lambda **_kwargs: "The Rookie", ) - monkeypatch.setattr(tn, "_slug_from_query", _slug_from_query) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) monkeypatch.setattr( tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": Rec(), - ) - monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, season, episode, height, vcodec, language, site="aniworld.to": ( - "Title" - ), - ) - monkeypatch.setattr( - tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - "magnet:?xt=urn:btih:test&dn=Title&aw_slug=slug&aw_s=1&aw_e=1&aw_lang=German+Sub&aw_site=aniworld.to" - ), + "probe_episode_quality", + lambda **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected live probe")), ) resp = client.get( @@ -153,60 +208,20 @@ def _slug_from_query(query, site=None): assert resp.status_code == 200 root = ET.fromstring(resp.text) assert root.find("./channel/item") is not None - assert seen["query"] == "The Rookie" def test_tvsearch_season_search_emits_multiple_episodes(client, monkeypatch) -> None: - """Emit one item per discovered season episode in season-search mode.""" import app.api.torznab as tn - import app.api.torznab.api as torznab_api_mod - - class Rec: - available = True - is_fresh = True - height = 1080 - vcodec = "h264" - provider = "prov" - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "slug") - ) - monkeypatch.setattr( - tn, 
"resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr( - torznab_api_mod, "_metadata_episode_numbers_for_season", lambda **_kwargs: [] - ) - monkeypatch.setattr(torznab_api_mod, "STRM_FILES_MODE", "no") - monkeypatch.setattr( - tn, - "list_cached_episode_numbers_for_season", - lambda session, slug, season, site="aniworld.to": [1, 2, 3], - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": Rec(), - ) - monkeypatch.setattr(tn, "upsert_availability", lambda *args, **kwargs: None) - monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, season, episode, height, vcodec, language, site="aniworld.to": ( - f"Title S{int(season):02d}E{int(episode):02d}" - ), + _seed_ready_tv_catalog( + canonical_title="Series", + query_aliases=["foo"], + episode_mappings=[(1, 1, 1, 1), (1, 2, 1, 2), (1, 3, 1, 3)], ) monkeypatch.setattr( tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - f"magnet:?xt=urn:btih:test&dn=Title&aw_slug={slug}&aw_s={season}&aw_e={episode}&aw_lang=German+Sub&aw_site={site}" - ), + "probe_episode_quality", + lambda **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected live probe")), ) resp = client.get( @@ -216,7 +231,7 @@ class Rec: assert resp.status_code == 200 root = ET.fromstring(resp.text) items = root.findall("./channel/item") - assert len(items) == 3 + assert len(items) == 6 urls = [ ( item.find("enclosure").get("url") @@ -233,95 +248,17 @@ class Rec: def test_tvsearch_season_search_fallback_stops_on_consecutive_misses( client, monkeypatch ) -> None: - """Stop strict fallback probing after configured consecutive misses.""" import app.api.torznab as tn - import app.api.torznab.api as torznab_api_mod - - class Rec: - 
def __init__(self, height=1080, vcodec="h264", provider="VOE"): - self.available = True - self.is_fresh = True - self.height = height - self.vcodec = vcodec - self.provider = provider - - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "slug") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr( - torznab_api_mod, "_metadata_episode_numbers_for_season", lambda **_kwargs: [] - ) - monkeypatch.setattr(torznab_api_mod, "TORZNAB_SEASON_SEARCH_MODE", "strict") - monkeypatch.setattr(torznab_api_mod, "STRM_FILES_MODE", "no") - monkeypatch.setattr( - tn, - "list_cached_episode_numbers_for_season", - lambda session, slug, season, site="aniworld.to": [], - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - cached: dict[tuple[int, int, str], Rec] = {} - - def _get_availability(session, slug, season, episode, language, site="aniworld.to"): - _ = (session, slug, site) - return cached.get((season, episode, language)) - - monkeypatch.setattr(tn, "get_availability", _get_availability) - - probe_calls: list[int] = [] - - def _probe_quality(slug, season, episode, language, site="aniworld.to", **_kwargs): - _ = (slug, language, site) - probe_calls.append(episode) - if episode in (1, 2): - return (True, 1080, "h264", "VOE", {}) - return (False, None, None, None, None) - - monkeypatch.setattr(tn, "probe_episode_quality", _probe_quality) - - def _upsert_availability( - session, - slug, - season, - episode, - language, - available, - height=None, - vcodec=None, - provider=None, - extra=None, - site="aniworld.to", - ): - _ = (session, slug, extra, site) - if available: - cached[(season, episode, language)] = Rec(height, vcodec, provider) - - monkeypatch.setattr(tn, "upsert_availability", _upsert_availability) - monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, 
season, episode, height, vcodec, language, site="aniworld.to": ( - f"Title S{int(season):02d}E{int(episode):02d}" - ), + _seed_ready_tv_catalog( + canonical_title="Series", + query_aliases=["foo"], + episode_mappings=[(1, 1, 1, 1), (1, 2, 1, 2)], ) monkeypatch.setattr( tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - f"magnet:?xt=urn:btih:test&dn=Title&aw_slug={slug}&aw_s={season}&aw_e={episode}&aw_lang=German+Sub&aw_site={site}" - ), - ) - - monkeypatch.setattr(torznab_api_mod, "TORZNAB_SEASON_SEARCH_MAX_EPISODES", 10) - monkeypatch.setattr( - torznab_api_mod, "TORZNAB_SEASON_SEARCH_MAX_CONSECUTIVE_MISSES", 2 + "probe_episode_quality", + lambda **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected live probe")), ) resp = client.get( @@ -331,63 +268,21 @@ def _upsert_availability( assert resp.status_code == 200 root = ET.fromstring(resp.text) items = root.findall("./channel/item") - assert len(items) == 2 - assert probe_calls == [1, 2, 3, 4] + assert len(items) == 4 def test_tvsearch_ep_zero_is_treated_as_season_search(client, monkeypatch) -> None: - """Treat ep=0 as season-search and emit multiple season episode items.""" import app.api.torznab as tn - import app.api.torznab.api as torznab_api_mod - class Rec: - available = True - is_fresh = True - height = 1080 - vcodec = "h264" - provider = "prov" - - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "slug") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr(torznab_api_mod, "TORZNAB_SEASON_SEARCH_MODE", "fast") - monkeypatch.setattr(torznab_api_mod, "STRM_FILES_MODE", "no") - monkeypatch.setattr( - torznab_api_mod, - "_metadata_episode_numbers_for_season", - lambda **_kwargs: [1, 2], + _seed_ready_tv_catalog( + canonical_title="Series", + query_aliases=["foo"], + episode_mappings=[(1, 1, 1, 1), (1, 2, 1, 2)], ) monkeypatch.setattr( 
tn, - "list_cached_episode_numbers_for_season", - lambda session, slug, season, site="aniworld.to": [], - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": Rec(), - ) - monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, season, episode, height, vcodec, language, site="aniworld.to": ( - f"Title S{int(season):02d}E{int(episode):02d}" - ), - ) - monkeypatch.setattr( - tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - f"magnet:?xt=urn:btih:test&dn=Title&aw_slug={slug}&aw_s={season}&aw_e={episode}&aw_lang=German+Sub&aw_site={site}" - ), + "probe_episode_quality", + lambda **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected live probe")), ) resp = client.get( @@ -397,7 +292,7 @@ class Rec: assert resp.status_code == 200 root = ET.fromstring(resp.text) items = root.findall("./channel/item") - assert len(items) == 2 + assert len(items) == 4 urls = [ ( item.find("enclosure").get("url") @@ -411,66 +306,17 @@ class Rec: def test_tvsearch_fast_season_mode_avoids_live_probe(client, monkeypatch) -> None: - """Avoid live quality probing when fast season-search mode is enabled.""" import app.api.torznab as tn - import app.api.torznab.api as torznab_api_mod - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "slug") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr(torznab_api_mod, "TORZNAB_SEASON_SEARCH_MODE", "fast") - monkeypatch.setattr(torznab_api_mod, "STRM_FILES_MODE", "no") - monkeypatch.setattr( - torznab_api_mod, - "_metadata_episode_numbers_for_season", - lambda **_kwargs: [1, 2], + _seed_ready_tv_catalog( + canonical_title="Series", + 
query_aliases=["foo"], + episode_mappings=[(1, 1, 1, 1), (1, 2, 1, 2)], ) monkeypatch.setattr( tn, - "list_cached_episode_numbers_for_season", - lambda session, slug, season, site="aniworld.to": [], - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": [], - ) - monkeypatch.setattr( - torznab_api_mod, - "_discover_episode_languages_for_fast_season_mode", - lambda **_kwargs: ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": None, - ) - - probe_calls: list[tuple[int, str]] = [] - - def _probe_quality(slug, season, episode, language, site="aniworld.to", **_kwargs): - _ = (slug, site) - probe_calls.append((episode, language)) - return (True, 1080, "h264", "VOE", {}) - - monkeypatch.setattr(tn, "probe_episode_quality", _probe_quality) - - monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, season, episode, height, vcodec, language, site="aniworld.to": ( - f"Title S{int(season):02d}E{int(episode):02d}" - ), - ) - monkeypatch.setattr( - tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - f"magnet:?xt=urn:btih:test&dn=Title&aw_slug={slug}&aw_s={season}&aw_e={episode}&aw_lang={language.replace(' ', '+')}&aw_site={site}" - ), + "probe_episode_quality", + lambda **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected live probe")), ) resp = client.get( @@ -480,62 +326,15 @@ def _probe_quality(slug, season, episode, language, site="aniworld.to", **_kwarg assert resp.status_code == 200 root = ET.fromstring(resp.text) items = root.findall("./channel/item") - assert len(items) == 2 - assert probe_calls == [] + assert len(items) == 4 -def test_tvsearch_season_search_limit_is_hard_item_cap(client, monkeypatch) -> None: - """Cap season-search output by item limit across episode/language variants.""" - import app.api.torznab as 
tn - import app.api.torznab.api as torznab_api_mod - - class Rec: - available = True - is_fresh = True - height = 1080 - vcodec = "h264" - provider = "prov" - - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "slug") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr( - torznab_api_mod, "_metadata_episode_numbers_for_season", lambda **_kwargs: [] - ) - monkeypatch.setattr( - tn, - "list_cached_episode_numbers_for_season", - lambda session, slug, season, site="aniworld.to": [1, 2], - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": Rec(), - ) - monkeypatch.setattr(tn, "upsert_availability", lambda *args, **kwargs: None) - monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, season, episode, height, vcodec, language, site="aniworld.to": ( - f"Title S{int(season):02d}E{int(episode):02d}" - ), - ) - monkeypatch.setattr( - tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - f"magnet:?xt=urn:btih:test&dn=Title&aw_slug={slug}&aw_s={season}&aw_e={episode}&aw_lang=German+Sub&aw_site={site}" - ), +def test_tvsearch_season_search_limit_is_hard_item_cap(client) -> None: + _seed_ready_tv_catalog( + canonical_title="Series", + query_aliases=["foo"], + episode_mappings=[(1, 1, 1, 1), (1, 2, 1, 2)], ) - monkeypatch.setattr(torznab_api_mod, "STRM_FILES_MODE", "both") resp = client.get( "/torznab/api", diff --git a/apps/api/tests/integration/api/torznab/test_indexed_catalog.py b/apps/api/tests/integration/api/torznab/test_indexed_catalog.py new file mode 100644 index 00000000..524c49dd --- /dev/null +++ b/apps/api/tests/integration/api/torznab/test_indexed_catalog.py @@ 
-0,0 +1,164 @@ +from __future__ import annotations + +from sqlmodel import Session + + +def _seed_ready_catalog() -> None: + from app.db import ( + engine, + replace_canonical_episodes, + replace_provider_catalog_aliases, + replace_provider_catalog_episodes, + replace_provider_catalog_title, + replace_provider_episode_mappings, + replace_provider_series_mappings, + upsert_canonical_series, + upsert_provider_index_status, + ) + + generation = "gen-1" + with Session(engine) as session: + for provider in ("aniworld.to", "s.to", "megakino"): + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=24.0, + status="ready", + current_generation=generation, + latest_success_generation=generation, + bootstrap_completed=True, + ) + replace_provider_catalog_title( + session, + provider="aniworld.to", + slug="kaguya-sama", + title="Kaguya-sama", + media_type_hint="series", + relative_path="/anime/stream/kaguya-sama", + indexed_generation=generation, + ) + replace_provider_catalog_aliases( + session, + provider="aniworld.to", + slug="kaguya-sama", + aliases=["Kaguya-sama", "Kaguya"], + indexed_generation=generation, + ) + replace_provider_catalog_episodes( + session, + provider="aniworld.to", + slug="kaguya-sama", + indexed_generation=generation, + episodes=[ + { + "season": 1, + "episode": 1, + "relative_path": "/anime/stream/kaguya-sama/staffel-1/episode-1", + "title_primary": "I Want To Be Invited To A Movie", + "title_secondary": None, + "media_type_hint": "episode", + "languages": [ + {"language": "German Sub", "host_hints": ["VOE"]}, + ], + } + ], + ) + upsert_canonical_series( + session, + tvdb_id=12345, + title="Kaguya-sama", + imdb_id="tt0000001", + aliases=["Kaguya"], + ) + replace_canonical_episodes( + session, + tvdb_id=12345, + episodes=[ + {"season": 1, "episode": 1, "title": "I Want To Be Invited To A Movie"} + ], + ) + replace_provider_series_mappings( + session, + provider="aniworld.to", + slug="kaguya-sama", + mappings=[ + { + 
"tvdb_id": 12345, + "confidence": "confirmed", + "source": "title", + "rationale": "test", + } + ], + ) + replace_provider_episode_mappings( + session, + provider="aniworld.to", + slug="kaguya-sama", + mappings=[ + { + "provider_season": 1, + "provider_episode": 1, + "tvdb_id": 12345, + "canonical_season": 1, + "canonical_episode": 1, + "confidence": "confirmed", + "source": "direct_numbering", + "rationale": "test", + } + ], + ) + session.commit() + + +def test_search_returns_503_when_catalog_bootstrap_pending(client) -> None: + from app.db import engine, upsert_provider_index_status + + with Session(engine) as session: + upsert_provider_index_status( + session, + provider="aniworld.to", + refresh_interval_hours=24.0, + status="pending", + bootstrap_completed=False, + ) + + response = client.get("/torznab/api", params={"t": "search", "q": "Kaguya"}) + + assert response.status_code == 503 + assert "bootstrap" in response.json()["detail"].lower() + + +def test_search_uses_indexed_catalog_without_live_probe(client, monkeypatch) -> None: + _seed_ready_catalog() + monkeypatch.setattr( + "app.utils.title_resolver.load_or_refresh_index", + lambda site="aniworld.to": (_ for _ in ()).throw( + AssertionError("unexpected live index refresh") + ), + ) + + response = client.get("/torznab/api", params={"t": "search", "q": "Kaguya"}) + + assert response.status_code == 200 + assert "Kaguya.sama.S01E01" in response.text + assert "aw_slug=kaguya-sama" in response.text + + +def test_tvsearch_uses_indexed_canonical_mapping(client, monkeypatch) -> None: + _seed_ready_catalog() + monkeypatch.setattr( + "app.utils.title_resolver.load_or_refresh_index", + lambda site="aniworld.to": (_ for _ in ()).throw( + AssertionError("unexpected live index refresh") + ), + ) + + response = client.get( + "/torznab/api", + params={"t": "tvsearch", "q": "Kaguya", "season": 1, "ep": 1}, + ) + + assert response.status_code == 200 + assert "Kaguya.sama.S01E01" in response.text + assert "aw_s=1" in 
response.text + assert "aw_e=1" in response.text diff --git a/apps/api/tests/integration/api/torznab/test_specials_mapping.py b/apps/api/tests/integration/api/torznab/test_specials_mapping.py index 6670452f..f6a07546 100644 --- a/apps/api/tests/integration/api/torznab/test_specials_mapping.py +++ b/apps/api/tests/integration/api/torznab/test_specials_mapping.py @@ -1,112 +1,124 @@ -import xml.etree.ElementTree as ET - - -def _fake_release_name( - series_title: str, - season: int, - episode: int, - height: int, - vcodec: str, - language: str, - site: str = "aniworld.to", -) -> str: - """ - Constructs a fake release name for a series episode using the provided season and episode numbers. - - Parameters: - series_title (str): Ignored; present for signature compatibility. - season (int): Season number; used and zero-padded to two digits in the result. - episode (int): Episode number; used and zero-padded to two digits in the result. - height (int): Ignored; present for signature compatibility. - vcodec (str): Ignored; present for signature compatibility. - language (str): Ignored; present for signature compatibility. - site (str): Ignored; present for signature compatibility. - - Returns: - str: A release-name string in the form "Kaguya.SXXEYY.1080p.WEB.H264.GER.SUB-ANIWORLD" - where XX is the zero-padded season and YY is the zero-padded episode. - """ - _ = (series_title, height, vcodec, language, site) - return ( - f"Kaguya.S{int(season):02d}E{int(episode):02d}.1080p.WEB.H264.GER.SUB-ANIWORLD" - ) - - -def _fake_magnet( - title: str, - slug: str, - season: int, - episode: int, - language: str, - provider: str | None, - site: str = "aniworld.to", - **_kwargs, -) -> str: - """ - Builds a fake magnet URI that encodes the provided slug, season, episode, and site as aw_* query parameters. - - Parameters: - slug (str): Series slug to include as the `aw_slug` query parameter. - season (int): Season index to include as the `aw_s` query parameter. 
- episode (int): Episode index to include as the `aw_e` query parameter. - site (str): Site identifier to include as the `aw_site` query parameter (default "aniworld.to"). - - Returns: - str: A magnet URI string containing `aw_slug`, `aw_s`, `aw_e`, and `aw_site` set to the provided values. - """ - _ = (title, language, provider) - return ( - f"magnet:?xt=urn:btih:test&dn=Kaguya" - f"&aw_slug={slug}&aw_s={season}&aw_e={episode}&aw_site={site}" - ) - - -def test_search_uses_special_mapping_alias_in_title(client, monkeypatch): - """ - Verify that a torznab search result applies a special episode mapping alias to the item title and enclosure parameters. +from __future__ import annotations - Performs a search request and asserts a 200 response, that an item exists whose title contains "S00E05", and that the enclosure URL includes "aw_s=0" and "aw_e=4". - """ - import app.api.torznab as tn - import app.api.torznab.api as torznab_api - from app.providers.aniworld.specials import SpecialEpisodeMapping +import xml.etree.ElementTree as ET - monkeypatch.setattr(torznab_api, "ANIBRIDGE_TEST_MODE", False) - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "kaguya") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Kaguya-sama" - ) - monkeypatch.setattr( - torznab_api, - "resolve_special_mapping_from_query", - lambda **_kwargs: SpecialEpisodeMapping( - source_season=0, - source_episode=4, - alias_season=0, - alias_episode=5, - metadata_title="special title", - metadata_tvdb_id=12345, - ), - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "probe_episode_quality", - lambda **_kwargs: (True, 1080, "h264", "VOE", {}), - ) - monkeypatch.setattr(tn, "upsert_availability", lambda *args, **kwargs: None) - monkeypatch.setattr(tn, "build_release_name", _fake_release_name) - 
monkeypatch.setattr(tn, "build_magnet", _fake_magnet) +from sqlmodel import Session + + +def _seed_special_mapping_catalog(*, languages: list[str]) -> None: + from app.db import ( + engine, + replace_canonical_episodes, + replace_provider_catalog_aliases, + replace_provider_catalog_episodes, + replace_provider_catalog_title, + replace_provider_episode_mappings, + replace_provider_series_mappings, + upsert_canonical_series, + upsert_provider_index_status, + ) + + generation = "gen-special" + with Session(engine) as session: + for provider in ("aniworld.to", "s.to", "megakino"): + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=24.0, + status="ready", + current_generation=generation, + latest_success_generation=generation, + bootstrap_completed=True, + ) + replace_provider_catalog_title( + session, + provider="aniworld.to", + slug="kaguya", + title="Kaguya-sama", + media_type_hint="series", + relative_path="/anime/stream/kaguya", + indexed_generation=generation, + ) + replace_provider_catalog_aliases( + session, + provider="aniworld.to", + slug="kaguya", + aliases=["Kaguya-sama", "Kaguya"], + indexed_generation=generation, + ) + replace_provider_catalog_episodes( + session, + provider="aniworld.to", + slug="kaguya", + indexed_generation=generation, + episodes=[ + { + "season": 0, + "episode": 4, + "relative_path": "/anime/stream/kaguya/filme/film-4", + "title_primary": "special title", + "title_secondary": None, + "media_type_hint": "episode", + "languages": [ + {"language": language, "host_hints": ["VOE"]} + for language in languages + ], + } + ], + ) + upsert_canonical_series( + session, + tvdb_id=12345, + title="Kaguya-sama", + imdb_id="tt0000001", + aliases=["Kaguya"], + ) + replace_canonical_episodes( + session, + tvdb_id=12345, + episodes=[ + {"season": 0, "episode": 5, "title": "special title"}, + ], + ) + replace_provider_series_mappings( + session, + provider="aniworld.to", + slug="kaguya", + mappings=[ + { + "tvdb_id": 
12345, + "confidence": "confirmed", + "source": "title", + "rationale": "test", + } + ], + ) + replace_provider_episode_mappings( + session, + provider="aniworld.to", + slug="kaguya", + mappings=[ + { + "provider_season": 0, + "provider_episode": 4, + "tvdb_id": 12345, + "canonical_season": 0, + "canonical_episode": 5, + "confidence": "confirmed", + "source": "special_alias", + "rationale": "test", + } + ], + ) + session.commit() + + +def test_search_uses_special_mapping_alias_in_title(client): + _seed_special_mapping_catalog(languages=["German Sub"]) resp = client.get( "/torznab/api", - params={"t": "search", "q": "Kaguya special title", "cat": "5070"}, + params={"t": "search", "q": "Kaguya", "cat": "5070"}, ) assert resp.status_code == 200 root = ET.fromstring(resp.text) @@ -123,73 +135,8 @@ def test_search_uses_special_mapping_alias_in_title(client, monkeypatch): def test_tvsearch_falls_back_to_special_mapping_when_requested_episode_missing( client, - monkeypatch, -): - """ - Verifies that a tvsearch request falls back to a SpecialEpisodeMapping when the requested episode is unavailable and that the resulting item reflects the alias episode and mapped source. - - Asserts the response contains an item whose title includes "S00E05" (the alias episode) and whose enclosure URL includes the mapped source parameters `aw_s=0` and `aw_e=4`. 
- """ - import app.api.torznab as tn - import app.api.torznab.api as torznab_api - from app.providers.aniworld.specials import SpecialEpisodeMapping - - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "kaguya") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Kaguya-sama" - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": None, - ) - - def _probe_quality(slug, season, episode, language, site="aniworld.to", **_kwargs): - """ - Determine if the specified episode is available as a 1080p H.264 release and provide its metadata. - - Parameters: - slug (str): Series identifier (ignored by this probe). - season (int): Season number to probe. - episode (int): Episode number to probe. - language (str): Language tag (ignored by this probe). - site (str): Site identifier (ignored by this probe). - - Returns: - available (bool): `True` if the probe found a matching 1080p H.264 release, `False` otherwise. - height (int or None): Video height in pixels when available (1080), otherwise `None`. - vcodec (str or None): Video codec string when available ("h264"), otherwise `None`. - release_language (str or None): Release language tag when available ("VOE"), otherwise `None`. - extra (dict or None): Additional metadata (empty dict when available), otherwise `None`. 
- """ - _ = (slug, language, site) - if season == 0 and episode == 4: - return (True, 1080, "h264", "VOE", {}) - return (False, None, None, None, None) - - monkeypatch.setattr(tn, "probe_episode_quality", _probe_quality) - monkeypatch.setattr(tn, "upsert_availability", lambda *args, **kwargs: None) - monkeypatch.setattr(tn, "build_release_name", _fake_release_name) - monkeypatch.setattr(tn, "build_magnet", _fake_magnet) - monkeypatch.setattr( - torznab_api, - "resolve_special_mapping_from_episode_request", - lambda **_kwargs: SpecialEpisodeMapping( - source_season=0, - source_episode=4, - alias_season=0, - alias_episode=5, - metadata_title="special title", - metadata_tvdb_id=12345, - ), - ) +) -> None: + _seed_special_mapping_catalog(languages=["German Sub"]) resp = client.get( "/torznab/api", @@ -208,106 +155,35 @@ def _probe_quality(slug, season, episode, language, site="aniworld.to", **_kwarg assert "aw_e=4" in url -def test_tvsearch_reuses_resolved_special_mapping_across_languages( - client, - monkeypatch, -): - """Verify a resolved special mapping is reused for subsequent languages.""" - import app.api.torznab as tn - import app.api.torznab.api as torznab_api - from app.providers.aniworld.specials import SpecialEpisodeMapping - - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "kaguya") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Kaguya-sama" - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": [ - "German Sub", - "English Sub", - ], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": None, - ) - monkeypatch.setattr(tn, "upsert_availability", lambda *args, **kwargs: None) - monkeypatch.setattr(tn, "build_release_name", _fake_release_name) - monkeypatch.setattr(tn, "build_magnet", _fake_magnet) - monkeypatch.setattr( - 
torznab_api, - "resolve_special_mapping_from_episode_request", - lambda **_kwargs: SpecialEpisodeMapping( - source_season=0, - source_episode=4, - alias_season=0, - alias_episode=5, - metadata_title="special title", - metadata_tvdb_id=12345, - ), - ) - - probe_calls: list[tuple[int, int, str]] = [] - - def _probe_quality(slug, season, episode, language, site="aniworld.to", **_kwargs): - _ = (slug, site) - probe_calls.append((season, episode, language)) - if season == 0 and episode == 4: - return (True, 1080, "h264", "VOE", {}) - return (False, None, None, None, None) - - monkeypatch.setattr(tn, "probe_episode_quality", _probe_quality) +def test_tvsearch_reuses_resolved_special_mapping_across_languages(client) -> None: + _seed_special_mapping_catalog(languages=["German Sub", "English Sub"]) resp = client.get( "/torznab/api", params={"t": "tvsearch", "q": "Kaguya", "season": 0, "ep": 5, "cat": "5070"}, ) assert resp.status_code == 200 - requested_calls = [c for c in probe_calls if c[0] == 0 and c[1] == 5] - assert len(requested_calls) == 1 - mapped_calls = [c for c in probe_calls if c[0] == 0 and c[1] == 4] - assert len(mapped_calls) == 2 - - -def test_tvsearch_guid_alias_suffix_only_when_alias_differs( - client, - monkeypatch, -): - import app.api.torznab as tn - - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "kaguya") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Kaguya-sama" - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": None, - ) - monkeypatch.setattr( - tn, - "probe_episode_quality", - lambda **_kwargs: (True, 1080, "h264", "VOE", {}), - ) - monkeypatch.setattr(tn, "upsert_availability", lambda *args, **kwargs: None) - monkeypatch.setattr(tn, 
"build_release_name", _fake_release_name) - monkeypatch.setattr(tn, "build_magnet", _fake_magnet) + root = ET.fromstring(resp.text) + items = root.findall("./channel/item") + assert len(items) == 4 + urls = [ + ( + item.find("enclosure").get("url") + if item.find("enclosure") is not None + else "" + ) + for item in items + ] + assert sum("aw_lang=German+Sub" in url for url in urls) == 2 + assert sum("aw_lang=English+Sub" in url for url in urls) == 2 + + +def test_tvsearch_guid_alias_suffix_only_when_alias_differs(client) -> None: + _seed_special_mapping_catalog(languages=["German Sub"]) resp = client.get( "/torznab/api", - params={"t": "tvsearch", "q": "Kaguya", "season": 1, "ep": 1, "cat": "5070"}, + params={"t": "tvsearch", "q": "Kaguya", "season": 0, "ep": 5, "cat": "5070"}, ) assert resp.status_code == 200 root = ET.fromstring(resp.text) diff --git a/apps/api/tests/unit/db/test_migrations.py b/apps/api/tests/unit/db/test_migrations.py index c6a67ae3..b6f2afb2 100644 --- a/apps/api/tests/unit/db/test_migrations.py +++ b/apps/api/tests/unit/db/test_migrations.py @@ -1,10 +1,28 @@ from __future__ import annotations from pathlib import Path +from alembic.config import Config +from alembic.script import ScriptDirectory from sqlalchemy import inspect -HEAD_REVISION = "20260204_0003" +def _head_revision() -> str: + config = Config( + str( + Path(__file__) + .resolve() + .parents[3] + / "app" + / "db" + / "migrations" + / "alembic.ini" + ) + ) + config.set_main_option( + "script_location", + str(Path(__file__).resolve().parents[3] / "app" / "db" / "migrations"), + ) + return ScriptDirectory.from_config(config).get_current_head() def _load_db(tmp_path: Path, monkeypatch): @@ -62,7 +80,7 @@ def test_apply_migrations_fresh_db(tmp_path, monkeypatch): assert "clienttask" in tables assert "strmurlmapping" in tables assert "alembic_version" in tables - assert _get_version(models) == HEAD_REVISION + assert _get_version(models) == _head_revision() def 
test_apply_migrations_legacy_db(tmp_path, monkeypatch): @@ -74,7 +92,7 @@ def test_apply_migrations_legacy_db(tmp_path, monkeypatch): inspector = inspect(models.engine) tables = set(inspector.get_table_names()) assert "alembic_version" in tables - assert _get_version(models) == HEAD_REVISION + assert _get_version(models) == _head_revision() def test_apply_migrations_empty_version_table(tmp_path, monkeypatch): @@ -97,4 +115,4 @@ def test_apply_migrations_empty_version_table(tmp_path, monkeypatch): assert "job" in tables assert "strmurlmapping" in tables assert "alembic_version" in tables - assert _get_version(models) == HEAD_REVISION + assert _get_version(models) == _head_revision() From dc63ea2434815c7d3ab04034577390712521b315 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Tue, 28 Apr 2026 20:59:27 +0200 Subject: [PATCH 04/45] fix(api): honor mounted data dir for terminal logs Use DATA_DIR as the default base for the generated terminal log path instead of the container working directory. This keeps development Compose logs on the mounted /data volume rather than writing them into /app/data inside the container. 
--- apps/api/app/config.py | 17 +++++++++++++++++ apps/api/app/utils/logger.py | 9 ++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/apps/api/app/config.py b/apps/api/app/config.py index 932d03cc..30970089 100644 --- a/apps/api/app/config.py +++ b/apps/api/app/config.py @@ -188,6 +188,23 @@ def _ensure_dir(candidates: list[Path], label: str) -> Path: DOWNLOAD_DIR = _ensure_dir(download_candidates, "DOWNLOAD_DIR") DATA_DIR = _ensure_dir(data_candidates, "DATA_DIR") + +def _ensure_runtime_home() -> Path: + runtime_home = DATA_DIR / "home" + runtime_home.mkdir(parents=True, exist_ok=True) + current_home = os.environ.get("HOME", "").strip() + if not current_home or current_home == "/nonexistent": + os.environ["HOME"] = str(runtime_home) + os.environ.setdefault("XDG_CONFIG_HOME", str(runtime_home / ".config")) + os.environ.setdefault("XDG_CACHE_HOME", str(runtime_home / ".cache")) + Path(os.environ["XDG_CONFIG_HOME"]).mkdir(parents=True, exist_ok=True) + Path(os.environ["XDG_CACHE_HOME"]).mkdir(parents=True, exist_ok=True) + logger.debug("RUNTIME_HOME using: {}", os.environ["HOME"]) + return runtime_home + + +RUNTIME_HOME = _ensure_runtime_home() + # Optional override: path reported to clients (e.g. Sonarr) as qBittorrent save path. # Useful when AniBridge runs on host but Sonarr runs in a container with a different mount point. 
# Normalize to absolute for reporting if it points into container diff --git a/apps/api/app/utils/logger.py b/apps/api/app/utils/logger.py index b1687ddc..ceeee3ec 100644 --- a/apps/api/app/utils/logger.py +++ b/apps/api/app/utils/logger.py @@ -95,7 +95,14 @@ def ensure_log_path(base_dir: Optional[Path] = None) -> Path: if env: return Path(env) - base = Path.cwd() / "data" if base_dir is None else base_dir + if base_dir is None: + data_dir_env = os.environ.get("DATA_DIR", "").strip() + if data_dir_env: + base = Path(data_dir_env).expanduser() + else: + base = Path.cwd() / "data" + else: + base = base_dir ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") run_id = uuid.uuid4().hex[:8] log_path = base / f"terminal-{ts}-{run_id}.log" From ee20930b04425f27f17fe57ab505cc0250cfa209 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Tue, 28 Apr 2026 20:59:38 +0200 Subject: [PATCH 05/45] test(api): cover container runtime home and log path defaults Add regression tests for the Docker runtime edge cases that broke the provider catalog bootstrap. Verify that HOME falls back to a writable directory under DATA_DIR when the container uses /nonexistent, and that generated terminal logs prefer DATA_DIR by default. 
--- apps/api/tests/unit/app/test_config.py | 37 ++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/apps/api/tests/unit/app/test_config.py b/apps/api/tests/unit/app/test_config.py index e97b0624..07916cdc 100644 --- a/apps/api/tests/unit/app/test_config.py +++ b/apps/api/tests/unit/app/test_config.py @@ -121,3 +121,40 @@ def test_provider_redirect_settings(monkeypatch): cfg = importlib.import_module("app.config") cfg = importlib.reload(cfg) assert cfg.DOWNLOAD_RATE_LIMIT_BYTES_PER_SEC == 0 + + +def test_runtime_home_defaults_to_data_dir_when_home_is_nonexistent( + monkeypatch, tmp_path +): + import importlib + import app + import sys + + monkeypatch.setenv("DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("DOWNLOAD_DIR", str(tmp_path / "downloads")) + monkeypatch.setenv("HOME", "/nonexistent") + + if "app.config" in sys.modules: + del sys.modules["app.config"] + if hasattr(app, "config"): + delattr(app, "config") + cfg = importlib.import_module("app.config") + cfg = importlib.reload(cfg) + + assert cfg.RUNTIME_HOME == (cfg.DATA_DIR / "home").resolve() + assert cfg.RUNTIME_HOME.exists() + assert cfg.RUNTIME_HOME == cfg.RUNTIME_HOME.resolve() + assert cfg.os.environ["HOME"] == str(cfg.RUNTIME_HOME) + + +def test_ensure_log_path_prefers_data_dir_env(monkeypatch, tmp_path): + from app.utils.logger import ensure_log_path + + monkeypatch.delenv("ANIBRIDGE_LOG_PATH", raising=False) + monkeypatch.setenv("DATA_DIR", str(tmp_path / "mounted-data")) + + log_path = ensure_log_path() + + assert log_path.parent == (tmp_path / "mounted-data").resolve() + assert log_path.parent.exists() + assert log_path.name.startswith("terminal-") From 285cfc57e61bbab5d2b0391956e88598b5e23105 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Tue, 28 Apr 2026 21:15:25 +0200 Subject: [PATCH 06/45] feat(docker): update port configuration for anibridge service set ANIBRIDGE_PORT environment variable for dynamic port assignment in docker-compose files. 
Adjusted healthcheck command to reflect the new port configuration. --- docker/compose.dev.yaml | 1 + docker/compose.yaml | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docker/compose.dev.yaml b/docker/compose.dev.yaml index 4e296fc4..1249c654 100644 --- a/docker/compose.dev.yaml +++ b/docker/compose.dev.yaml @@ -15,6 +15,7 @@ services: - ../.env environment: - ANIBRIDGE_RELOAD=true + - ANIBRIDGE_PORT=8000 - DATA_DIR=/data - DOWNLOAD_DIR=/downloads volumes: diff --git a/docker/compose.yaml b/docker/compose.yaml index 7ccb113e..36385688 100644 --- a/docker/compose.yaml +++ b/docker/compose.yaml @@ -4,7 +4,7 @@ services: image: ghcr.io/zzackllack/anibridge:latest container_name: anibridge ports: - - "8000:8000" + - "8000:${ANIBRIDGE_PORT:-8000}" environment: # Container runtime / permissions - PUID=${PUID:-1000} @@ -30,7 +30,10 @@ services: - ../data:/data healthcheck: test: - ["CMD", "curl", "--fail", "--silent", "http://localhost:8000/health"] + [ + "CMD-SHELL", + "curl --fail --silent \"http://localhost:${ANIBRIDGE_PORT:-8000}/health\" >/dev/null", + ] interval: 30s timeout: 5s retries: 3 From 1739d7ebbeadc261a6ac334a1244daced6ad1bd6 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Tue, 28 Apr 2026 21:15:56 +0200 Subject: [PATCH 07/45] feat(indexer): implement heartbeat mechanism for catalog crawling add a heartbeat mechanism to the provider catalog crawling process, allowing for better monitoring of long-running operations. This change enhances the reliability of the catalog discovery by logging progress at regular intervals. 
--- apps/api/app/catalog/indexer.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index 99287c85..e417d16c 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -1,5 +1,6 @@ from __future__ import annotations +from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError import threading from dataclasses import dataclass from datetime import timedelta @@ -42,6 +43,7 @@ _INDEXER: "ProviderCatalogIndexer | None" = None _INDEXER_LOCK = threading.Lock() _UNSET = object() +_DISCOVERY_HEARTBEAT_SECONDS = 15.0 @dataclass(slots=True) @@ -218,11 +220,13 @@ def get_progress_snapshot(self) -> dict[str, object]: } def _run_loop(self) -> None: - while not self._stop_event.wait(PROVIDER_INDEX_SCHEDULER_POLL_SECONDS): + while not self._stop_event.is_set(): try: self.run_due_once() except Exception as exc: logger.exception("Provider catalog scheduler loop failed: {}", exc) + if self._stop_event.wait(PROVIDER_INDEX_SCHEDULER_POLL_SECONDS): + break def _ensure_status_rows(self) -> None: with Session(engine) as session: @@ -283,7 +287,12 @@ def _refresh_provider(self, provider: str) -> None: ) try: - titles = crawl_provider_catalog(provider) + titles = self._crawl_provider_catalog_with_heartbeat(provider) + logger.info( + "Provider catalog {}: discovered {} titles", + provider, + len(titles), + ) reporter = ProgressReporter( label=f"Catalog {provider}", unit="title", @@ -472,6 +481,23 @@ def _refresh_provider(self, provider: str) -> None: current_slug="", ) + def _crawl_provider_catalog_with_heartbeat( + self, provider: str + ) -> list[object]: + elapsed_seconds = 0.0 + with ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(crawl_provider_catalog, provider) + while True: + try: + return future.result(timeout=_DISCOVERY_HEARTBEAT_SECONDS) + except FutureTimeoutError: + elapsed_seconds += 
_DISCOVERY_HEARTBEAT_SECONDS + logger.info( + "Provider catalog {}: still discovering titles after {}s", + provider, + int(elapsed_seconds), + ) + def _set_progress( self, provider: str, From 66aaa9f58c7bebbcbdc43fef4ab95636c1ca94f6 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Tue, 28 Apr 2026 21:16:02 +0200 Subject: [PATCH 08/45] test: add unit tests for catalog indexer functionality --- apps/api/tests/unit/catalog/test_indexer.py | 43 +++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 apps/api/tests/unit/catalog/test_indexer.py diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py new file mode 100644 index 00000000..899898b7 --- /dev/null +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -0,0 +1,43 @@ +import time + + +def test_catalog_scheduler_runs_immediately(monkeypatch): + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + calls: list[str] = [] + + def fake_run_due_once() -> None: + calls.append("called") + indexer._stop_event.set() + + monkeypatch.setattr(indexer, "run_due_once", fake_run_due_once) + + indexer._run_loop() + + assert calls == ["called"] + + +def test_catalog_discovery_logs_heartbeat(monkeypatch): + import app.catalog.indexer as indexer_module + from app.catalog.indexer import ProviderCatalogIndexer + + messages: list[str] = [] + + def fake_info(message: str, *args) -> None: + messages.append(message.format(*args)) + + def fake_crawl(_provider: str) -> list[object]: + time.sleep(0.03) + return [] + + monkeypatch.setattr(indexer_module, "_DISCOVERY_HEARTBEAT_SECONDS", 0.01) + monkeypatch.setattr(indexer_module, "crawl_provider_catalog", fake_crawl) + monkeypatch.setattr(indexer_module.logger, "info", fake_info) + + titles = ProviderCatalogIndexer()._crawl_provider_catalog_with_heartbeat( + "aniworld.to" + ) + + assert titles == [] + assert any("still discovering titles after" in message for message in messages) From 
ea8554733fc68bcf7ce48f978bdd754b5b76a5fb Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Tue, 28 Apr 2026 21:43:21 +0200 Subject: [PATCH 09/45] feat(indexer): enhance logging for provider catalog bootstrap and status add detailed logging for the provider catalog indexer's bootstrap state and status checks. This includes information on the starting of the scheduler, warnings for missing status rows, and debug logs for provider statuses during refresh cycles. --- apps/api/app/catalog/indexer.py | 120 ++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index e417d16c..36a18788 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -127,10 +127,17 @@ def __init__(self) -> None: def start(self) -> None: self._ensure_status_rows() + self._log_bootstrap_state() if ANIBRIDGE_TEST_MODE: return if self._thread is not None and self._thread.is_alive(): return + logger.info( + "Provider catalog scheduler starting: poll={}s global_concurrency={} providers={}", + PROVIDER_INDEX_SCHEDULER_POLL_SECONDS, + PROVIDER_INDEX_GLOBAL_CONCURRENCY, + ", ".join(CATALOG_SITES_LIST), + ) self._thread = threading.Thread( target=self._run_loop, name="provider-catalog-indexer", @@ -146,14 +153,55 @@ def stop(self) -> None: def run_due_once(self) -> None: with Session(engine) as session: statuses = list_provider_index_statuses(session) + if not statuses: + logger.warning("Provider catalog scheduler: no provider status rows found") + return + logger.debug( + "Provider catalog scheduler pass: bootstrap_ready={} providers={}", + self._is_bootstrap_ready(), + ", ".join( + f"{status.provider}={status.status}" + for status in sorted(statuses, key=lambda item: item.provider) + ), + ) for status in statuses: if self._is_due(status): + logger.info( + "Provider catalog scheduler: {} is due (status={} bootstrap_completed={} next_refresh_after={} latest_success_at={})", + 
status.provider, + status.status, + status.bootstrap_completed, + status.next_refresh_after.isoformat() + if status.next_refresh_after is not None + else None, + status.latest_success_at.isoformat() + if status.latest_success_at is not None + else None, + ) self.refresh_provider(status.provider) + else: + logger.debug( + "Provider catalog scheduler: {} not due (status={} bootstrap_completed={} next_refresh_after={} latest_success_at={})", + status.provider, + status.status, + status.bootstrap_completed, + status.next_refresh_after.isoformat() + if status.next_refresh_after is not None + else None, + status.latest_success_at.isoformat() + if status.latest_success_at is not None + else None, + ) def refresh_provider(self, provider: str) -> None: if not self._active.acquire(blocking=False): + logger.warning( + "Provider catalog scheduler: concurrency exhausted, skipping {} for now", + provider, + ) return try: + logger.info("Provider catalog scheduler: starting refresh for {}", provider) self._refresh_provider(provider) finally: self._active.release() @@ -240,6 +288,10 @@ def _ensure_status_rows(self) -> None: ) status = get_provider_index_status(session, provider=provider) if status is None: + logger.warning( + "Provider catalog bootstrap: no persisted index state for {}. Initial bootstrap required.", + provider, + ) upsert_provider_index_status( session, provider=provider, @@ -248,6 +300,37 @@ def _ensure_status_rows(self) -> None: bootstrap_completed=False, next_refresh_after=now, ) + continue + if status.status == "running": + logger.warning( + "Provider catalog bootstrap: recovered interrupted run for {} started_at={} cursor_slug={}. 
Marking it pending for retry.", + provider, + status.latest_started_at.isoformat() + if status.latest_started_at is not None + else None, + status.cursor_title_slug or None, + ) + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=hours, + status="pending", + latest_completed_at=utcnow(), + next_refresh_after=now, + failure_count=status.failure_count + 1, + last_error_summary="Interrupted by process restart before completion.", + ) + else: + logger.debug( + "Provider catalog bootstrap: loaded persisted state for {} status={} bootstrap_completed={} latest_success_generation={} next_refresh_after={}", + provider, + status.status, + status.bootstrap_completed, + status.latest_success_generation, + status.next_refresh_after.isoformat() + if status.next_refresh_after is not None + else None, + ) def _is_due(self, status) -> bool: if status.status == "running": @@ -498,6 +581,43 @@ def _crawl_provider_catalog_with_heartbeat( int(elapsed_seconds), ) + def _is_bootstrap_ready(self) -> bool: + with Session(engine) as session: + return is_catalog_bootstrap_ready(session, providers=CATALOG_SITES_LIST) + + def _log_bootstrap_state(self) -> None: + with Session(engine) as session: + statuses = list_provider_index_statuses(session) + bootstrap_ready = is_catalog_bootstrap_ready( + session, providers=CATALOG_SITES_LIST + ) + if not statuses: + logger.warning("Provider catalog bootstrap: no provider status rows exist yet") + return + if bootstrap_ready: + logger.info("Provider catalog bootstrap: already complete") + else: + logger.warning("Provider catalog bootstrap: incomplete, requests may be gated") + for status in sorted(statuses, key=lambda item: item.provider): + logger.info( + "Provider catalog bootstrap state: provider={} status={} bootstrap_completed={} latest_success_generation={} latest_started_at={} latest_completed_at={} next_refresh_after={} cursor_slug={} last_error={}", + status.provider, + status.status, + 
status.bootstrap_completed, + status.latest_success_generation, + status.latest_started_at.isoformat() + if status.latest_started_at is not None + else None, + status.latest_completed_at.isoformat() + if status.latest_completed_at is not None + else None, + status.next_refresh_after.isoformat() + if status.next_refresh_after is not None + else None, + status.cursor_title_slug or None, + status.last_error_summary or None, + ) + def _set_progress( self, provider: str, From 5a51f42c586080193c09cbe6334944371be30137 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Tue, 28 Apr 2026 21:43:34 +0200 Subject: [PATCH 10/45] feat(tests): add recovery test for interrupted catalog indexing state add a test to ensure the catalog indexer can recover from an interrupted running state, verifying that appropriate warnings and status updates are logged. --- apps/api/tests/unit/catalog/test_indexer.py | 67 +++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py index 899898b7..ff1205de 100644 --- a/apps/api/tests/unit/catalog/test_indexer.py +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -1,4 +1,5 @@ import time +from types import SimpleNamespace def test_catalog_scheduler_runs_immediately(monkeypatch): @@ -41,3 +42,69 @@ def fake_crawl(_provider: str) -> list[object]: assert titles == [] assert any("still discovering titles after" in message for message in messages) + + +def test_catalog_recovers_interrupted_running_state(monkeypatch): + import app.catalog.indexer as indexer_module + from app.catalog.indexer import ProviderCatalogIndexer + + updates: list[dict[str, object]] = [] + warnings: list[str] = [] + statuses = { + "aniworld.to": SimpleNamespace( + provider="aniworld.to", + status="running", + bootstrap_completed=False, + latest_started_at=None, + latest_success_generation=None, + next_refresh_after=None, + failure_count=2, + cursor_title_slug="one-piece", + ), + "s.to": 
None, + "megakino": SimpleNamespace( + provider="megakino", + status="ready", + bootstrap_completed=True, + latest_started_at=None, + latest_success_generation="abc123", + next_refresh_after=None, + failure_count=0, + cursor_title_slug=None, + ), + } + + class FakeSession: + def __enter__(self): + return object() + + def __exit__(self, exc_type, exc, tb): + return False + + def fake_warning(message: str, *args) -> None: + warnings.append(message.format(*args)) + + def fake_get_provider_index_status(_session, provider: str): + return statuses[provider] + + def fake_upsert_provider_index_status(_session, **kwargs): + updates.append(kwargs) + return None + + monkeypatch.setattr(indexer_module, "Session", lambda _engine: FakeSession()) + monkeypatch.setattr(indexer_module, "get_provider_index_status", fake_get_provider_index_status) + monkeypatch.setattr(indexer_module, "upsert_provider_index_status", fake_upsert_provider_index_status) + monkeypatch.setattr(indexer_module.logger, "warning", fake_warning) + + ProviderCatalogIndexer()._ensure_status_rows() + + assert any("recovered interrupted run for aniworld.to" in item for item in warnings) + assert any("Initial bootstrap required" in item for item in warnings) + assert any( + update.get("provider") == "aniworld.to" and update.get("status") == "pending" + for update in updates + ) + assert any( + update.get("provider") == "s.to" and update.get("status") == "pending" + for update in updates + ) From 2a8f68946deca3605c32fbb131051a7f600f0710 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Tue, 28 Apr 2026 21:44:10 +0200 Subject: [PATCH 11/45] style: run ruff format --- apps/api/app/api/torznab/api.py | 17 ++++-- apps/api/app/catalog/indexer.py | 28 +++++---- apps/api/app/catalog/providers.py | 61 +++++++++++++------ apps/api/app/config.py | 4 +- .../20260428_0004_provider_catalog_index.py | 15 +++-- apps/api/app/db/models.py | 18 ++++-- .../tests/integration/api/torznab/test_api.py | 24 ++++++-- 
apps/api/tests/unit/catalog/test_indexer.py | 10 ++- apps/api/tests/unit/db/test_migrations.py | 4 +- 9 files changed, 124 insertions(+), 57 deletions(-) diff --git a/apps/api/app/api/torznab/api.py b/apps/api/app/api/torznab/api.py index eb74d8c8..06c9a19f 100644 --- a/apps/api/app/api/torznab/api.py +++ b/apps/api/app/api/torznab/api.py @@ -410,9 +410,15 @@ def _indexed_preview_results( season=target.season, episode=target.episode, ) - language_values = [item.language for item in languages] or _default_languages_for_site(provider) - season_i = mapping.canonical_season if mapping is not None else target.season - episode_i = mapping.canonical_episode if mapping is not None else target.episode + language_values = [ + item.language for item in languages + ] or _default_languages_for_site(provider) + season_i = ( + mapping.canonical_season if mapping is not None else target.season + ) + episode_i = ( + mapping.canonical_episode if mapping is not None else target.episode + ) provider_season_i = target.season provider_episode_i = target.episode else: @@ -672,7 +678,9 @@ def torznab_api( q_str=q_str, channel=channel, cat_id=TORZNAB_CAT_ANIME, - providers=[site for site in CATALOG_SITES_LIST if site != "megakino"], + providers=[ + site for site in CATALOG_SITES_LIST if site != "megakino" + ], limit=limit, strm_suffix=strm_suffix, ) @@ -730,6 +738,7 @@ def torznab_api( raise HTTPException(status_code=400, detail="invalid t") import app.api.torznab as tn + try: require_catalog_ready() except CatalogNotReadyError as exc: diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index 36a18788..854a1cff 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -85,9 +85,7 @@ def get_catalog_readiness_error() -> str | None: return None pending: list[str] = [] snapshot = indexer.get_progress_snapshot() - by_provider = { - item["provider"]: item for item in snapshot.get("providers", []) - } + by_provider = {item["provider"]: item 
for item in snapshot.get("providers", [])} for provider in CATALOG_SITES_LIST: status = get_provider_index_status(session, provider=provider) if status is None or not status.bootstrap_completed: @@ -239,7 +237,9 @@ def get_progress_snapshot(self) -> dict[str, object]: "provider": provider, "status": status.status if status is not None else "pending", "bootstrap_completed": ( - bool(status.bootstrap_completed) if status is not None else False + bool(status.bootstrap_completed) + if status is not None + else False ), "phase": phase, "processed_titles": progress.processed_titles, @@ -343,7 +343,9 @@ def _is_due(self, status) -> bool: def _refresh_provider(self, provider: str) -> None: refresh_interval_hours = float( - CATALOG_SITE_CONFIGS.get(provider, {}).get("provider_index_refresh_hours", 24.0) + CATALOG_SITE_CONFIGS.get(provider, {}).get( + "provider_index_refresh_hours", 24.0 + ) ) generation = uuid4().hex reporter: ProgressReporter | None = None @@ -535,7 +537,9 @@ def _refresh_provider(self, provider: str) -> None: if reporter is not None: reporter.close() except Exception as exc: - logger.exception("Provider catalog refresh failed for {}: {}", provider, exc) + logger.exception( + "Provider catalog refresh failed for {}: {}", provider, exc + ) if reporter is not None: reporter.close() completed_at = utcnow() @@ -564,9 +568,7 @@ def _refresh_provider(self, provider: str) -> None: current_slug="", ) - def _crawl_provider_catalog_with_heartbeat( - self, provider: str - ) -> list[object]: + def _crawl_provider_catalog_with_heartbeat(self, provider: str) -> list[object]: elapsed_seconds = 0.0 with ThreadPoolExecutor(max_workers=1) as executor: future = executor.submit(crawl_provider_catalog, provider) @@ -592,12 +594,16 @@ def _log_bootstrap_state(self) -> None: session, providers=CATALOG_SITES_LIST ) if not statuses: - logger.warning("Provider catalog bootstrap: no provider status rows exist yet") + logger.warning( + "Provider catalog bootstrap: no provider status 
rows exist yet" + ) return if bootstrap_ready: logger.info("Provider catalog bootstrap: already complete") else: - logger.warning("Provider catalog bootstrap: incomplete, requests may be gated") + logger.warning( + "Provider catalog bootstrap: incomplete, requests may be gated" + ) for status in sorted(statuses, key=lambda item: item.provider): logger.info( "Provider catalog bootstrap state: provider={} status={} bootstrap_completed={} latest_success_generation={} latest_started_at={} latest_completed_at={} next_refresh_after={} cursor_slug={} last_error={}", diff --git a/apps/api/app/catalog/providers.py b/apps/api/app/catalog/providers.py index 57db875c..6b3262bc 100644 --- a/apps/api/app/catalog/providers.py +++ b/apps/api/app/catalog/providers.py @@ -72,9 +72,13 @@ def _normalize_provider_data(raw: Any, *, site: str) -> list[EpisodeLanguageReco languages: list[EpisodeLanguageRecord] = [] for key, provider_map in raw.items(): if site == "aniworld.to": - audio = getattr(key[0], "value", str(key[0])) if isinstance(key, tuple) else "" + audio = ( + getattr(key[0], "value", str(key[0])) if isinstance(key, tuple) else "" + ) subtitles = ( - getattr(key[1], "value", str(key[1])) if isinstance(key, tuple) and len(key) > 1 else "" + getattr(key[1], "value", str(key[1])) + if isinstance(key, tuple) and len(key) > 1 + else "" ) if audio == "German" and subtitles == "None": language = "German Dub" @@ -95,7 +99,9 @@ def _normalize_provider_data(raw: Any, *, site: str) -> list[EpisodeLanguageReco else: language = str(key) host_hints = sorted(str(name) for name in (provider_map or {}).keys()) - languages.append(EpisodeLanguageRecord(language=language, host_hints=host_hints)) + languages.append( + EpisodeLanguageRecord(language=language, host_hints=host_hints) + ) languages.sort(key=lambda entry: entry.language) return languages @@ -140,7 +146,11 @@ def _build_tv_canonical_payload( season_number = item.get("seasonNumber") episode_number = item.get("episodeNumber") 
episode_title = str(item.get("title") or "").strip() - if not isinstance(season_number, int) or not isinstance(episode_number, int) or not episode_title: + if ( + not isinstance(season_number, int) + or not isinstance(episode_number, int) + or not episode_title + ): continue canonical_episodes.append( { @@ -153,9 +163,13 @@ def _build_tv_canonical_payload( series_payload = { "tvdb_id": match.tvdb_id, "title": match.title, - "tmdb_id": payload.get("tmdbId") if isinstance(payload.get("tmdbId"), int) else None, + "tmdb_id": payload.get("tmdbId") + if isinstance(payload.get("tmdbId"), int) + else None, "imdb_id": imdb_id or str(payload.get("imdbId") or "").strip() or None, - "tvmaze_id": payload.get("tvMazeId") if isinstance(payload.get("tvMazeId"), int) else None, + "tvmaze_id": payload.get("tvMazeId") + if isinstance(payload.get("tvMazeId"), int) + else None, "anilist_id": None, "mal_id": mal_id, "aliases": aliases, @@ -169,9 +183,7 @@ def _build_tv_canonical_payload( } ] - by_number = { - (item["season"], item["episode"]): item for item in canonical_episodes - } + by_number = {(item["season"], item["episode"]): item for item in canonical_episodes} by_season: dict[int, list[dict[str, Any]]] = {} for item in canonical_episodes: by_season.setdefault(int(item["season"]), []).append(item) @@ -198,12 +210,18 @@ def _build_tv_canonical_payload( scored: list[tuple[float, dict[str, Any]]] = [] search_titles = [ value - for value in [provider_episode.title_primary, provider_episode.title_secondary] + for value in [ + provider_episode.title_primary, + provider_episode.title_secondary, + ] if value ] for candidate in candidate_pool: score = max( - (_score_episode_title(search_title, candidate["title"]) for search_title in search_titles), + ( + _score_episode_title(search_title, candidate["title"]) + for search_title in search_titles + ), default=0.0, ) if score >= 0.65: @@ -213,9 +231,7 @@ def _build_tv_canonical_payload( continue top_score = scored[0][0] plausible = [ - 
candidate - for score, candidate in scored - if score >= top_score - 0.05 + candidate for score, candidate in scored if score >= top_score - 0.05 ] confidence = "high_confidence" if top_score >= 0.85 else "low_confidence" for candidate in plausible: @@ -263,7 +279,7 @@ def _crawl_aniworld_like_title( if isinstance(raw_mal, list) and raw_mal: try: mal_id = int(raw_mal[0]) - except (TypeError, ValueError): + except TypeError, ValueError: mal_id = None else: from aniworld.models import SerienstreamSeries @@ -286,9 +302,12 @@ def _crawl_aniworld_like_title( title_primary=getattr(episode, "title_de", None), title_secondary=getattr(episode, "title_en", None), media_type_hint="movie" - if provider_key == "aniworld.to" and getattr(episode, "is_movie", False) + if provider_key == "aniworld.to" + and getattr(episode, "is_movie", False) else "episode", - languages=_normalize_provider_data(provider_data, site=provider_key), + languages=_normalize_provider_data( + provider_data, site=provider_key + ), ) ) @@ -347,7 +366,9 @@ def crawl_provider_catalog(provider_key: str) -> list[TitleRecord]: if live_title: parsed_title = live_title except Exception as exc: - logger.debug("Megakino metadata fetch failed for {}: {}", entry.url, exc) + logger.debug( + "Megakino metadata fetch failed for {}: {}", entry.url, exc + ) titles.append( TitleRecord( provider=provider_key, @@ -364,7 +385,9 @@ def crawl_provider_catalog(provider_key: str) -> list[TitleRecord]: index = provider.load_or_refresh_index() alternatives = provider.load_or_refresh_alternatives() - workers = int(CATALOG_SITE_CONFIGS[provider_key].get("provider_index_concurrency", 1)) + workers = int( + CATALOG_SITE_CONFIGS[provider_key].get("provider_index_concurrency", 1) + ) futures = [] results: list[TitleRecord] = [] with ThreadPoolExecutor(max_workers=max(1, workers)) as executor: diff --git a/apps/api/app/config.py b/apps/api/app/config.py index 30970089..4b000fce 100644 --- a/apps/api/app/config.py +++ 
b/apps/api/app/config.py @@ -246,9 +246,7 @@ def _ensure_runtime_home() -> Path: MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN = int( os.getenv("MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN", "100") ) -PROVIDER_INDEX_REFRESH_HOURS = float( - os.getenv("PROVIDER_INDEX_REFRESH_HOURS", "24") -) +PROVIDER_INDEX_REFRESH_HOURS = float(os.getenv("PROVIDER_INDEX_REFRESH_HOURS", "24")) PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD = float( os.getenv( "PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD", str(PROVIDER_INDEX_REFRESH_HOURS) diff --git a/apps/api/app/db/migrations/versions/20260428_0004_provider_catalog_index.py b/apps/api/app/db/migrations/versions/20260428_0004_provider_catalog_index.py index 536286c3..cc38710e 100644 --- a/apps/api/app/db/migrations/versions/20260428_0004_provider_catalog_index.py +++ b/apps/api/app/db/migrations/versions/20260428_0004_provider_catalog_index.py @@ -206,7 +206,10 @@ def upgrade() -> None: sa.Column("indexed_generation", sa.String(), nullable=False), sa.Column("last_indexed_at", sa.DateTime(), nullable=False), sa.PrimaryKeyConstraint( - "provider", "slug", "season", "episode", + "provider", + "slug", + "season", + "episode", name="pk_providercatalogepisode", ), ) @@ -242,7 +245,11 @@ def upgrade() -> None: sa.Column("indexed_generation", sa.String(), nullable=False), sa.Column("last_indexed_at", sa.DateTime(), nullable=False), sa.PrimaryKeyConstraint( - "provider", "slug", "season", "episode", "language", + "provider", + "slug", + "season", + "episode", + "language", name="pk_providerepisodelanguage", ), ) @@ -322,9 +329,7 @@ def upgrade() -> None: sa.Column("tvdb_id", sa.Integer(), nullable=False), sa.Column("alias", sa.String(), nullable=False), sa.Column("normalized_alias", sa.String(), nullable=False), - sa.PrimaryKeyConstraint( - "tvdb_id", "alias", name="pk_canonicalseriesalias" - ), + sa.PrimaryKeyConstraint("tvdb_id", "alias", name="pk_canonicalseriesalias"), ) op.create_index( "ix_canonicalseriesalias_normalized_alias", diff --git a/apps/api/app/db/models.py 
b/apps/api/app/db/models.py index 3d7db2b7..172ce80a 100644 --- a/apps/api/app/db/models.py +++ b/apps/api/app/db/models.py @@ -1133,7 +1133,11 @@ def upsert_canonical_series( rec.mal_id = mal_id rec.last_synced_at = utcnow() session.add(rec) - session.exec(CanonicalSeriesAlias.__table__.delete().where(CanonicalSeriesAlias.tvdb_id == tvdb_id)) + session.exec( + CanonicalSeriesAlias.__table__.delete().where( + CanonicalSeriesAlias.tvdb_id == tvdb_id + ) + ) for alias in aliases or []: alias_clean = (alias or "").strip() if not alias_clean: @@ -1183,7 +1187,9 @@ def is_catalog_bootstrap_ready( statuses = { row.provider: row for row in session.exec( - select(ProviderIndexStatus).where(ProviderIndexStatus.provider.in_(providers)) + select(ProviderIndexStatus).where( + ProviderIndexStatus.provider.in_(providers) + ) ).all() } return all( @@ -1233,7 +1239,9 @@ def search_indexed_provider_titles( visible_generations = _visible_generation_map(session, providers=providers) if not visible_generations: return [] - stmt = select(ProviderCatalogTitle).where(ProviderCatalogTitle.provider.in_(providers)) + stmt = select(ProviderCatalogTitle).where( + ProviderCatalogTitle.provider.in_(providers) + ) if media_type_hint is not None: stmt = stmt.where(ProviderCatalogTitle.media_type_hint == media_type_hint) rows = [ @@ -1402,7 +1410,9 @@ def find_canonical_series_by_ids_or_title( if row is not None: return row alias = session.exec( - select(CanonicalSeriesAlias).where(CanonicalSeriesAlias.normalized_alias == q_norm) + select(CanonicalSeriesAlias).where( + CanonicalSeriesAlias.normalized_alias == q_norm + ) ).first() if alias is not None: return session.get(CanonicalSeries, alias.tvdb_id) diff --git a/apps/api/tests/integration/api/torznab/test_api.py b/apps/api/tests/integration/api/torznab/test_api.py index f8224ec4..f7181b1a 100644 --- a/apps/api/tests/integration/api/torznab/test_api.py +++ b/apps/api/tests/integration/api/torznab/test_api.py @@ -150,7 +150,9 @@ def 
test_tvsearch_happy_path(client, monkeypatch): monkeypatch.setattr( tn, "probe_episode_quality", - lambda **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected live probe")), + lambda **_kwargs: (_ for _ in ()).throw( + AssertionError("unexpected live probe") + ), ) resp = client.get( @@ -198,7 +200,9 @@ def test_tvsearch_uses_id_resolved_query_when_q_missing(client, monkeypatch): monkeypatch.setattr( tn, "probe_episode_quality", - lambda **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected live probe")), + lambda **_kwargs: (_ for _ in ()).throw( + AssertionError("unexpected live probe") + ), ) resp = client.get( @@ -221,7 +225,9 @@ def test_tvsearch_season_search_emits_multiple_episodes(client, monkeypatch) -> monkeypatch.setattr( tn, "probe_episode_quality", - lambda **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected live probe")), + lambda **_kwargs: (_ for _ in ()).throw( + AssertionError("unexpected live probe") + ), ) resp = client.get( @@ -258,7 +264,9 @@ def test_tvsearch_season_search_fallback_stops_on_consecutive_misses( monkeypatch.setattr( tn, "probe_episode_quality", - lambda **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected live probe")), + lambda **_kwargs: (_ for _ in ()).throw( + AssertionError("unexpected live probe") + ), ) resp = client.get( @@ -282,7 +290,9 @@ def test_tvsearch_ep_zero_is_treated_as_season_search(client, monkeypatch) -> No monkeypatch.setattr( tn, "probe_episode_quality", - lambda **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected live probe")), + lambda **_kwargs: (_ for _ in ()).throw( + AssertionError("unexpected live probe") + ), ) resp = client.get( @@ -316,7 +326,9 @@ def test_tvsearch_fast_season_mode_avoids_live_probe(client, monkeypatch) -> Non monkeypatch.setattr( tn, "probe_episode_quality", - lambda **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected live probe")), + lambda **_kwargs: (_ for _ in ()).throw( + AssertionError("unexpected live probe") + ), ) resp = 
client.get( diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py index ff1205de..8b01bb11 100644 --- a/apps/api/tests/unit/catalog/test_indexer.py +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -92,8 +92,14 @@ def fake_upsert_provider_index_status(_session, **kwargs): return None monkeypatch.setattr(indexer_module, "Session", lambda _engine: FakeSession()) - monkeypatch.setattr(indexer_module, "get_provider_index_status", fake_get_provider_index_status) - monkeypatch.setattr(indexer_module, "upsert_provider_index_status", fake_upsert_provider_index_status) + monkeypatch.setattr( + indexer_module, "get_provider_index_status", fake_get_provider_index_status + ) + monkeypatch.setattr( + indexer_module, + "upsert_provider_index_status", + fake_upsert_provider_index_status, + ) monkeypatch.setattr(indexer_module.logger, "warning", fake_warning) ProviderCatalogIndexer()._ensure_status_rows() diff --git a/apps/api/tests/unit/db/test_migrations.py b/apps/api/tests/unit/db/test_migrations.py index b6f2afb2..e54d960a 100644 --- a/apps/api/tests/unit/db/test_migrations.py +++ b/apps/api/tests/unit/db/test_migrations.py @@ -9,9 +9,7 @@ def _head_revision() -> str: config = Config( str( - Path(__file__) - .resolve() - .parents[3] + Path(__file__).resolve().parents[3] / "app" / "db" / "migrations" From 8baed8dc0843979d59829f0b55c5467e23f62e78 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 00:07:17 +0200 Subject: [PATCH 12/45] perf(catalog): add language deduplication and parsing for Aniworld implement functions to deduplicate language records and parse season rows from Aniworld, enhancing the cataloging process for episodes and languages. 
--- apps/api/app/catalog/providers.py | 167 +++++++++++++++++++++++++----- 1 file changed, 141 insertions(+), 26 deletions(-) diff --git a/apps/api/app/catalog/providers.py b/apps/api/app/catalog/providers.py index 6b3262bc..2b28a949 100644 --- a/apps/api/app/catalog/providers.py +++ b/apps/api/app/catalog/providers.py @@ -3,6 +3,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field from difflib import SequenceMatcher +import re from typing import Any, Optional from urllib.parse import urlparse @@ -106,6 +107,142 @@ def _normalize_provider_data(raw: Any, *, site: str) -> list[EpisodeLanguageReco return languages +def _dedupe_languages( + languages: list[EpisodeLanguageRecord], +) -> list[EpisodeLanguageRecord]: + deduped: dict[str, set[str]] = {} + for item in languages: + bucket = deduped.setdefault(item.language, set()) + bucket.update(item.host_hints) + return [ + EpisodeLanguageRecord(language=language, host_hints=sorted(host_hints)) + for language, host_hints in sorted(deduped.items()) + ] + + +def _aniworld_languages_from_flags(host_hints: list[str], row: BeautifulSoup) -> list[EpisodeLanguageRecord]: + languages: list[EpisodeLanguageRecord] = [] + for image in row.select("td.editFunctions img.flag"): + src = str(image.get("src") or "").lower() + title = str(image.get("title") or "").lower() + alt = str(image.get("alt") or "").lower() + text = " ".join([src, title, alt]) + if "japanese-german" in text or "deutsch" in text and "untertitel" in text: + languages.append( + EpisodeLanguageRecord( + language="German Sub", + host_hints=host_hints, + ) + ) + elif "japanese-english" in text or "englisch" in text: + languages.append( + EpisodeLanguageRecord( + language="English Sub", + host_hints=host_hints, + ) + ) + elif "german.svg" in src or "deutsche sprache" in text or "deutsch/german" in text: + languages.append( + EpisodeLanguageRecord( + language="German Dub", + host_hints=host_hints, + ) + ) + return 
_dedupe_languages(languages) + + +def _host_hints_from_row(row: BeautifulSoup) -> list[str]: + names: list[str] = [] + for icon in row.select("i.icon"): + classes = [cls for cls in icon.get("class", []) if cls != "icon"] + if classes: + names.append(str(classes[-1])) + continue + title = str(icon.get("title") or "").strip() + if title: + names.append(title.replace("Hoster ", "").strip()) + return sorted(dict.fromkeys(name for name in names if name)) + + +def _parse_aniworld_season_rows(season) -> list[EpisodeRecord]: + soup = BeautifulSoup(season._html, "html.parser") + episodes: list[EpisodeRecord] = [] + for row in soup.select('tr[itemtype="http://schema.org/Episode"]'): + link = row.select_one('a[itemprop="url"]') + if link is None: + continue + href = str(link.get("href") or "").strip() + if not href: + continue + relative_path = _relative_path(href) + episode_number = 0 + number_meta = row.select_one('meta[itemprop="episodeNumber"]') + if number_meta is not None: + content = str(number_meta.get("content") or "").strip() + if content.isdigit(): + episode_number = int(content) + if episode_number <= 0: + match = re.search(r"(?:episode|film)-(\d+)", href) + if match: + episode_number = int(match.group(1)) + if episode_number <= 0: + continue + title_primary = None + title_secondary = None + title_cell = row.select_one("td.seasonEpisodeTitle") + if title_cell is not None: + strong = title_cell.select_one("strong") + span = title_cell.select_one("span") + title_primary = strong.get_text(" ", strip=True) if strong is not None else None + title_secondary = span.get_text(" ", strip=True) if span is not None else None + host_hints = _host_hints_from_row(row) + languages = _aniworld_languages_from_flags(host_hints, row) + episodes.append( + EpisodeRecord( + season=int(getattr(season, "season_number", 0) or 0), + episode=episode_number, + relative_path=relative_path, + title_primary=title_primary, + title_secondary=title_secondary, + media_type_hint="movie" if 
getattr(season, "are_movies", False) else "episode", + languages=languages, + ) + ) + return episodes + + +def _parse_sto_season_rows(season) -> list[EpisodeRecord]: + html = season._html + season_number = int(getattr(season, "season_number", 0) or 0) + pattern = re.compile( + r'href="(?P<href>(?:https?://(?:serienstream|s)\.to)?/serie/[^"\s]+/staffel-' + + str(season_number) + + r'/episode-(?P<episode>\d+))/?\"' + ) + episodes: list[EpisodeRecord] = [] + seen: set[tuple[int, str]] = set() + for match in pattern.finditer(html): + episode_number = int(match.group("episode")) + href = match.group("href") + relative_path = _relative_path(href) + key = (episode_number, relative_path) + if key in seen: + continue + seen.add(key) + episodes.append( + EpisodeRecord( + season=season_number, + episode=episode_number, + relative_path=relative_path, + title_primary=None, + title_secondary=None, + media_type_hint="episode", + languages=[], + ) + ) + return episodes + + def _score_episode_title(left: str, right: str) -> float: a = normalize_catalog_text(left) b = normalize_catalog_text(right) @@ -275,12 +412,6 @@ def _crawl_aniworld_like_title( series = AniworldSeries(url) imdb_id = series.imdb mal_id = None - raw_mal = series.mal_id - if isinstance(raw_mal, list) and raw_mal: - try: - mal_id = int(raw_mal[0]) - except TypeError, ValueError: - mal_id = None else: from aniworld.models import SerienstreamSeries @@ -290,26 +421,10 @@ def _crawl_aniworld_like_title( episodes: list[EpisodeRecord] = [] for season in series.seasons: - for episode in season.episodes: - provider_data = getattr(episode.provider_data, "_data", None) - if provider_data is None: - provider_data = getattr(episode.provider_data, "data", None) - episodes.append( - EpisodeRecord( - season=int(getattr(season, "season_number", 0) or 0), - episode=int(getattr(episode, "episode_number", 0) or 0), - relative_path=_relative_path(episode.url), - title_primary=getattr(episode, "title_de", None), - title_secondary=getattr(episode, 
"title_en", None), - media_type_hint="movie" - if provider_key == "aniworld.to" - and getattr(episode, "is_movie", False) - else "episode", - languages=_normalize_provider_data( - provider_data, site=provider_key - ), - ) - ) + if provider_key == "aniworld.to": + episodes.extend(_parse_aniworld_season_rows(season)) + else: + episodes.extend(_parse_sto_season_rows(season)) canonical = _build_tv_canonical_payload( provider=provider_key, From ab1962ba0ef15d8f651eceac62745db7e164d87b Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 00:07:33 +0200 Subject: [PATCH 13/45] perf(indexer): add worker management for provider catalog refresh Implement threading support for managing provider refresh operations. This includes adding locks and a dictionary to track active worker threads, ensuring that concurrent refreshes for the same provider do not occur. --- apps/api/app/catalog/indexer.py | 38 ++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index 854a1cff..cb687126 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -122,6 +122,8 @@ def __init__(self) -> None: self._active = threading.Semaphore(PROVIDER_INDEX_GLOBAL_CONCURRENCY) self._progress_lock = threading.Lock() self._progress: dict[str, ProviderCatalogProgress] = {} + self._workers_lock = threading.Lock() + self._workers: dict[str, threading.Thread] = {} def start(self) -> None: self._ensure_status_rows() @@ -147,6 +149,10 @@ def stop(self) -> None: self._stop_event.set() if self._thread is not None: self._thread.join(timeout=5) + with self._workers_lock: + workers = list(self._workers.values()) + for worker in workers: + worker.join(timeout=5) def run_due_once(self) -> None: with Session(engine) as session: @@ -192,17 +198,31 @@ def run_due_once(self) -> None: ) def refresh_provider(self, provider: str) -> None: + with self._workers_lock: + existing = 
self._workers.get(provider) + if existing is not None and existing.is_alive(): + logger.debug( + "Provider catalog scheduler: {} already running in worker {}", + provider, + existing.name, + ) + return if not self._active.acquire(blocking=False): logger.warning( "Provider catalog scheduler: concurrency exhausted, skipping {} for now", provider, ) return - try: - logger.info("Provider catalog scheduler: starting refresh for {}", provider) - self._refresh_provider(provider) - finally: - self._active.release() + logger.info("Provider catalog scheduler: starting refresh for {}", provider) + worker = threading.Thread( + target=self._run_provider_refresh, + name=f"provider-index-{provider}", + args=(provider,), + daemon=True, + ) + with self._workers_lock: + self._workers[provider] = worker + worker.start() def get_progress_snapshot(self) -> dict[str, object]: with Session(engine) as session: @@ -276,6 +296,14 @@ def _run_loop(self) -> None: if self._stop_event.wait(PROVIDER_INDEX_SCHEDULER_POLL_SECONDS): break + def _run_provider_refresh(self, provider: str) -> None: + try: + self._refresh_provider(provider) + finally: + self._active.release() + with self._workers_lock: + self._workers.pop(provider, None) + def _ensure_status_rows(self) -> None: with Session(engine) as session: now = None From 9c8abe6678f92dc97ef0886fe288b177d47541a6 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 00:07:50 +0200 Subject: [PATCH 14/45] test: add tests for provider catalog indexing and parsing Add unit tests for the provider catalog indexing functionality, including the refresh mechanism and parsing of Aniworld and Sto season rows. These tests ensure that the indexing process behaves as expected and correctly extracts episode information from the provided HTML structures. 
--- apps/api/tests/unit/catalog/test_indexer.py | 17 +++++ apps/api/tests/unit/catalog/test_providers.py | 72 +++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 apps/api/tests/unit/catalog/test_providers.py diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py index 8b01bb11..0817898b 100644 --- a/apps/api/tests/unit/catalog/test_indexer.py +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -114,3 +114,20 @@ def fake_upsert_provider_index_status(_session, **kwargs): update.get("provider") == "s.to" and update.get("status") == "pending" for update in updates ) + + +def test_refresh_provider_starts_background_worker(monkeypatch): + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + called: list[str] = [] + + def fake_refresh(provider: str) -> None: + called.append(provider) + + monkeypatch.setattr(indexer, "_refresh_provider", fake_refresh) + + indexer.refresh_provider("aniworld.to") + indexer.stop() + + assert called == ["aniworld.to"] diff --git a/apps/api/tests/unit/catalog/test_providers.py b/apps/api/tests/unit/catalog/test_providers.py new file mode 100644 index 00000000..4ee5417b --- /dev/null +++ b/apps/api/tests/unit/catalog/test_providers.py @@ -0,0 +1,72 @@ +from types import SimpleNamespace + +from app.catalog.providers import _parse_aniworld_season_rows, _parse_sto_season_rows + + +def test_parse_aniworld_season_rows_uses_season_html_only(): + season = SimpleNamespace( + season_number=1, + are_movies=False, + _html=""" + + + + + + + + + Deutsch Titel - English Title + + + + + + + + + + + + + + + + + """, + ) + + episodes = _parse_aniworld_season_rows(season) + + assert len(episodes) == 1 + episode = episodes[0] + assert episode.season == 1 + assert episode.episode == 1 + assert episode.relative_path == "/anime/stream/demo/staffel-1/episode-1" + assert episode.title_primary == "Deutsch Titel" + assert episode.title_secondary == "English 
Title" + assert episode.media_type_hint == "episode" + assert [item.language for item in episode.languages] == [ + "German Dub", + "German Sub", + ] + assert episode.languages[0].host_hints == ["Filemoon", "VOE"] + + +def test_parse_sto_season_rows_extracts_episode_links_without_episode_pages(): + season = SimpleNamespace( + season_number=2, + _html=""" + Episode 1 + Episode 2 + Episode 2 duplicate + """, + ) + + episodes = _parse_sto_season_rows(season) + + assert [(item.season, item.episode, item.relative_path) for item in episodes] == [ + (2, 1, "/serie/demo/staffel-2/episode-1"), + (2, 2, "/serie/demo/staffel-2/episode-2"), + ] + assert all(item.languages == [] for item in episodes) From 7f8d0e28201fff5ab493f474432d1b47c4277b3b Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 00:21:00 +0200 Subject: [PATCH 15/45] fix(api): stabilize Megakino domain resolution and sitemap loading Resolve the active Megakino domain through a small redirect-aware seed chain before the catalog indexer starts, and use the external GitHub domain monitor only as a fallback hint source. Bypass the shared pooled HTTP client for Megakino resolver and sitemap requests so startup no longer fails on the urllib3-future kill_cursor error, and ensure the indexer uses the resolved base URL instead of a stale configured host. 
--- apps/api/app/core/lifespan.py | 10 +- apps/api/app/providers/megakino/sitemap.py | 18 +++- apps/api/app/utils/domain_resolver.py | 101 +++++++++++------- .../megakino/test_domain_resolver.py | 56 +++++----- 4 files changed, 110 insertions(+), 75 deletions(-) diff --git a/apps/api/app/core/lifespan.py b/apps/api/app/core/lifespan.py index 9defbdf9..0185cc15 100644 --- a/apps/api/app/core/lifespan.py +++ b/apps/api/app/core/lifespan.py @@ -130,6 +130,11 @@ async def lifespan(app: FastAPI): if cleaned: logger.warning(f"Reset {cleaned} dangling jobs to 'failed'") init_executor() + if "megakino" in CATALOG_SITE_CONFIGS and not ANIBRIDGE_TEST_MODE: + try: + resolve_megakino_base_url() + except Exception as e: + logger.warning(f"megakino domain resolution failed: {e}") try: get_catalog_indexer().start() except Exception as e: @@ -139,11 +144,6 @@ async def lifespan(app: FastAPI): cleanup_stop = threading.Event() ip_stop = threading.Event() megakino_stop = threading.Event() - if "megakino" in CATALOG_SITE_CONFIGS and not ANIBRIDGE_TEST_MODE: - try: - resolve_megakino_base_url() - except Exception as e: - logger.warning(f"megakino domain resolution failed: {e}") try: _start_ttl_cleanup_thread(cleanup_stop) except Exception as e: diff --git a/apps/api/app/providers/megakino/sitemap.py b/apps/api/app/providers/megakino/sitemap.py index d113365f..75d3faea 100644 --- a/apps/api/app/providers/megakino/sitemap.py +++ b/apps/api/app/providers/megakino/sitemap.py @@ -10,7 +10,10 @@ from loguru import logger -from app.utils.http_client import get as http_get +MEGAKINO_SITEMAP_USER_AGENT = ( + "Mozilla/5.0 (AniBridge Megakino Indexer; " + "+https://github.com/Zzackllack/AniBridge)" +) @dataclass(frozen=True) @@ -199,11 +202,20 @@ def _fetch_sitemap(url: str, timeout: float = 20.0) -> str: HTTPError: If the HTTP response status indicates a failure. 
""" logger.debug("Megakino sitemap fetch: {}", url) - resp = http_get(url, timeout=timeout) + resp = requests.get( + url, + timeout=timeout, + allow_redirects=True, + headers={ + "User-Agent": MEGAKINO_SITEMAP_USER_AGENT, + "Accept-Encoding": "identity", + }, + ) resp.raise_for_status() logger.debug( - "Megakino sitemap response: status={} bytes={}", + "Megakino sitemap response: status={} final_url={} bytes={}", resp.status_code, + resp.url, len(resp.text or ""), ) return resp.text diff --git a/apps/api/app/utils/domain_resolver.py b/apps/api/app/utils/domain_resolver.py index a0f9d314..6337de45 100644 --- a/apps/api/app/utils/domain_resolver.py +++ b/apps/api/app/utils/domain_resolver.py @@ -8,14 +8,19 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from loguru import logger +import requests from requests.exceptions import RequestException -from app.utils.http_client import get as http_get from app.utils.logger import config as configure_logger configure_logger() MEGAKINO_DEFAULT_DOMAIN = "megakino1.to" +MEGAKINO_REDIRECT_SEEDS = [ + "megakino1.to", + "megakino.to", + "megakino.live", +] MEGAKINO_DOMAIN_CANDIDATES = [ "megakino1.to", "megakino.live", @@ -28,6 +33,10 @@ "megakino.to", ] MEGAKINO_MIRRORS_PATH = "/mirrors.txt" +MEGAKINO_GITHUB_DOMAIN_HINT_URL = ( + "https://raw.githubusercontent.com/" + "Yezun-hikari/new-domain-check/main/monitors/megakino/domain.txt" +) USER_AGENT = "Mozilla/5.0 (AniBridge; +https://github.com/Zzackllack/AniBridge)" MEGAKINO_RESOLVER_TIMEOUT_SECONDS = 6 MEGAKINO_RESOLVER_MAX_WORKERS = 12 @@ -96,11 +105,13 @@ def _resolver_http_get( def _run() -> None: try: - result["resp"] = http_get( + request_headers = {"User-Agent": USER_AGENT, "Accept-Encoding": "identity"} + request_headers.update(headers) + result["resp"] = requests.get( url, timeout=timeout, allow_redirects=allow_redirects, - headers=headers, + headers=request_headers, ) except Exception as exc: err["exc"] = exc @@ -115,6 +126,36 @@ def _run() -> None: 
return result["resp"] +def _fetch_github_domain_hint( + timeout: float | int = MEGAKINO_RESOLVER_TIMEOUT_SECONDS, +) -> Optional[str]: + """Return the current Megakino host from the external monitor repository.""" + try: + resp = _resolver_http_get( + MEGAKINO_GITHUB_DOMAIN_HINT_URL, + timeout=timeout, + allow_redirects=True, + headers={"Accept": "text/plain"}, + ) + except RequestException as exc: + logger.debug("Megakino GitHub domain hint fetch failed: {}", exc) + return None + + if resp.status_code >= 400: + logger.debug( + "Megakino GitHub domain hint returned {}", + resp.status_code, + ) + return None + + domain = _normalize_domain(resp.text or "") + if not domain: + logger.debug("Megakino GitHub domain hint was empty or invalid.") + return None + logger.info("Megakino GitHub domain hint resolved to {}", domain) + return domain + + def _build_base_url(value: str) -> str: """ Create a normalized base URL from an input domain or URL. @@ -410,18 +451,20 @@ def fetch_megakino_domain( Returns: The resolved domain without a URL scheme (for example, "example.com"), or `None` if no candidate could be validated. 
""" - logger.info("Resolving megakino domain via sitemap checks.") + logger.info("Resolving megakino domain via redirect-aware sitemap checks.") seen: set[str] = set() - ordered_candidates: list[str] = [] - mirror_timeout = min(timeout, 8) - mirror_domains = fetch_megakino_mirror_domains( - timeout=mirror_timeout, include_sitemap_fallback=False - ) - if mirror_domains: - ordered_candidates.extend(mirror_domains) - ordered_candidates.extend(MEGAKINO_DOMAIN_CANDIDATES) + raw_candidates: list[str] = [] + env_candidate = os.getenv("MEGAKINO_BASE_URL", "").strip() + if env_candidate: + raw_candidates.append(env_candidate) + raw_candidates.extend(MEGAKINO_REDIRECT_SEEDS) + + hint_domain = _fetch_github_domain_hint(timeout=min(timeout, 8)) + if hint_domain: + raw_candidates.append(hint_domain) + normalized_candidates: list[str] = [] - for candidate in ordered_candidates: + for candidate in raw_candidates: domain = _normalize_domain(candidate) if not domain or domain in seen: continue @@ -429,44 +472,26 @@ def fetch_megakino_domain( normalized_candidates.append(domain) if not normalized_candidates: - logger.warning("Megakino domain resolution failed; no candidate succeeded.") + logger.warning("Megakino domain resolution failed; no candidate seeds available.") return None - max_workers = _resolver_max_workers(len(normalized_candidates)) logger.info( - "Probing {} megakino candidates (workers={}, timeout={}s).", - len(normalized_candidates), - max_workers, - timeout, + "Megakino domain resolution candidates: {}", + ", ".join(normalized_candidates), ) - def _probe_candidate(idx: int, domain: str) -> tuple[int, Optional[str]]: + for domain in normalized_candidates: base_url = _build_base_url(domain) try: - return (idx, _probe_megakino_sitemap(base_url, timeout=timeout)) + resolved = _probe_megakino_sitemap(base_url, timeout=timeout) except Exception as exc: logger.warning("Megakino candidate check failed for {}: {}", base_url, exc) - return (idx, None) - - probe_results: 
dict[int, str] = {} - with ThreadPoolExecutor( - max_workers=min(max_workers, len(normalized_candidates)) - ) as ex: - futures = [ - ex.submit(_probe_candidate, idx, domain) - for idx, domain in enumerate(normalized_candidates) - ] - for fut in as_completed(futures): - idx, resolved = fut.result() - if resolved: - probe_results[idx] = resolved - - for idx, domain in enumerate(normalized_candidates): - resolved = probe_results.get(idx) + continue if resolved: logger.success("Megakino domain resolved: {}", resolved) return resolved logger.warning("Megakino candidate failed validation: {}", domain) + logger.warning("Megakino domain resolution failed; no candidate succeeded.") return None diff --git a/apps/api/tests/unit/providers/megakino/test_domain_resolver.py b/apps/api/tests/unit/providers/megakino/test_domain_resolver.py index ec6094ed..7e1933b5 100644 --- a/apps/api/tests/unit/providers/megakino/test_domain_resolver.py +++ b/apps/api/tests/unit/providers/megakino/test_domain_resolver.py @@ -58,7 +58,7 @@ def fake_get(url, *, timeout=0, allow_redirects=True, headers=None): "MEGAKINO_DOMAIN_CANDIDATES", ["first.example", "second.example"], ) - monkeypatch.setattr(domain_resolver, "http_get", fake_get) + monkeypatch.setattr(domain_resolver.requests, "get", fake_get) domains = domain_resolver.fetch_megakino_mirror_domains(timeout=1) assert domains == ["megakino1.to", "megakino1.fit"] @@ -86,25 +86,21 @@ def fake_get(url, *, timeout=0, allow_redirects=True, headers=None): "MEGAKINO_DOMAIN_CANDIDATES", ["first.example"], ) - monkeypatch.setattr(domain_resolver, "http_get", fake_get) + monkeypatch.setattr(domain_resolver.requests, "get", fake_get) domains = domain_resolver.fetch_megakino_mirror_domains(timeout=1) assert domains == ["megakino1.org"] assert any("/sitemap.xml" in call for call in calls) -def test_fetch_megakino_domain_dedupes_and_prefers_candidate_order(monkeypatch): +def test_fetch_megakino_domain_prefers_seed_order(monkeypatch): probed: list[str] = [] + 
monkeypatch.setattr(domain_resolver, "MEGAKINO_REDIRECT_SEEDS", ["first.example", "second.example"]) monkeypatch.setattr( domain_resolver, - "MEGAKINO_DOMAIN_CANDIDATES", - ["second.example", "third.example"], - ) - monkeypatch.setattr( - domain_resolver, - "fetch_megakino_mirror_domains", - lambda timeout=0, **kwargs: ["first.example", "second.example"], + "_fetch_github_domain_hint", + lambda timeout=0: "third.example", ) def fake_probe(base_url: str, timeout=0): @@ -118,19 +114,16 @@ def fake_probe(base_url: str, timeout=0): resolved = domain_resolver.fetch_megakino_domain(timeout=1) assert resolved == "second.example" + assert probed == ["first.example", "second.example"] assert probed.count("second.example") == 1 def test_fetch_megakino_domain_returns_none_when_all_candidates_fail(monkeypatch): + monkeypatch.setattr(domain_resolver, "MEGAKINO_REDIRECT_SEEDS", ["first.example", "second.example"]) monkeypatch.setattr( domain_resolver, - "MEGAKINO_DOMAIN_CANDIDATES", - ["first.example", "second.example"], - ) - monkeypatch.setattr( - domain_resolver, - "fetch_megakino_mirror_domains", - lambda timeout=0, **kwargs: [], + "_fetch_github_domain_hint", + lambda timeout=0: None, ) monkeypatch.setattr( domain_resolver, "_probe_megakino_sitemap", lambda *a, **k: None @@ -145,7 +138,7 @@ def fake_get(*args, **kwargs): time.sleep(1.0) raise RuntimeError("late failure") - monkeypatch.setattr(domain_resolver, "http_get", fake_get) + monkeypatch.setattr(domain_resolver.requests, "get", fake_get) started = time.monotonic() try: @@ -162,20 +155,25 @@ def fake_get(*args, **kwargs): assert elapsed < 0.5 -def test_fetch_megakino_domain_disables_mirror_sitemap_fallback(monkeypatch): - seen_kwargs: dict[str, object] = {} - - def fake_mirrors(timeout=0, **kwargs): - seen_kwargs.update(kwargs) - return [] +def test_fetch_megakino_domain_uses_github_hint_after_seed_failures(monkeypatch): + probed: list[str] = [] + monkeypatch.setattr(domain_resolver, "MEGAKINO_REDIRECT_SEEDS", 
["first.example"]) monkeypatch.setattr( domain_resolver, - "MEGAKINO_DOMAIN_CANDIDATES", - [], + "_fetch_github_domain_hint", + lambda timeout=0: "hint.example", ) - monkeypatch.setattr(domain_resolver, "fetch_megakino_mirror_domains", fake_mirrors) + + def fake_probe(base_url: str, timeout=0): + domain = domain_resolver._normalize_domain(base_url) + probed.append(domain) + if domain == "hint.example": + return "hint.example" + return None + + monkeypatch.setattr(domain_resolver, "_probe_megakino_sitemap", fake_probe) resolved = domain_resolver.fetch_megakino_domain(timeout=1) - assert resolved is None - assert seen_kwargs.get("include_sitemap_fallback") is False + assert resolved == "hint.example" + assert probed == ["first.example", "hint.example"] From 8a744a616ecb63bd7701efc599c481a586662b42 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 01:54:41 +0200 Subject: [PATCH 16/45] test(indexer): enhance tests for catalog discovery and crawling behavior - Added tests to verify logging of title crawl counts and heartbeat. - Updated the fake_crawl function to include an observer parameter. - Ensured that the catalog indexer correctly logs the number of titles crawled. 
--- apps/api/tests/unit/catalog/test_indexer.py | 42 ++++++++++++++++++- apps/api/tests/unit/catalog/test_providers.py | 33 ++++++++++++++- 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py index 0817898b..bb04bdb0 100644 --- a/apps/api/tests/unit/catalog/test_indexer.py +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -1,4 +1,5 @@ import time +from datetime import datetime, timezone from types import SimpleNamespace @@ -28,7 +29,8 @@ def test_catalog_discovery_logs_heartbeat(monkeypatch): def fake_info(message: str, *args) -> None: messages.append(message.format(*args)) - def fake_crawl(_provider: str) -> list[object]: + def fake_crawl(_provider: str, *, observer=None) -> list[object]: + assert observer is not None time.sleep(0.03) return [] @@ -44,6 +46,32 @@ def fake_crawl(_provider: str) -> list[object]: assert any("still discovering titles after" in message for message in messages) +def test_catalog_discovery_logs_title_crawl_counts(monkeypatch): + import app.catalog.indexer as indexer_module + from app.catalog.indexer import ProviderCatalogIndexer + + messages: list[str] = [] + + def fake_info(message: str, *args) -> None: + messages.append(message.format(*args)) + + def fake_crawl(_provider: str, *, observer=None) -> list[object]: + assert observer is not None + observer.on_index_loaded(10) + observer.on_title_crawled("slug-1") + time.sleep(0.03) + return [] + + monkeypatch.setattr(indexer_module, "_DISCOVERY_HEARTBEAT_SECONDS", 0.01) + monkeypatch.setattr(indexer_module, "crawl_provider_catalog", fake_crawl) + monkeypatch.setattr(indexer_module.logger, "info", fake_info) + + ProviderCatalogIndexer()._crawl_provider_catalog_with_heartbeat("aniworld.to") + + assert any("loaded title index with 10 titles" in message for message in messages) + assert any("crawling title details 1/10 (10.0%)" in message for message in messages) + + def 
test_catalog_recovers_interrupted_running_state(monkeypatch): import app.catalog.indexer as indexer_module from app.catalog.indexer import ProviderCatalogIndexer @@ -131,3 +159,15 @@ def fake_refresh(provider: str) -> None: indexer.stop() assert called == ["aniworld.to"] + + +def test_is_due_handles_naive_next_refresh_after(): + from app.catalog.indexer import ProviderCatalogIndexer + + status = SimpleNamespace( + status="ready", + latest_success_at=datetime.now(timezone.utc), + next_refresh_after=datetime(2000, 1, 1, 0, 0, 0), + ) + + assert ProviderCatalogIndexer()._is_due(status) is True diff --git a/apps/api/tests/unit/catalog/test_providers.py b/apps/api/tests/unit/catalog/test_providers.py index 4ee5417b..10b8f0d8 100644 --- a/apps/api/tests/unit/catalog/test_providers.py +++ b/apps/api/tests/unit/catalog/test_providers.py @@ -1,6 +1,12 @@ +import time from types import SimpleNamespace -from app.catalog.providers import _parse_aniworld_season_rows, _parse_sto_season_rows +from app.catalog.providers import ( + _fallback_title_record, + _parse_aniworld_season_rows, + _parse_sto_season_rows, + _run_with_timeout, +) def test_parse_aniworld_season_rows_uses_season_html_only(): @@ -70,3 +76,28 @@ def test_parse_sto_season_rows_extracts_episode_links_without_episode_pages(): (2, 2, "/serie/demo/staffel-2/episode-2"), ] assert all(item.languages == [] for item in episodes) + + +def test_run_with_timeout_raises_for_hung_title_crawl(): + def slow() -> object: + time.sleep(0.05) + return object() + + try: + _run_with_timeout(0.01, slow) + assert False, "expected timeout" + except TimeoutError as exc: + assert "title crawl exceeded" in str(exc) + + +def test_fallback_title_record_uses_provider_relative_path(): + record = _fallback_title_record( + provider_key="s.to", + slug="demo-show", + title="Demo Show", + aliases=["Demo Show"], + ) + + assert record.relative_path == "/serie/demo-show" + assert record.title == "Demo Show" + assert record.episodes == [] From 
a45b6876d98801d40ee2eb8dc46d06963cc716fb Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 01:54:52 +0200 Subject: [PATCH 17/45] feat(indexer): add timeout configuration for provider title crawls - Introduced a new environment variable `PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS` to set a hard timeout for title crawls. - Updated the catalog crawling logic to utilize the new timeout setting. - Enhanced logging to provide better insights during the crawling process. --- apps/api/.env.example | 3 + apps/api/app/catalog/indexer.py | 90 ++++++++++++++--- apps/api/app/catalog/providers.py | 162 ++++++++++++++++++++++++++++-- apps/api/app/config.py | 11 ++ apps/api/app/db/models.py | 5 - 5 files changed, 244 insertions(+), 27 deletions(-) diff --git a/apps/api/.env.example b/apps/api/.env.example index 3ea36d86..3f914811 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -106,6 +106,9 @@ PROVIDER_INDEX_CONCURRENCY_STO=1 # What: Per-provider crawl worker count for megakino title refreshes # Default: 1 PROVIDER_INDEX_CONCURRENCY_MEGAKINO=1 +# What: Hard timeout in seconds for one provider title crawl +# Default: 45 +PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS=45 # What: Domain check interval in minutes (0 disables background checks) # Default: 100 diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index cb687126..3dfe4541 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -10,7 +10,7 @@ from sqlmodel import Session from app.catalog.exceptions import CatalogNotReadyError -from app.catalog.providers import crawl_provider_catalog +from app.catalog.providers import CatalogCrawlObserver, crawl_provider_catalog from app.config import ( ANIBRIDGE_TEST_MODE, CATALOG_SITES_LIST, @@ -20,6 +20,7 @@ PROVIDER_INDEX_SCHEDULER_POLL_SECONDS, ) from app.db import ( + as_aware_utc, delete_provider_generation, engine, get_provider_index_status, @@ -367,7 +368,7 @@ def _is_due(self, status) -> bool: return True if 
status.next_refresh_after is None: return True - return status.next_refresh_after <= utcnow() + return as_aware_utc(status.next_refresh_after) <= utcnow() def _refresh_provider(self, provider: str) -> None: refresh_interval_hours = float( @@ -415,12 +416,12 @@ def _refresh_provider(self, provider: str) -> None: ProgressSnapshot( downloaded=0, total=len(titles), - status="indexing_titles", + status="persisting_titles", ) ) self._set_progress( provider, - phase="indexing_titles", + phase="persisting_titles", total_titles=len(titles), processed_titles=0, current_slug="", @@ -531,7 +532,7 @@ def _refresh_provider(self, provider: str) -> None: ProgressSnapshot( downloaded=min(len(titles), processed_titles), total=len(titles), - status="indexing_titles", + status="persisting_titles", ) ) completed_at = utcnow() @@ -598,18 +599,68 @@ def _refresh_provider(self, provider: str) -> None: def _crawl_provider_catalog_with_heartbeat(self, provider: str) -> list[object]: elapsed_seconds = 0.0 + observer = CatalogCrawlObserver( + on_index_loaded=lambda total: self._on_title_index_loaded(provider, total), + on_title_started=lambda slug: self._on_title_started(provider, slug), + on_title_crawled=lambda slug: self._on_title_crawled(provider, slug), + ) with ThreadPoolExecutor(max_workers=1) as executor: - future = executor.submit(crawl_provider_catalog, provider) + future = executor.submit( + crawl_provider_catalog, + provider, + observer=observer, + ) while True: try: return future.result(timeout=_DISCOVERY_HEARTBEAT_SECONDS) except FutureTimeoutError: elapsed_seconds += _DISCOVERY_HEARTBEAT_SECONDS - logger.info( - "Provider catalog {}: still discovering titles after {}s", - provider, - int(elapsed_seconds), - ) + processed = self._get_processed_titles(provider) + total = self._get_total_titles(provider) + current_slug = self._get_current_slug(provider) + if total is not None: + percent = 100.0 if total <= 0 else round(processed / total * 100.0, 1) + logger.info( + "Provider 
catalog {}: crawling title details {}/{} ({}%) after {}s current={}", + provider, + processed, + total, + percent, + int(elapsed_seconds), + current_slug or "-", + ) + else: + logger.info( + "Provider catalog {}: still discovering titles after {}s current={}", + provider, + int(elapsed_seconds), + current_slug or "-", + ) + + def _on_title_index_loaded(self, provider: str, total_titles: int) -> None: + self._set_progress( + provider, + phase="crawling_titles", + total_titles=total_titles, + processed_titles=0, + current_slug="", + reset_log_step=True, + ) + logger.info( + "Provider catalog {}: loaded title index with {} titles", + provider, + total_titles, + ) + + def _on_title_started(self, provider: str, slug: str) -> None: + self._set_progress( + provider, + phase="crawling_titles", + current_slug=slug, + ) + + def _on_title_crawled(self, provider: str, slug: str) -> None: + self._advance_progress(provider, current_slug=slug) def _is_bootstrap_ready(self) -> bool: with Session(engine) as session: @@ -696,8 +747,9 @@ def _advance_progress(self, provider: str, *, current_slug: str) -> None: return snapshot.last_logged_step = current_step logger.info( - "Provider catalog {} progress: {}/{} ({}%) current={}", + "Provider catalog {} progress [{}]: {}/{} ({}%) current={}", provider, + snapshot.phase, snapshot.processed_titles, total, percent, @@ -710,3 +762,17 @@ def _get_processed_titles(self, provider: str) -> int: if snapshot is None: return 0 return snapshot.processed_titles + + def _get_total_titles(self, provider: str) -> int | None: + with self._progress_lock: + snapshot = self._progress.get(provider) + if snapshot is None: + return None + return snapshot.total_titles + + def _get_current_slug(self, provider: str) -> str: + with self._progress_lock: + snapshot = self._progress.get(provider) + if snapshot is None: + return "" + return snapshot.current_slug diff --git a/apps/api/app/catalog/providers.py b/apps/api/app/catalog/providers.py index 2b28a949..9788861a 
100644 --- a/apps/api/app/catalog/providers.py +++ b/apps/api/app/catalog/providers.py @@ -3,15 +3,16 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field from difflib import SequenceMatcher +import threading import re -from typing import Any, Optional +from typing import Any, Callable, Optional from urllib.parse import urlparse from bs4 import BeautifulSoup # type: ignore from loguru import logger from app.catalog.metadata import resolve_tv_canonical_match -from app.config import CATALOG_SITE_CONFIGS +from app.config import CATALOG_SITE_CONFIGS, PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS from app.db import normalize_catalog_text from app.providers import get_provider from app.providers.megakino.client import ( @@ -59,6 +60,17 @@ class TitleRecord: canonical: CanonicalPayload = field(default_factory=CanonicalPayload) +@dataclass(slots=True) +class CatalogCrawlObserver: + on_index_loaded: Callable[[int], None] | None = None + on_title_started: Callable[[str], None] | None = None + on_title_crawled: Callable[[str], None] | None = None + + +class ProviderTitleCrawlTimeoutError(TimeoutError): + pass + + def _relative_path(url: str) -> str: parsed = urlparse(url) path = parsed.path or "/" @@ -67,6 +79,56 @@ def _relative_path(url: str) -> str: return path +def _run_with_timeout( + timeout_seconds: float, + fn: Callable[..., TitleRecord], + /, + *args: Any, + **kwargs: Any, +) -> TitleRecord: + result: dict[str, TitleRecord] = {} + err: dict[str, BaseException] = {} + + def _target() -> None: + try: + result["value"] = fn(*args, **kwargs) + except BaseException as exc: + err["exc"] = exc + + thread = threading.Thread(target=_target, name="provider-title-crawl", daemon=True) + thread.start() + thread.join(float(timeout_seconds)) + if thread.is_alive(): + raise ProviderTitleCrawlTimeoutError( + f"title crawl exceeded {int(timeout_seconds)}s" + ) + if "exc" in err: + raise err["exc"] + return result["value"] + + +def 
_fallback_title_record( + *, + provider_key: str, + slug: str, + title: str, + aliases: list[str], +) -> TitleRecord: + relative_root = ( + f"/anime/stream/{slug}" if provider_key == "aniworld.to" else f"/serie/{slug}" + ) + return TitleRecord( + provider=provider_key, + slug=slug, + title=title, + aliases=aliases, + media_type_hint="series", + relative_path=relative_root, + episodes=[], + canonical=CanonicalPayload(), + ) + + def _normalize_provider_data(raw: Any, *, site: str) -> list[EpisodeLanguageRecord]: if not isinstance(raw, dict): return [] @@ -447,6 +509,27 @@ def _crawl_aniworld_like_title( ) +def _crawl_title_job( + *, + provider_key: str, + slug: str, + title: str, + aliases: list[str], + observer: CatalogCrawlObserver | None, + title_timeout_seconds: float, +) -> TitleRecord: + if observer is not None and observer.on_title_started is not None: + observer.on_title_started(slug) + return _run_with_timeout( + title_timeout_seconds, + _crawl_aniworld_like_title, + provider_key=provider_key, + slug=slug, + title=title, + aliases=aliases, + ) + + def _parse_megakino_page_metadata(url: str) -> tuple[str | None, int | None]: base_url = get_megakino_base_url().rstrip("/") response = http_get(url, timeout=20, headers={"Referer": base_url}) @@ -465,7 +548,11 @@ def _parse_megakino_page_metadata(url: str) -> tuple[str | None, int | None]: return title, year -def crawl_provider_catalog(provider_key: str) -> list[TitleRecord]: +def crawl_provider_catalog( + provider_key: str, + *, + observer: CatalogCrawlObserver | None = None, +) -> list[TitleRecord]: provider = get_provider(provider_key) if provider is None: return [] @@ -473,6 +560,8 @@ def crawl_provider_catalog(provider_key: str) -> list[TitleRecord]: if provider_key == "megakino": client = get_default_megakino_client() entries = client.load_index() + if observer is not None and observer.on_index_loaded is not None: + observer.on_index_loaded(len(entries)) titles: list[TitleRecord] = [] for entry in 
entries.values(): parsed_title = entry.slug.replace("-", " ").title() @@ -496,28 +585,81 @@ def crawl_provider_catalog(provider_key: str) -> list[TitleRecord]: canonical=CanonicalPayload(), ) ) + if observer is not None and observer.on_title_crawled is not None: + observer.on_title_crawled(entry.slug) return titles + logger.info("Provider catalog {}: loading title index", provider_key) index = provider.load_or_refresh_index() + logger.info("Provider catalog {}: title index loaded ({} titles)", provider_key, len(index)) + logger.info("Provider catalog {}: loading title alternatives", provider_key) alternatives = provider.load_or_refresh_alternatives() + logger.info( + "Provider catalog {}: title alternatives loaded ({} titles with aliases)", + provider_key, + len(alternatives), + ) + if observer is not None and observer.on_index_loaded is not None: + observer.on_index_loaded(len(index)) workers = int( CATALOG_SITE_CONFIGS[provider_key].get("provider_index_concurrency", 1) ) - futures = [] + title_timeout_seconds = float( + CATALOG_SITE_CONFIGS[provider_key].get( + "provider_index_title_timeout_seconds", + PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, + ) + ) + futures: dict[object, tuple[str, str, list[str]]] = {} results: list[TitleRecord] = [] with ThreadPoolExecutor(max_workers=max(1, workers)) as executor: for slug, title in index.items(): aliases = list(dict.fromkeys(alternatives.get(slug, []) or [title])) - futures.append( - executor.submit( - _crawl_aniworld_like_title, + future = executor.submit( + _crawl_title_job, + provider_key=provider_key, + slug=slug, + title=title, + aliases=aliases, + observer=observer, + title_timeout_seconds=title_timeout_seconds, + ) + futures[future] = (slug, title, aliases) + for future in as_completed(futures): + slug, title, aliases = futures[future] + try: + record = future.result() + except ProviderTitleCrawlTimeoutError as exc: + logger.warning( + "Provider catalog {}: title crawl timed out after {}s slug={} title={}: {}", + 
provider_key, + int(title_timeout_seconds), + slug, + title, + exc, + ) + record = _fallback_title_record( provider_key=provider_key, slug=slug, title=title, aliases=aliases, ) - ) - for future in as_completed(futures): - results.append(future.result()) + except Exception as exc: + logger.warning( + "Provider catalog {}: title crawl failed slug={} title={}: {}", + provider_key, + slug, + title, + exc, + ) + record = _fallback_title_record( + provider_key=provider_key, + slug=slug, + title=title, + aliases=aliases, + ) + results.append(record) + if observer is not None and observer.on_title_crawled is not None: + observer.on_title_crawled(record.slug) results.sort(key=lambda item: item.slug) return results diff --git a/apps/api/app/config.py b/apps/api/app/config.py index 4b000fce..d0c40c6c 100644 --- a/apps/api/app/config.py +++ b/apps/api/app/config.py @@ -285,6 +285,10 @@ def _ensure_runtime_home() -> Path: ) if PROVIDER_INDEX_CONCURRENCY_MEGAKINO < 1: PROVIDER_INDEX_CONCURRENCY_MEGAKINO = 1 +PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS = max( + 5, + _as_non_negative_int(os.getenv("PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS"), 45), +) logger.debug( f"ANIWORLD_ALPHABET_HTML={ANIWORLD_ALPHABET_HTML}, ANIWORLD_ALPHABET_URL={ANIWORLD_ALPHABET_URL}" @@ -310,6 +314,10 @@ def _ensure_runtime_home() -> Path: PROVIDER_INDEX_CONCURRENCY_STO, PROVIDER_INDEX_CONCURRENCY_MEGAKINO, ) +logger.debug( + "Provider index title timeout: {}s", + PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, +) # TTL (Stunden) für Live-Index; 0 = nie neu laden (nur einmal pro Prozess) ANIWORLD_TITLES_REFRESH_HOURS = float(os.getenv("ANIWORLD_TITLES_REFRESH_HOURS", "24")) @@ -339,6 +347,7 @@ def _ensure_runtime_home() -> Path: "titles_refresh_hours": ANIWORLD_TITLES_REFRESH_HOURS, "provider_index_refresh_hours": PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD, "provider_index_concurrency": PROVIDER_INDEX_CONCURRENCY_ANIWORLD, + "provider_index_title_timeout_seconds": PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, "default_languages": ["German 
Dub", "German Sub", "English Sub"], "release_group": RELEASE_GROUP_ANIWORLD, }, @@ -349,6 +358,7 @@ def _ensure_runtime_home() -> Path: "titles_refresh_hours": STO_TITLES_REFRESH_HOURS, "provider_index_refresh_hours": PROVIDER_INDEX_REFRESH_HOURS_STO, "provider_index_concurrency": PROVIDER_INDEX_CONCURRENCY_STO, + "provider_index_title_timeout_seconds": PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, "default_languages": ["German Dub", "English Dub"], "release_group": RELEASE_GROUP_STO, }, @@ -359,6 +369,7 @@ def _ensure_runtime_home() -> Path: "titles_refresh_hours": MEGAKINO_TITLES_REFRESH_HOURS, "provider_index_refresh_hours": PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO, "provider_index_concurrency": PROVIDER_INDEX_CONCURRENCY_MEGAKINO, + "provider_index_title_timeout_seconds": PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, "default_languages": ["Deutsch", "German Dub"], "release_group": "megakino", }, diff --git a/apps/api/app/db/models.py b/apps/api/app/db/models.py index 172ce80a..31c85d73 100644 --- a/apps/api/app/db/models.py +++ b/apps/api/app/db/models.py @@ -37,19 +37,14 @@ # ---- Datetime Helpers def utcnow() -> datetime: - logger.trace("utcnow() called.") return datetime.now(timezone.utc) def as_aware_utc(dt: Optional[datetime]) -> datetime: - logger.debug(f"as_aware_utc() called with dt={dt}") if dt is None: - logger.debug("Datetime is None, returning utcnow().") return utcnow() if dt.tzinfo is None: - logger.debug("Datetime is naive, setting tzinfo to UTC.") return dt.replace(tzinfo=timezone.utc) - logger.debug("Datetime is aware, converting to UTC.") return dt.astimezone(timezone.utc) From 7b605d869c89ba180c8da422e9e81551256207dc Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 02:08:28 +0200 Subject: [PATCH 18/45] feat(caching): implement caching for SkyHook search and show retrieval Added caching mechanisms for search results and show details to improve performance and reduce redundant API calls. 
Introduced functions to get and set cached data for both searches and shows, utilizing a thread-safe lock for concurrent access. --- apps/api/app/catalog/metadata.py | 105 ++++++++++++++++++++------ apps/api/app/db/models.py | 11 ++- apps/api/tests/unit/db/test_models.py | 35 +++++++++ 3 files changed, 127 insertions(+), 24 deletions(-) diff --git a/apps/api/app/catalog/metadata.py b/apps/api/app/catalog/metadata.py index 76585f27..2063570d 100644 --- a/apps/api/app/catalog/metadata.py +++ b/apps/api/app/catalog/metadata.py @@ -4,6 +4,8 @@ from difflib import SequenceMatcher from typing import Any, Optional from urllib.parse import urlencode +import threading +import time from loguru import logger @@ -12,6 +14,12 @@ SKYHOOK_SEARCH_URL = "https://skyhook.sonarr.tv/v1/tvdb/search/en/" SKYHOOK_SHOW_URL = "https://skyhook.sonarr.tv/v1/tvdb/shows/en/{tvdb_id}" +SKYHOOK_TIMEOUT_SECONDS = 4.0 +SKYHOOK_CACHE_TTL_SECONDS = 3600.0 + +_cache_lock = threading.Lock() +_search_cache: dict[str, tuple[float, list[dict[str, Any]]]] = {} +_show_cache: dict[int, tuple[float, dict[str, Any]]] = {} @dataclass(slots=True) @@ -24,6 +32,42 @@ class TvCanonicalMatch: payload: dict[str, Any] +def _cache_get_search(term: str) -> list[dict[str, Any]] | None: + now = time.time() + with _cache_lock: + entry = _search_cache.get(term) + if entry is None: + return None + cached_at, payload = entry + if now - cached_at > SKYHOOK_CACHE_TTL_SECONDS: + _search_cache.pop(term, None) + return None + return [dict(item) for item in payload] + + +def _cache_set_search(term: str, payload: list[dict[str, Any]]) -> None: + with _cache_lock: + _search_cache[term] = (time.time(), [dict(item) for item in payload]) + + +def _cache_get_show(tvdb_id: int) -> dict[str, Any] | None: + now = time.time() + with _cache_lock: + entry = _show_cache.get(tvdb_id) + if entry is None: + return None + cached_at, payload = entry + if now - cached_at > SKYHOOK_CACHE_TTL_SECONDS: + _show_cache.pop(tvdb_id, None) + return None + 
return dict(payload) + + +def _cache_set_show(tvdb_id: int, payload: dict[str, Any]) -> None: + with _cache_lock: + _show_cache[tvdb_id] = (time.time(), dict(payload)) + + def _score_title(query: str, candidate: str) -> float: left = normalize_catalog_text(query) right = normalize_catalog_text(candidate) @@ -76,21 +120,31 @@ def resolve_tv_canonical_match( imdb_id=imdb_id, tmdb_id=tmdb_id, ): + payload = _cache_get_search(term) + if payload is None: + try: + query = urlencode({"term": term}) + response = http_get( + f"{SKYHOOK_SEARCH_URL}?{query}", + timeout=SKYHOOK_TIMEOUT_SECONDS, + ) + response.raise_for_status() + raw_payload = response.json() + except Exception as exc: + logger.debug("SkyHook search failed for '{}': {}", term, exc) + continue + if not isinstance(raw_payload, list): + continue + payload = [item for item in raw_payload if isinstance(item, dict)] + _cache_set_search(term, payload) try: - query = urlencode({"term": term}) - response = http_get(f"{SKYHOOK_SEARCH_URL}?{query}", timeout=8.0) - response.raise_for_status() - payload = response.json() - except Exception as exc: - logger.debug("SkyHook search failed for '{}': {}", term, exc) + for item in payload: + copied = dict(item) + copied["_ab_source"] = source + copied["_ab_term"] = term + candidates.append(copied) + except Exception: continue - if not isinstance(payload, list): - continue - for item in payload: - if isinstance(item, dict): - item["_ab_source"] = source - item["_ab_term"] = term - candidates.append(item) best_match: Optional[tuple[float, dict[str, Any]]] = None for item in candidates: @@ -112,15 +166,22 @@ def resolve_tv_canonical_match( score, item = best_match tvdb_id = int(item["tvdbId"]) - try: - response = http_get(SKYHOOK_SHOW_URL.format(tvdb_id=tvdb_id), timeout=8.0) - response.raise_for_status() - payload = response.json() - except Exception as exc: - logger.debug("SkyHook show fetch failed for tvdb {}: {}", tvdb_id, exc) - return None - if not isinstance(payload, dict): - 
return None + payload = _cache_get_show(tvdb_id) + if payload is None: + try: + response = http_get( + SKYHOOK_SHOW_URL.format(tvdb_id=tvdb_id), + timeout=SKYHOOK_TIMEOUT_SECONDS, + ) + response.raise_for_status() + raw_payload = response.json() + except Exception as exc: + logger.debug("SkyHook show fetch failed for tvdb {}: {}", tvdb_id, exc) + return None + if not isinstance(raw_payload, dict): + return None + payload = raw_payload + _cache_set_show(tvdb_id, payload) if score >= 0.99: confidence = "confirmed" diff --git a/apps/api/app/db/models.py b/apps/api/app/db/models.py index 31c85d73..3e7b8ff2 100644 --- a/apps/api/app/db/models.py +++ b/apps/api/app/db/models.py @@ -1156,15 +1156,22 @@ def replace_canonical_episodes( session.exec( CanonicalEpisode.__table__.delete().where(CanonicalEpisode.tvdb_id == tvdb_id) ) + deduped: dict[tuple[int, int], str] = {} for episode in episodes: title = str(episode.get("title") or "").strip() if not title: continue + season = int(episode["season"]) + number = int(episode["episode"]) + key = (season, number) + if key not in deduped: + deduped[key] = title + for (season, number), title in sorted(deduped.items()): session.add( CanonicalEpisode( tvdb_id=tvdb_id, - season=int(episode["season"]), - episode=int(episode["episode"]), + season=season, + episode=number, title=title, normalized_title=normalize_catalog_text(title), last_synced_at=utcnow(), diff --git a/apps/api/tests/unit/db/test_models.py b/apps/api/tests/unit/db/test_models.py index 0b4a491e..2e0d136a 100644 --- a/apps/api/tests/unit/db/test_models.py +++ b/apps/api/tests/unit/db/test_models.py @@ -140,3 +140,38 @@ def test_availability_and_clienttask_crud(client): assert get_client_task(s, "abc") delete_client_task(s, "abc") assert get_client_task(s, "abc") is None + + +def test_replace_canonical_episodes_dedupes_duplicate_numbers(client): + from sqlmodel import Session, select + from app.db import ( + CanonicalEpisode, + engine, + replace_canonical_episodes, + 
upsert_canonical_series, + ) + + with Session(engine) as s: + upsert_canonical_series( + s, + tvdb_id=12345, + title="Demo Show", + aliases=["Demo Show"], + ) + replace_canonical_episodes( + s, + tvdb_id=12345, + episodes=[ + {"season": 1, "episode": 1, "title": "Pilot"}, + {"season": 1, "episode": 1, "title": "Pilot Duplicate"}, + {"season": 1, "episode": 2, "title": "Second"}, + ], + ) + s.commit() + + rows = s.exec( + select(CanonicalEpisode).where(CanonicalEpisode.tvdb_id == 12345) + ).all() + + assert len(rows) == 2 + assert {(row.season, row.episode) for row in rows} == {(1, 1), (1, 2)} From eb10ebe7239a79cd7c78e679b5d013941dee9561 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 14:17:33 +0200 Subject: [PATCH 19/45] docs(specs): Streaming Persistence and Memory Bounds --- ...streaming-persistence-and-memory-bounds.md | 273 ++++++++++++++++++ .../017-provider-catalog-index/README.md | 1 + 2 files changed, 274 insertions(+) create mode 100644 internal/specs/017-provider-catalog-index/02-streaming-persistence-and-memory-bounds.md diff --git a/internal/specs/017-provider-catalog-index/02-streaming-persistence-and-memory-bounds.md b/internal/specs/017-provider-catalog-index/02-streaming-persistence-and-memory-bounds.md new file mode 100644 index 00000000..5082abf1 --- /dev/null +++ b/internal/specs/017-provider-catalog-index/02-streaming-persistence-and-memory-bounds.md @@ -0,0 +1,273 @@ +# 017 Provider Catalog Index - Streaming Persistence and Memory Bounds + +## Goal + +Redesign provider bootstrap and refresh execution so AniBridge can index large providers without unbounded memory growth, long silent stalls, or container restarts under realistic first-run load. + +The preferred solution is to stream crawled provider results into SQLite continuously instead of buffering a full provider crawl in memory and only persisting at the end. 
+ +## Problem Statement + +The current provider indexing architecture is correct in spirit but too memory-heavy in practice for large catalogs such as `s.to`. + +Today the refresh flow for AniWorld and `s.to` is effectively: + +1. load the provider title index +2. crawl many titles in parallel +3. accumulate all crawled `TitleRecord` results in memory +4. persist the full provider result set only after crawling is complete +5. mark the refreshed generation as current + +This creates a large in-memory buffer whose size grows with: + +- total title count +- total episode count across those titles +- alias and language metadata +- canonical mapping payloads and intermediate metadata +- number of in-flight worker results + +Lowering concurrency only slows the rate of growth. It does not remove the underlying architectural pressure. + +## Observed Failure Mode + +During first bootstrap runs, memory usage can climb steadily into multiple gigabytes before the provider refresh finishes. + +Typical characteristics: + +- memory rises roughly with crawl progress instead of staying bounded +- the process may appear healthy for a long time and then die abruptly +- large providers such as `s.to` are the worst case because they have many titles and many episodes per title +- progress visibility degrades because the system is still "working" while holding more and more buffered state + +This behavior is more consistent with whole-provider result buffering than with a classic steady-state memory leak. + +## Root Cause + +The main issue is architectural buffering. + +AniBridge currently keeps too much expanded provider state alive at once: + +- full crawled provider results +- per-title episode lists +- canonical enrichment results +- temporarily timed-out worker state that may still be finishing in background threads + +The memory profile therefore scales with provider size instead of with a small bounded working set. 
+ +## Proposed Direction + +Replace the current "crawl everything, then persist everything" flow with a streaming provider pipeline: + +1. discover provider titles +2. crawl titles in parallel +3. emit each completed title result into a bounded queue +4. persist queued results continuously into a staging generation +5. keep serving the previous successful generation during the refresh +6. flip the provider's current successful generation only after the full staged refresh completes successfully + +This preserves refresh consistency while bounding memory growth. + +## Required Architecture Changes + +### Provider crawl contract + +Provider crawl code should stop returning one large `list[TitleRecord]` for the full provider refresh. + +Instead, each provider crawler should expose a streaming-oriented contract that: + +- yields or submits one completed title result at a time +- reports title discovery totals as soon as they are known +- reports per-title progress as results are completed +- allows the orchestrator to apply backpressure when the persistence side falls behind + +The contract may be implemented as: + +- a generator that yields title results +- a callback-based emitter +- a worker pool that pushes into a queue owned by the indexer + +The specific API shape is less important than the bounded-memory behavior. + +### Dedicated writer path + +Use one dedicated persistence path per provider refresh. 
+ +Recommended structure: + +- N crawler workers per provider +- 1 dedicated DB writer thread or task per provider refresh +- 1 bounded in-memory queue between crawl workers and the writer + +The writer is responsible for: + +- inserting or replacing provider title rows for the staging generation +- inserting aliases, episodes, language availability, and mappings for the staging generation +- updating per-provider progress counters +- committing in small batches + +SQLite write contention should remain controlled by avoiding many concurrent writers for the same provider refresh. + +### Staging generation semantics + +Do not publish partially refreshed provider state to the request path. + +Required behavior: + +- write new rows into a staging generation while the previous successful generation remains active +- keep `latest_success_generation` unchanged until the staged refresh fully succeeds +- if the refresh fails midway, keep serving the previous successful generation +- if no successful generation exists yet, keep bootstrap gating behavior intact + +This is the key consistency rule that allows streaming persistence without exposing partial catalogs as complete. + +## Queue and Backpressure Requirements + +The queue between crawlers and the writer must be bounded. + +Required properties: + +- configurable maximum queue size +- producer backpressure when the queue is full +- clear logs when crawlers are blocked on persistence backpressure +- no unbounded fallback list or hidden in-memory spillover + +The queue size should be chosen so that: + +- the writer has enough buffered work to stay busy +- memory remains predictably bounded even for the largest provider + +The implementation should prefer slowing crawl throughput over allowing queue growth beyond the configured bound. + +## Persistence Semantics + +Per-title persistence should happen as soon as the title result is available. 
+ +Required behavior: + +- persist each title independently into the staging generation +- commit frequently enough that long refreshes make visible forward progress +- make restarts and crash recovery able to resume or restart from a known staging state + +Implementation guidance: + +- use small batched transactions rather than one transaction per episode row +- clean up abandoned staging generations on the next startup or refresh attempt +- keep provider status and cursor state aligned with what has already been durably written + +## Failure and Recovery Semantics + +### Title-level failures + +One bad title must not stall or invalidate the whole provider refresh by default. + +Required behavior: + +- timed-out or failed titles should be logged with provider, slug, and reason +- the refresh should continue unless the failure rate crosses a configured threshold +- skipped titles should remain absent from the new generation unless explicitly retried successfully later + +### Refresh-level failures + +If the full provider refresh cannot complete: + +- do not promote the staging generation +- do not delete the currently served successful generation +- persist a clear provider-level error summary +- make the next run able to clean up or reuse stale staging rows safely + +### Restart recovery + +On startup, AniBridge should detect interrupted staging refreshes and handle them explicitly. + +Required behavior: + +- log that an interrupted staging generation was found +- mark the prior run as interrupted +- either delete the abandoned staging generation or restart it from a supported checkpoint + +Version one may choose cleanup-and-restart over true mid-provider resume if that is simpler and more reliable. + +## Progress and Observability + +The new design must improve visibility, not reduce it. 
+ +Required progress signals: + +- title discovery started +- title discovery completed with total count +- crawl progress as `completed/total` and percent when total is known +- queue depth and writer lag +- persistence progress as `persisted/total` +- generation promotion success +- explicit staging cleanup or abandonment messages + +The health surface should remain able to report: + +- provider phase +- processed titles +- total titles when known +- current slug or recently active slug +- last error summary +- whether the provider is serving an older successful generation while a new one is building + +## Memory and Performance Requirements + +The redesigned flow must keep memory bounded primarily by: + +- worker concurrency +- queue size +- one-title working set +- writer batch size + +It must not scale memory roughly linearly with full provider size. + +Additional recommendations: + +- drop intermediate canonical payloads as soon as they have been normalized and written +- avoid keeping large per-title objects alive after queue submission +- avoid background timeout wrappers that leave many unreachable or still-running worker threads alive for long periods + +Concurrency should remain configurable, but the implementation must not depend on low concurrency to stay within safe memory limits. + +## Implementation Guidance + +### Suggested rollout order + +1. introduce staging-generation streaming persistence behind the existing provider status model +2. convert AniWorld and `s.to` from full-list return values to streaming title emission +3. keep Megakino aligned with the same orchestration contract where practical +4. add queue depth, writer lag, and staging-generation logging +5. 
tighten cleanup of abandoned staging generations and interrupted runs + +### Acceptable simplifications + +The first implementation does not need: + +- fully parallel SQLite writers for the same provider +- cross-provider shared queue infrastructure +- perfect mid-run resume at arbitrary title boundaries + +What matters first is: + +- bounded memory +- continuous persistence +- atomic generation promotion +- clear recovery behavior + +## Non-Goals + +- replacing SQLite +- exposing partial staging rows to the request path as if they were ready +- maximizing raw crawl speed at the expense of stability +- introducing a complicated distributed job system + +## Selected Decisions + +- streaming provider results into SQLite is the selected solution +- a bounded queue with backpressure is required +- one dedicated writer path per provider refresh is preferred over many parallel SQLite writers +- staging generations must remain invisible to the request path until refresh success +- old successful generations must remain served during replacement refreshes +- cleanup-and-restart is acceptable for interrupted staging generations in version one +- lowering crawl concurrency is not considered a real fix for the memory issue, only a temporary mitigation +- progress reporting must include real counts and writer-state visibility, not only long-running heartbeat messages diff --git a/internal/specs/017-provider-catalog-index/README.md b/internal/specs/017-provider-catalog-index/README.md index 771ce377..d1a632bc 100644 --- a/internal/specs/017-provider-catalog-index/README.md +++ b/internal/specs/017-provider-catalog-index/README.md @@ -2,3 +2,4 @@ - `00-scheduled-provider-indexing.md` - scheduled full-catalog indexing and refresh strategy - `01-normalization-and-id-mapping.md` - canonical ID mapping and provider-to-Sonarr/Radarr normalization strategy +- `02-streaming-persistence-and-memory-bounds.md` - streaming provider persistence, staging generations, and bounded-memory 
bootstrap strategy From c42d543c37e7c4e738add209791a50814fd4daac Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 18:09:13 +0200 Subject: [PATCH 20/45] perf(catalog): stream provider index persistence through staged writes Replace whole-provider buffering with a bounded crawl-to-writer pipeline. Add queue backpressure, batch SQLite persistence, interrupted staging cleanup, richer health progress, and generation-aware provider mappings so partial refresh state never becomes live. --- apps/api/.env.example | 15 + apps/api/app/catalog/indexer.py | 953 +++++++++++++----- apps/api/app/catalog/providers.py | 339 ++++--- apps/api/app/config.py | 52 + ...60429_0005_provider_mapping_generations.py | 364 +++++++ apps/api/app/db/models.py | 123 ++- .../tests/integration/api/torznab/test_api.py | 2 + .../api/torznab/test_indexed_catalog.py | 2 + .../api/torznab/test_specials_mapping.py | 2 + apps/api/tests/unit/catalog/test_indexer.py | 116 ++- internal/agents/api.md | 4 +- internal/agents/architecture.md | 8 +- internal/agents/configuration.md | 123 ++- 13 files changed, 1592 insertions(+), 511 deletions(-) create mode 100644 apps/api/app/db/migrations/versions/20260429_0005_provider_mapping_generations.py diff --git a/apps/api/.env.example b/apps/api/.env.example index 3f914811..0d8d66a8 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -109,6 +109,21 @@ PROVIDER_INDEX_CONCURRENCY_MEGAKINO=1 # What: Hard timeout in seconds for one provider title crawl # Default: 45 PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS=45 +# What: Maximum number of completed title payloads allowed to wait in memory for DB persistence +# Default: 32 +PROVIDER_INDEX_QUEUE_SIZE=32 +# What: Max number of title payloads committed per SQLite writer batch +# Default: 8 +PROVIDER_INDEX_WRITER_BATCH_SIZE=8 +# What: Max seconds the writer may hold a partial batch before forcing a commit +# Default: 1.0 +PROVIDER_INDEX_WRITER_FLUSH_SECONDS=1.0 +# What: Abort a refresh when failed title 
crawls exceed this percentage of the discovered title set +# Default: 20 +PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT=20 +# What: Minimum seconds between repeated queue-backpressure log lines while crawlers are blocked on persistence +# Default: 15 +PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS=15 # What: Domain check interval in minutes (0 disables background checks) # Default: 100 diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index 3dfe4541..6c57d2d6 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -1,23 +1,33 @@ from __future__ import annotations -from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError -import threading from dataclasses import dataclass from datetime import timedelta +from queue import Empty, Full, Queue +from threading import Event, Lock, Semaphore, Thread +from time import monotonic from uuid import uuid4 from loguru import logger from sqlmodel import Session from app.catalog.exceptions import CatalogNotReadyError -from app.catalog.providers import CatalogCrawlObserver, crawl_provider_catalog +from app.catalog.providers import ( + CatalogCrawlObserver, + TitleRecord, + stream_provider_catalog, +) from app.config import ( ANIBRIDGE_TEST_MODE, CATALOG_SITES_LIST, CATALOG_SITE_CONFIGS, PROGRESS_STEP_PERCENT, + PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS, + PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT, PROVIDER_INDEX_GLOBAL_CONCURRENCY, + PROVIDER_INDEX_QUEUE_SIZE, PROVIDER_INDEX_SCHEDULER_POLL_SECONDS, + PROVIDER_INDEX_WRITER_BATCH_SIZE, + PROVIDER_INDEX_WRITER_FLUSH_SECONDS, ) from app.db import ( as_aware_utc, @@ -42,19 +52,31 @@ from app.utils.terminal import ProgressReporter, ProgressSnapshot _INDEXER: "ProviderCatalogIndexer | None" = None -_INDEXER_LOCK = threading.Lock() +_INDEXER_LOCK = Lock() _UNSET = object() -_DISCOVERY_HEARTBEAT_SECONDS = 15.0 +_QUEUE_SENTINEL = object() @dataclass(slots=True) class ProviderCatalogProgress: provider: str phase: str = 
"pending" - processed_titles: int = 0 + crawled_titles: int = 0 + persisted_titles: int = 0 + failed_titles: int = 0 total_titles: int | None = None current_slug: str = "" - last_logged_step: int = -1 + queue_depth: int = 0 + last_logged_crawl_step: int = -1 + last_logged_persist_step: int = -1 + + @property + def processed_titles(self) -> int: + return self.persisted_titles + + @property + def writer_lag_titles(self) -> int: + return max(0, self.crawled_titles - self.persisted_titles) @property def progress_percent(self) -> float | None: @@ -63,10 +85,19 @@ def progress_percent(self) -> float | None: if self.total_titles <= 0: return 100.0 return round( - max(0.0, min(100.0, self.processed_titles / self.total_titles * 100.0)), + max(0.0, min(100.0, self.persisted_titles / self.total_titles * 100.0)), 1, ) + @property + def crawl_percent(self) -> float | None: + if not self.total_titles: + return None + if self.total_titles <= 0: + return 100.0 + completed = self.crawled_titles + self.failed_titles + return round(max(0.0, min(100.0, completed / self.total_titles * 100.0)), 1) + def get_catalog_indexer() -> "ProviderCatalogIndexer": global _INDEXER @@ -118,13 +149,13 @@ def require_catalog_ready() -> None: class ProviderCatalogIndexer: def __init__(self) -> None: - self._stop_event = threading.Event() - self._thread: threading.Thread | None = None - self._active = threading.Semaphore(PROVIDER_INDEX_GLOBAL_CONCURRENCY) - self._progress_lock = threading.Lock() + self._stop_event = Event() + self._thread: Thread | None = None + self._active = Semaphore(PROVIDER_INDEX_GLOBAL_CONCURRENCY) + self._progress_lock = Lock() self._progress: dict[str, ProviderCatalogProgress] = {} - self._workers_lock = threading.Lock() - self._workers: dict[str, threading.Thread] = {} + self._workers_lock = Lock() + self._workers: dict[str, Thread] = {} def start(self) -> None: self._ensure_status_rows() @@ -139,7 +170,7 @@ def start(self) -> None: PROVIDER_INDEX_GLOBAL_CONCURRENCY, ", 
".join(CATALOG_SITES_LIST), ) - self._thread = threading.Thread( + self._thread = Thread( target=self._run_loop, name="provider-catalog-indexer", daemon=True, @@ -215,7 +246,7 @@ def refresh_provider(self, provider: str) -> None: ) return logger.info("Provider catalog scheduler: starting refresh for {}", provider) - worker = threading.Thread( + worker = Thread( target=self._run_provider_refresh, name=f"provider-index-{provider}", args=(provider,), @@ -239,10 +270,14 @@ def get_progress_snapshot(self) -> dict[str, object]: provider: ProviderCatalogProgress( provider=snapshot.provider, phase=snapshot.phase, - processed_titles=snapshot.processed_titles, + crawled_titles=snapshot.crawled_titles, + persisted_titles=snapshot.persisted_titles, + failed_titles=snapshot.failed_titles, total_titles=snapshot.total_titles, current_slug=snapshot.current_slug, - last_logged_step=snapshot.last_logged_step, + queue_depth=snapshot.queue_depth, + last_logged_crawl_step=snapshot.last_logged_crawl_step, + last_logged_persist_step=snapshot.last_logged_persist_step, ) for provider, snapshot in self._progress.items() } @@ -253,6 +288,12 @@ def get_progress_snapshot(self) -> dict[str, object]: phase = progress.phase if phase == "pending" and status is not None: phase = status.status + latest_success_generation = ( + status.latest_success_generation if status is not None else None + ) + current_generation = ( + status.current_generation if status is not None else None + ) providers.append( { "provider": provider, @@ -264,9 +305,28 @@ def get_progress_snapshot(self) -> dict[str, object]: ), "phase": phase, "processed_titles": progress.processed_titles, + "crawled_titles": progress.crawled_titles, + "persisted_titles": progress.persisted_titles, + "failed_titles": progress.failed_titles, "total_titles": progress.total_titles, "progress_percent": progress.progress_percent, + "crawl_progress_percent": progress.crawl_percent, + "queue_depth": progress.queue_depth, + "writer_lag_titles": 
progress.writer_lag_titles, "current_slug": progress.current_slug or None, + "serving_previous_generation": bool( + status is not None + and status.status == "running" + and latest_success_generation + and current_generation + and current_generation != latest_success_generation + ), + "latest_success_generation": latest_success_generation, + "staging_generation": ( + current_generation + if current_generation != latest_success_generation + else None + ), "last_error_summary": ( status.last_error_summary if status is not None else "" ), @@ -307,7 +367,6 @@ def _run_provider_refresh(self, provider: str) -> None: def _ensure_status_rows(self) -> None: with Session(engine) as session: - now = None for provider in CATALOG_SITES_LIST: self._set_progress(provider, phase="pending") hours = float( @@ -327,39 +386,45 @@ def _ensure_status_rows(self) -> None: refresh_interval_hours=hours, status="pending", bootstrap_completed=False, - next_refresh_after=now, + next_refresh_after=None, ) continue - if status.status == "running": + stale_generation = self._stale_generation(status) + if stale_generation is not None: logger.warning( - "Provider catalog bootstrap: recovered interrupted run for {} started_at={} cursor_slug={}. Marking it pending for retry.", + "Provider catalog bootstrap: found interrupted staging generation for {} generation={} status={} cursor_slug={}. 
Cleaning it up before retry.", provider, - status.latest_started_at.isoformat() - if status.latest_started_at is not None - else None, + stale_generation, + status.status, status.cursor_title_slug or None, ) + delete_provider_generation( + session, + provider=provider, + generation=stale_generation, + ) upsert_provider_index_status( session, provider=provider, refresh_interval_hours=hours, status="pending", + current_generation=None, latest_completed_at=utcnow(), - next_refresh_after=now, + next_refresh_after=None, failure_count=status.failure_count + 1, - last_error_summary="Interrupted by process restart before completion.", - ) - else: - logger.debug( - "Provider catalog bootstrap: loaded persisted state for {} status={} bootstrap_completed={} latest_success_generation={} next_refresh_after={}", - provider, - status.status, - status.bootstrap_completed, - status.latest_success_generation, - status.next_refresh_after.isoformat() - if status.next_refresh_after is not None - else None, + last_error_summary="Interrupted staging generation cleaned up after restart.", ) + continue + logger.debug( + "Provider catalog bootstrap: loaded persisted state for {} status={} bootstrap_completed={} latest_success_generation={} next_refresh_after={}", + provider, + status.status, + status.bootstrap_completed, + status.latest_success_generation, + status.next_refresh_after.isoformat() + if status.next_refresh_after is not None + else None, + ) def _is_due(self, status) -> bool: if status.status == "running": @@ -378,19 +443,34 @@ def _refresh_provider(self, provider: str) -> None: ) generation = uuid4().hex reporter: ProgressReporter | None = None + queue: Queue[TitleRecord | object] = Queue(maxsize=PROVIDER_INDEX_QUEUE_SIZE) + writer_failure: list[BaseException] = [] + state_lock = Lock() + failed_titles = 0 + completed_titles = 0 self._set_progress( provider, phase="discovering_titles", - processed_titles=0, + crawled_titles=0, + persisted_titles=0, + failed_titles=0, 
total_titles=None, current_slug="", - reset_log_step=True, + queue_depth=0, + reset_log_steps=True, ) logger.info("Provider catalog {}: discovering titles", provider) + with Session(engine) as session: current = get_provider_index_status(session, provider=provider) - failure_count = 0 if current is None else current.failure_count - _ = failure_count + if current is not None: + stale_generation = self._stale_generation(current) + if stale_generation is not None: + delete_provider_generation( + session, + provider=provider, + generation=stale_generation, + ) upsert_provider_index_status( session, provider=provider, @@ -398,174 +478,183 @@ def _refresh_provider(self, provider: str) -> None: status="running", current_generation=generation, latest_started_at=utcnow(), + latest_completed_at=None, + next_refresh_after=None, + last_error_summary="", + cursor_title_slug="", ) - try: - titles = self._crawl_provider_catalog_with_heartbeat(provider) - logger.info( - "Provider catalog {}: discovered {} titles", + reporter = ProgressReporter( + label=f"Catalog {provider}", + unit="title", + unit_scale=False, + ) + reporter.update( + ProgressSnapshot( + downloaded=0, + total=None, + status="discovering_titles", + ) + ) + + writer = Thread( + target=self._writer_loop, + name=f"provider-index-writer-{provider}", + args=( + provider, + generation, + refresh_interval_hours, + queue, + reporter, + writer_failure, + ), + daemon=True, + ) + writer.start() + + def emit_title(record: TitleRecord) -> None: + last_backpressure_log = 0.0 + while True: + if writer_failure: + raise RuntimeError( + f"writer failed for {provider}: {writer_failure[0]}" + ) + try: + queue.put(record, timeout=1.0) + except Full: + depth = queue.qsize() + self._set_progress(provider, queue_depth=depth) + now = monotonic() + if ( + now - last_backpressure_log + >= PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS + ): + logger.warning( + "Provider catalog {}: writer backpressure queue_depth={} lag_titles={}", + provider, + 
depth, + self._get_writer_lag(provider), + ) + last_backpressure_log = now + continue + self._set_progress(provider, queue_depth=queue.qsize()) + return + + def on_index_loaded(total_titles: int) -> None: + self._set_progress( provider, - len(titles), + phase="crawling_titles", + total_titles=total_titles, + current_slug="", + queue_depth=queue.qsize(), + reset_log_steps=True, ) - reporter = ProgressReporter( - label=f"Catalog {provider}", - unit="title", - unit_scale=False, + logger.info( + "Provider catalog {}: loaded title index with {} titles", + provider, + total_titles, ) reporter.update( ProgressSnapshot( downloaded=0, - total=len(titles), - status="persisting_titles", + total=total_titles, + status="crawling_titles", ) ) - self._set_progress( + + def on_title_started(slug: str) -> None: + self._set_progress(provider, phase="crawling_titles", current_slug=slug) + + def on_title_crawled(slug: str) -> None: + nonlocal completed_titles + with state_lock: + completed_titles += 1 + self._advance_crawl_progress( provider, - phase="persisting_titles", - total_titles=len(titles), - processed_titles=0, - current_slug="", - reset_log_step=True, + current_slug=slug, + queue_depth=queue.qsize(), ) - for title_record in titles: - with Session(engine) as session: - upsert_provider_index_status( - session, - provider=provider, - refresh_interval_hours=refresh_interval_hours, - cursor_title_slug=title_record.slug, - ) - upsert_provider_title_index_state( - session, - provider=provider, - slug=title_record.slug, - attempted_at=utcnow(), - ) - replace_provider_catalog_title( - session, - provider=provider, - slug=title_record.slug, - title=title_record.title, - media_type_hint=title_record.media_type_hint, - relative_path=title_record.relative_path, - indexed_generation=generation, - ) - replace_provider_catalog_aliases( - session, - provider=provider, - slug=title_record.slug, - aliases=title_record.aliases, - indexed_generation=generation, - ) - 
replace_provider_catalog_episodes( - session, - provider=provider, - slug=title_record.slug, - episodes=[ - { - "season": episode.season, - "episode": episode.episode, - "relative_path": episode.relative_path, - "title_primary": episode.title_primary, - "title_secondary": episode.title_secondary, - "media_type_hint": episode.media_type_hint, - "languages": [ - { - "language": lang.language, - "host_hints": lang.host_hints, - } - for lang in episode.languages - ], - } - for episode in title_record.episodes - ], - indexed_generation=generation, - ) - if title_record.canonical.series is not None: - series = title_record.canonical.series - upsert_canonical_series( - session, - tvdb_id=int(series["tvdb_id"]), - title=str(series["title"]), - tmdb_id=series.get("tmdb_id"), - imdb_id=series.get("imdb_id"), - tvmaze_id=series.get("tvmaze_id"), - anilist_id=series.get("anilist_id"), - mal_id=series.get("mal_id"), - aliases=list(series.get("aliases") or []), - ) - replace_canonical_episodes( - session, - tvdb_id=int(series["tvdb_id"]), - episodes=title_record.canonical.episodes, - ) - replace_provider_series_mappings( - session, - provider=provider, - slug=title_record.slug, - mappings=title_record.canonical.series_mappings, - ) - replace_provider_episode_mappings( - session, - provider=provider, - slug=title_record.slug, - mappings=title_record.canonical.episode_mappings, - ) - replace_provider_movie_mappings( - session, - provider=provider, - slug=title_record.slug, - mappings=title_record.canonical.movie_mappings, - ) - upsert_provider_title_index_state( - session, - provider=provider, - slug=title_record.slug, - succeeded_at=utcnow(), - failure_count=0, - last_error_summary="", - ) - session.commit() - self._advance_progress(provider, current_slug=title_record.slug) - processed_titles = self._get_processed_titles(provider) - reporter.update( - ProgressSnapshot( - downloaded=min(len(titles), processed_titles), - total=len(titles), - status="persisting_titles", + + def 
on_title_failed(slug: str, reason: str) -> None: + nonlocal completed_titles, failed_titles + with state_lock: + completed_titles += 1 + failed_titles += 1 + failure_count = failed_titles + self._record_title_failure(provider, slug, reason) + self._advance_failed_progress( + provider, + current_slug=slug, + queue_depth=queue.qsize(), + ) + total_titles = self._get_total_titles(provider) + if total_titles and total_titles > 0: + failure_rate = failure_count / total_titles * 100.0 + if failure_rate > PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT: + raise RuntimeError( + "provider refresh failure threshold exceeded: " + f"{failure_count}/{total_titles} titles failed " + f"({failure_rate:.1f}% > {PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT:.1f}%)" ) - ) + + observer = CatalogCrawlObserver( + on_index_loaded=on_index_loaded, + on_title_started=on_title_started, + on_title_crawled=on_title_crawled, + on_title_failed=on_title_failed, + ) + + try: + summary = stream_provider_catalog( + provider, + emit_title=emit_title, + observer=observer, + ) + if writer_failure: + raise RuntimeError(f"writer failed for {provider}: {writer_failure[0]}") + if summary.discovered_titles == 0: + logger.warning("Provider catalog {}: discovered zero titles", provider) + except Exception as exc: + logger.exception( + "Provider catalog refresh failed for {}: {}", provider, exc + ) + queue.put(_QUEUE_SENTINEL) + writer.join(timeout=30) + if reporter is not None: + reporter.close() completed_at = utcnow() with Session(engine) as session: - prune_provider_generation( + delete_provider_generation( session, provider=provider, - keep_generation=generation, + generation=generation, ) + current = get_provider_index_status(session, provider=provider) + failure_count = 1 if current is None else current.failure_count + 1 upsert_provider_index_status( session, provider=provider, refresh_interval_hours=refresh_interval_hours, - status="ready", - current_generation=generation, - latest_success_generation=generation, 
+ status="failed", + current_generation=None, latest_completed_at=completed_at, - latest_success_at=completed_at, next_refresh_after=completed_at + timedelta(hours=refresh_interval_hours), - bootstrap_completed=True, - failure_count=0, - last_error_summary="", - cursor_title_slug="", + failure_count=failure_count, + last_error_summary=str(exc)[:500], ) self._set_progress( provider, - phase="ready", + phase="failed", + queue_depth=0, current_slug="", ) - if reporter is not None: - reporter.close() - except Exception as exc: + return + + queue.put(_QUEUE_SENTINEL) + writer.join() + if writer_failure: + exc = RuntimeError(f"writer failed for {provider}: {writer_failure[0]}") logger.exception( "Provider catalog refresh failed for {}: {}", provider, exc ) @@ -585,6 +674,7 @@ def _refresh_provider(self, provider: str) -> None: provider=provider, refresh_interval_hours=refresh_interval_hours, status="failed", + current_generation=None, latest_completed_at=completed_at, next_refresh_after=completed_at + timedelta(hours=refresh_interval_hours), @@ -594,73 +684,309 @@ def _refresh_provider(self, provider: str) -> None: self._set_progress( provider, phase="failed", + queue_depth=0, current_slug="", ) + return - def _crawl_provider_catalog_with_heartbeat(self, provider: str) -> list[object]: - elapsed_seconds = 0.0 - observer = CatalogCrawlObserver( - on_index_loaded=lambda total: self._on_title_index_loaded(provider, total), - on_title_started=lambda slug: self._on_title_started(provider, slug), - on_title_crawled=lambda slug: self._on_title_crawled(provider, slug), - ) - with ThreadPoolExecutor(max_workers=1) as executor: - future = executor.submit( - crawl_provider_catalog, - provider, - observer=observer, + completed_at = utcnow() + with Session(engine) as session: + prune_provider_generation( + session, + provider=provider, + keep_generation=generation, ) + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + 
status="ready", + current_generation=generation, + latest_success_generation=generation, + latest_completed_at=completed_at, + latest_success_at=completed_at, + next_refresh_after=completed_at + + timedelta(hours=refresh_interval_hours), + bootstrap_completed=True, + failure_count=0, + last_error_summary="", + cursor_title_slug="", + ) + self._set_progress( + provider, + phase="ready", + queue_depth=0, + current_slug="", + ) + logger.info( + "Provider catalog {}: promoted staging generation {} persisted={}/{} failed={}", + provider, + generation, + self._get_persisted_titles(provider), + self._get_total_titles(provider) or 0, + self._get_failed_titles(provider), + ) + if reporter is not None: + reporter.close() + + def _writer_loop( + self, + provider: str, + generation: str, + refresh_interval_hours: float, + queue: Queue[TitleRecord | object], + reporter: ProgressReporter | None, + writer_failure: list[BaseException], + ) -> None: + batch: list[TitleRecord] = [] + last_flush_at = monotonic() + try: while True: + timeout = max( + 0.1, + PROVIDER_INDEX_WRITER_FLUSH_SECONDS - (monotonic() - last_flush_at), + ) try: - return future.result(timeout=_DISCOVERY_HEARTBEAT_SECONDS) - except FutureTimeoutError: - elapsed_seconds += _DISCOVERY_HEARTBEAT_SECONDS - processed = self._get_processed_titles(provider) - total = self._get_total_titles(provider) - current_slug = self._get_current_slug(provider) - if total is not None: - percent = 100.0 if total <= 0 else round(processed / total * 100.0, 1) - logger.info( - "Provider catalog {}: crawling title details {}/{} ({}%) after {}s current={}", + item = queue.get(timeout=timeout) + except Empty: + item = None + if item is None: + if batch: + self._flush_writer_batch( provider, - processed, - total, - percent, - int(elapsed_seconds), - current_slug or "-", + generation, + refresh_interval_hours, + batch, + queue_depth=queue.qsize(), + reporter=reporter, ) - else: - logger.info( - "Provider catalog {}: still discovering titles 
after {}s current={}", + batch = [] + last_flush_at = monotonic() + continue + if item is _QUEUE_SENTINEL: + if batch: + self._flush_writer_batch( provider, - int(elapsed_seconds), - current_slug or "-", + generation, + refresh_interval_hours, + batch, + queue_depth=queue.qsize(), + reporter=reporter, ) + return + batch.append(item) + if len(batch) >= PROVIDER_INDEX_WRITER_BATCH_SIZE: + self._flush_writer_batch( + provider, + generation, + refresh_interval_hours, + batch, + queue_depth=queue.qsize(), + reporter=reporter, + ) + batch = [] + last_flush_at = monotonic() + except BaseException as exc: + writer_failure.append(exc) + logger.exception("Provider catalog writer failed for {}: {}", provider, exc) - def _on_title_index_loaded(self, provider: str, total_titles: int) -> None: - self._set_progress( + def _flush_writer_batch( + self, + provider: str, + generation: str, + refresh_interval_hours: float, + batch: list[TitleRecord], + *, + queue_depth: int, + reporter: ProgressReporter | None, + ) -> None: + if not batch: + return + last_slug = batch[-1].slug + with Session(engine) as session: + for record in batch: + now = utcnow() + upsert_provider_title_index_state( + session, + provider=provider, + slug=record.slug, + attempted_at=now, + commit=False, + ) + self._persist_title_record( + session, + record=record, + indexed_generation=generation, + ) + upsert_provider_title_index_state( + session, + provider=provider, + slug=record.slug, + succeeded_at=now, + failure_count=0, + last_error_summary="", + commit=False, + ) + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + status="running", + current_generation=generation, + cursor_title_slug=last_slug, + last_error_summary="", + commit=False, + ) + session.commit() + self._advance_persist_progress( provider, - phase="crawling_titles", - total_titles=total_titles, - processed_titles=0, - current_slug="", - reset_log_step=True, + current_slug=last_slug, + 
count=len(batch), + queue_depth=queue_depth, ) + persisted = self._get_persisted_titles(provider) + total_titles = self._get_total_titles(provider) + if reporter is not None: + reporter.update( + ProgressSnapshot( + downloaded=persisted, + total=total_titles, + status="persisting_titles", + ) + ) logger.info( - "Provider catalog {}: loaded title index with {} titles", + "Provider catalog {}: persisted batch size={} persisted={}/{} queue_depth={} writer_lag={}", provider, - total_titles, + len(batch), + persisted, + total_titles or 0, + queue_depth, + self._get_writer_lag(provider), ) - def _on_title_started(self, provider: str, slug: str) -> None: - self._set_progress( - provider, - phase="crawling_titles", - current_slug=slug, + def _persist_title_record( + self, + session: Session, + *, + record: TitleRecord, + indexed_generation: str, + ) -> None: + replace_provider_catalog_title( + session, + provider=record.provider, + slug=record.slug, + title=record.title, + media_type_hint=record.media_type_hint, + relative_path=record.relative_path, + indexed_generation=indexed_generation, + ) + replace_provider_catalog_aliases( + session, + provider=record.provider, + slug=record.slug, + aliases=record.aliases, + indexed_generation=indexed_generation, + ) + replace_provider_catalog_episodes( + session, + provider=record.provider, + slug=record.slug, + episodes=[ + { + "season": episode.season, + "episode": episode.episode, + "relative_path": episode.relative_path, + "title_primary": episode.title_primary, + "title_secondary": episode.title_secondary, + "media_type_hint": episode.media_type_hint, + "languages": [ + { + "language": language.language, + "host_hints": language.host_hints, + } + for language in episode.languages + ], + } + for episode in record.episodes + ], + indexed_generation=indexed_generation, + ) + if record.canonical.series is not None: + series = record.canonical.series + upsert_canonical_series( + session, + tvdb_id=int(series["tvdb_id"]), + 
title=str(series["title"]), + tmdb_id=series.get("tmdb_id"), + imdb_id=series.get("imdb_id"), + tvmaze_id=series.get("tvmaze_id"), + anilist_id=series.get("anilist_id"), + mal_id=series.get("mal_id"), + aliases=list(series.get("aliases") or []), + ) + replace_canonical_episodes( + session, + tvdb_id=int(series["tvdb_id"]), + episodes=record.canonical.episodes, + ) + replace_provider_series_mappings( + session, + provider=record.provider, + slug=record.slug, + mappings=record.canonical.series_mappings, + indexed_generation=indexed_generation, + ) + replace_provider_episode_mappings( + session, + provider=record.provider, + slug=record.slug, + mappings=record.canonical.episode_mappings, + indexed_generation=indexed_generation, + ) + replace_provider_movie_mappings( + session, + provider=record.provider, + slug=record.slug, + mappings=record.canonical.movie_mappings, + indexed_generation=indexed_generation, ) - def _on_title_crawled(self, provider: str, slug: str) -> None: - self._advance_progress(provider, current_slug=slug) + def _record_title_failure(self, provider: str, slug: str, reason: str) -> None: + with Session(engine) as session: + current = get_provider_index_status(session, provider=provider) + refresh_interval_hours = float( + CATALOG_SITE_CONFIGS.get(provider, {}).get( + "provider_index_refresh_hours", 24.0 + ) + ) + state = upsert_provider_title_index_state( + session, + provider=provider, + slug=slug, + attempted_at=utcnow(), + commit=False, + ) + upsert_provider_title_index_state( + session, + provider=provider, + slug=slug, + failure_count=state.failure_count + 1, + last_error_summary=reason[:500], + commit=False, + ) + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + status="running", + current_generation=current.current_generation + if current is not None + else None, + cursor_title_slug=slug, + last_error_summary=reason[:500], + commit=False, + ) + session.commit() def 
_is_bootstrap_ready(self) -> bool: with Session(engine) as session: @@ -703,15 +1029,29 @@ def _log_bootstrap_state(self) -> None: status.last_error_summary or None, ) + def _stale_generation(self, status) -> str | None: + current_generation = getattr(status, "current_generation", None) + latest_success_generation = getattr(status, "latest_success_generation", None) + if not current_generation: + return None + if status.status == "running": + return current_generation + if current_generation != latest_success_generation: + return current_generation + return None + def _set_progress( self, provider: str, *, phase: str | None = None, - processed_titles: int | None = None, + crawled_titles: int | None = None, + persisted_titles: int | None = None, + failed_titles: int | None = None, total_titles: int | None | object = _UNSET, current_slug: str | None = None, - reset_log_step: bool = False, + queue_depth: int | None = None, + reset_log_steps: bool = False, ) -> None: with self._progress_lock: snapshot = self._progress.get(provider) @@ -720,59 +1060,140 @@ def _set_progress( self._progress[provider] = snapshot if phase is not None: snapshot.phase = phase - if processed_titles is not None: - snapshot.processed_titles = processed_titles + if crawled_titles is not None: + snapshot.crawled_titles = crawled_titles + if persisted_titles is not None: + snapshot.persisted_titles = persisted_titles + if failed_titles is not None: + snapshot.failed_titles = failed_titles if total_titles is not _UNSET: snapshot.total_titles = total_titles if current_slug is not None: snapshot.current_slug = current_slug - if reset_log_step: - snapshot.last_logged_step = -1 + if queue_depth is not None: + snapshot.queue_depth = queue_depth + if reset_log_steps: + snapshot.last_logged_crawl_step = -1 + snapshot.last_logged_persist_step = -1 - def _advance_progress(self, provider: str, *, current_slug: str) -> None: + def _advance_crawl_progress( + self, + provider: str, + *, + current_slug: str, + 
queue_depth: int, + ) -> None: with self._progress_lock: - snapshot = self._progress.get(provider) - if snapshot is None: - snapshot = ProviderCatalogProgress(provider=provider) - self._progress[provider] = snapshot - snapshot.processed_titles += 1 + snapshot = self._progress.setdefault( + provider, ProviderCatalogProgress(provider=provider) + ) + snapshot.crawled_titles += 1 snapshot.current_slug = current_slug - total = snapshot.total_titles - percent = snapshot.progress_percent - if total is None or percent is None: - return - step = max(1, int(PROGRESS_STEP_PERCENT)) - current_step = int(percent) // step - if percent < 100.0 and current_step <= snapshot.last_logged_step: + snapshot.queue_depth = queue_depth + self._maybe_log_progress(snapshot, kind="crawl") + + def _advance_failed_progress( + self, + provider: str, + *, + current_slug: str, + queue_depth: int, + ) -> None: + with self._progress_lock: + snapshot = self._progress.setdefault( + provider, ProviderCatalogProgress(provider=provider) + ) + snapshot.failed_titles += 1 + snapshot.current_slug = current_slug + snapshot.queue_depth = queue_depth + self._maybe_log_progress(snapshot, kind="crawl") + + def _advance_persist_progress( + self, + provider: str, + *, + current_slug: str, + count: int, + queue_depth: int, + ) -> None: + with self._progress_lock: + snapshot = self._progress.setdefault( + provider, ProviderCatalogProgress(provider=provider) + ) + snapshot.phase = "persisting_titles" + snapshot.persisted_titles += count + snapshot.current_slug = current_slug + snapshot.queue_depth = queue_depth + self._maybe_log_progress(snapshot, kind="persist") + + def _maybe_log_progress( + self, + snapshot: ProviderCatalogProgress, + *, + kind: str, + ) -> None: + if snapshot.total_titles is None: + return + step = max(1, int(PROGRESS_STEP_PERCENT)) + if kind == "crawl": + percent = snapshot.crawl_percent + current_step = int(percent or 0.0) // step + if ( + percent is not None + and percent < 100.0 + and 
current_step <= snapshot.last_logged_crawl_step + ): return - snapshot.last_logged_step = current_step + snapshot.last_logged_crawl_step = current_step logger.info( - "Provider catalog {} progress [{}]: {}/{} ({}%) current={}", - provider, - snapshot.phase, - snapshot.processed_titles, - total, + "Provider catalog {} progress [crawl]: crawled={} failed={} persisted={} total={} crawl_percent={} queue_depth={} lag={} current={}", + snapshot.provider, + snapshot.crawled_titles, + snapshot.failed_titles, + snapshot.persisted_titles, + snapshot.total_titles, percent, - current_slug, + snapshot.queue_depth, + snapshot.writer_lag_titles, + snapshot.current_slug, ) + return + percent = snapshot.progress_percent + current_step = int(percent or 0.0) // step + if ( + percent is not None + and percent < 100.0 + and current_step <= snapshot.last_logged_persist_step + ): + return + snapshot.last_logged_persist_step = current_step + logger.info( + "Provider catalog {} progress [persist]: persisted={} total={} percent={} queue_depth={} lag={} current={}", + snapshot.provider, + snapshot.persisted_titles, + snapshot.total_titles, + percent, + snapshot.queue_depth, + snapshot.writer_lag_titles, + snapshot.current_slug, + ) - def _get_processed_titles(self, provider: str) -> int: + def _get_total_titles(self, provider: str) -> int | None: with self._progress_lock: snapshot = self._progress.get(provider) - if snapshot is None: - return 0 - return snapshot.processed_titles + return None if snapshot is None else snapshot.total_titles - def _get_total_titles(self, provider: str) -> int | None: + def _get_persisted_titles(self, provider: str) -> int: with self._progress_lock: snapshot = self._progress.get(provider) - if snapshot is None: - return None - return snapshot.total_titles + return 0 if snapshot is None else snapshot.persisted_titles - def _get_current_slug(self, provider: str) -> str: + def _get_failed_titles(self, provider: str) -> int: with self._progress_lock: snapshot = 
self._progress.get(provider) - if snapshot is None: - return "" - return snapshot.current_slug + return 0 if snapshot is None else snapshot.failed_titles + + def _get_writer_lag(self, provider: str) -> int: + with self._progress_lock: + snapshot = self._progress.get(provider) + return 0 if snapshot is None else snapshot.writer_lag_titles diff --git a/apps/api/app/catalog/providers.py b/apps/api/app/catalog/providers.py index 9788861a..1fc73006 100644 --- a/apps/api/app/catalog/providers.py +++ b/apps/api/app/catalog/providers.py @@ -1,9 +1,9 @@ from __future__ import annotations -from concurrent.futures import ThreadPoolExecutor, as_completed +from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait from dataclasses import dataclass, field from difflib import SequenceMatcher -import threading +from time import monotonic import re from typing import Any, Callable, Optional from urllib.parse import urlparse @@ -21,6 +21,8 @@ from app.utils.domain_resolver import get_megakino_base_url from app.utils.http_client import get as http_get +_CATALOG_STREAM_HEARTBEAT_SECONDS = 15.0 + @dataclass(slots=True) class EpisodeLanguageRecord: @@ -65,10 +67,23 @@ class CatalogCrawlObserver: on_index_loaded: Callable[[int], None] | None = None on_title_started: Callable[[str], None] | None = None on_title_crawled: Callable[[str], None] | None = None + on_title_failed: Callable[[str, str], None] | None = None -class ProviderTitleCrawlTimeoutError(TimeoutError): - pass +@dataclass(slots=True) +class CatalogStreamSummary: + discovered_titles: int = 0 + crawled_titles: int = 0 + failed_titles: int = 0 + + +@dataclass(slots=True) +class _TitleJob: + slug: str + title: str + aliases: list[str] + started_at: float + timed_out: bool = False def _relative_path(url: str) -> str: @@ -79,56 +94,6 @@ def _relative_path(url: str) -> str: return path -def _run_with_timeout( - timeout_seconds: float, - fn: Callable[..., TitleRecord], - /, - *args: Any, - **kwargs: Any, -) -> 
TitleRecord: - result: dict[str, TitleRecord] = {} - err: dict[str, BaseException] = {} - - def _target() -> None: - try: - result["value"] = fn(*args, **kwargs) - except BaseException as exc: - err["exc"] = exc - - thread = threading.Thread(target=_target, name="provider-title-crawl", daemon=True) - thread.start() - thread.join(float(timeout_seconds)) - if thread.is_alive(): - raise ProviderTitleCrawlTimeoutError( - f"title crawl exceeded {int(timeout_seconds)}s" - ) - if "exc" in err: - raise err["exc"] - return result["value"] - - -def _fallback_title_record( - *, - provider_key: str, - slug: str, - title: str, - aliases: list[str], -) -> TitleRecord: - relative_root = ( - f"/anime/stream/{slug}" if provider_key == "aniworld.to" else f"/serie/{slug}" - ) - return TitleRecord( - provider=provider_key, - slug=slug, - title=title, - aliases=aliases, - media_type_hint="series", - relative_path=relative_root, - episodes=[], - canonical=CanonicalPayload(), - ) - - def _normalize_provider_data(raw: Any, *, site: str) -> list[EpisodeLanguageRecord]: if not isinstance(raw, dict): return [] @@ -182,7 +147,9 @@ def _dedupe_languages( ] -def _aniworld_languages_from_flags(host_hints: list[str], row: BeautifulSoup) -> list[EpisodeLanguageRecord]: +def _aniworld_languages_from_flags( + host_hints: list[str], row: BeautifulSoup +) -> list[EpisodeLanguageRecord]: languages: list[EpisodeLanguageRecord] = [] for image in row.select("td.editFunctions img.flag"): src = str(image.get("src") or "").lower() @@ -203,7 +170,11 @@ def _aniworld_languages_from_flags(host_hints: list[str], row: BeautifulSoup) -> host_hints=host_hints, ) ) - elif "german.svg" in src or "deutsche sprache" in text or "deutsch/german" in text: + elif ( + "german.svg" in src + or "deutsche sprache" in text + or "deutsch/german" in text + ): languages.append( EpisodeLanguageRecord( language="German Dub", @@ -255,8 +226,12 @@ def _parse_aniworld_season_rows(season) -> list[EpisodeRecord]: if title_cell is not 
None: strong = title_cell.select_one("strong") span = title_cell.select_one("span") - title_primary = strong.get_text(" ", strip=True) if strong is not None else None - title_secondary = span.get_text(" ", strip=True) if span is not None else None + title_primary = ( + strong.get_text(" ", strip=True) if strong is not None else None + ) + title_secondary = ( + span.get_text(" ", strip=True) if span is not None else None + ) host_hints = _host_hints_from_row(row) languages = _aniworld_languages_from_flags(host_hints, row) episodes.append( @@ -266,7 +241,9 @@ def _parse_aniworld_season_rows(season) -> list[EpisodeRecord]: relative_path=relative_path, title_primary=title_primary, title_secondary=title_secondary, - media_type_hint="movie" if getattr(season, "are_movies", False) else "episode", + media_type_hint="movie" + if getattr(season, "are_movies", False) + else "episode", languages=languages, ) ) @@ -279,7 +256,7 @@ def _parse_sto_season_rows(season) -> list[EpisodeRecord]: pattern = re.compile( r'href="(?P(?:https?://(?:serienstream|s)\.to)?/serie/[^"\s]+/staffel-' + str(season_number) - + r'/episode-(?P\d+))/?\"' + + r"/episode-(?P\d+))/?\"" ) episodes: list[EpisodeRecord] = [] seen: set[tuple[int, str]] = set() @@ -516,13 +493,10 @@ def _crawl_title_job( title: str, aliases: list[str], observer: CatalogCrawlObserver | None, - title_timeout_seconds: float, ) -> TitleRecord: if observer is not None and observer.on_title_started is not None: observer.on_title_started(slug) - return _run_with_timeout( - title_timeout_seconds, - _crawl_aniworld_like_title, + return _crawl_aniworld_like_title( provider_key=provider_key, slug=slug, title=title, @@ -548,21 +522,160 @@ def _parse_megakino_page_metadata(url: str) -> tuple[str | None, int | None]: return title, year -def crawl_provider_catalog( +def _emit_title_failure( + *, + provider_key: str, + slug: str, + title: str, + reason: str, + observer: CatalogCrawlObserver | None, +) -> None: + logger.warning( + "Provider 
catalog {}: title crawl failed slug={} title={}: {}", + provider_key, + slug, + title, + reason, + ) + if observer is not None and observer.on_title_failed is not None: + observer.on_title_failed(slug, reason) + + +def _stream_aniworld_like_catalog( + *, + provider_key: str, + index: dict[str, str], + alternatives: dict[str, list[str]], + emit_title: Callable[[TitleRecord], None], + observer: CatalogCrawlObserver | None, + title_timeout_seconds: float, +) -> CatalogStreamSummary: + workers = int( + CATALOG_SITE_CONFIGS[provider_key].get("provider_index_concurrency", 1) + ) + max_workers = max(1, workers) + summary = CatalogStreamSummary(discovered_titles=len(index)) + pending: dict[Future[TitleRecord], _TitleJob] = {} + pending_iter = iter(index.items()) + + def _submit_next(executor: ThreadPoolExecutor) -> bool: + try: + slug, title = next(pending_iter) + except StopIteration: + return False + aliases = list(dict.fromkeys(alternatives.get(slug, []) or [title])) + future = executor.submit( + _crawl_title_job, + provider_key=provider_key, + slug=slug, + title=title, + aliases=aliases, + observer=observer, + ) + pending[future] = _TitleJob( + slug=slug, + title=title, + aliases=aliases, + started_at=monotonic(), + ) + return True + + executor = ThreadPoolExecutor(max_workers=max_workers) + try: + while len(pending) < max_workers and _submit_next(executor): + pass + + while pending: + done, _ = wait( + pending.keys(), + timeout=_CATALOG_STREAM_HEARTBEAT_SECONDS, + return_when=FIRST_COMPLETED, + ) + if not done: + completed = summary.crawled_titles + summary.failed_titles + if summary.discovered_titles > 0: + percent = round(completed / summary.discovered_titles * 100.0, 1) + logger.info( + "Provider catalog {}: crawling title details {}/{} ({}%) active={} queued={}", + provider_key, + completed, + summary.discovered_titles, + percent, + len(pending), + max(0, summary.discovered_titles - completed - len(pending)), + ) + else: + logger.info( + "Provider catalog {}: 
still discovering titles active={}", + provider_key, + len(pending), + ) + for future, job in list(pending.items()): + if job.timed_out or future.done(): + continue + elapsed = monotonic() - job.started_at + if elapsed < title_timeout_seconds: + continue + job.timed_out = True + _emit_title_failure( + provider_key=provider_key, + slug=job.slug, + title=job.title, + reason=( + f"title crawl exceeded {int(title_timeout_seconds)}s; " + "worker left running until underlying I/O returns" + ), + observer=observer, + ) + summary.failed_titles += 1 + continue + + for future in done: + job = pending.pop(future) + try: + record = future.result() + except Exception as exc: + if not job.timed_out: + summary.failed_titles += 1 + _emit_title_failure( + provider_key=provider_key, + slug=job.slug, + title=job.title, + reason=str(exc), + observer=observer, + ) + else: + if not job.timed_out: + emit_title(record) + summary.crawled_titles += 1 + if ( + observer is not None + and observer.on_title_crawled is not None + ): + observer.on_title_crawled(record.slug) + while len(pending) < max_workers and _submit_next(executor): + pass + finally: + executor.shutdown(wait=False, cancel_futures=True) + return summary + + +def stream_provider_catalog( provider_key: str, *, + emit_title: Callable[[TitleRecord], None], observer: CatalogCrawlObserver | None = None, -) -> list[TitleRecord]: +) -> CatalogStreamSummary: provider = get_provider(provider_key) if provider is None: - return [] + return CatalogStreamSummary() if provider_key == "megakino": client = get_default_megakino_client() entries = client.load_index() if observer is not None and observer.on_index_loaded is not None: observer.on_index_loaded(len(entries)) - titles: list[TitleRecord] = [] + summary = CatalogStreamSummary(discovered_titles=len(entries)) for entry in entries.values(): parsed_title = entry.slug.replace("-", " ").title() try: @@ -573,7 +686,7 @@ def crawl_provider_catalog( logger.debug( "Megakino metadata fetch failed 
for {}: {}", entry.url, exc ) - titles.append( + emit_title( TitleRecord( provider=provider_key, slug=entry.slug, @@ -585,13 +698,18 @@ def crawl_provider_catalog( canonical=CanonicalPayload(), ) ) + summary.crawled_titles += 1 if observer is not None and observer.on_title_crawled is not None: observer.on_title_crawled(entry.slug) - return titles + return summary logger.info("Provider catalog {}: loading title index", provider_key) index = provider.load_or_refresh_index() - logger.info("Provider catalog {}: title index loaded ({} titles)", provider_key, len(index)) + logger.info( + "Provider catalog {}: title index loaded ({} titles)", + provider_key, + len(index), + ) logger.info("Provider catalog {}: loading title alternatives", provider_key) alternatives = provider.load_or_refresh_alternatives() logger.info( @@ -601,65 +719,32 @@ def crawl_provider_catalog( ) if observer is not None and observer.on_index_loaded is not None: observer.on_index_loaded(len(index)) - workers = int( - CATALOG_SITE_CONFIGS[provider_key].get("provider_index_concurrency", 1) - ) title_timeout_seconds = float( CATALOG_SITE_CONFIGS[provider_key].get( "provider_index_title_timeout_seconds", PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, ) ) - futures: dict[object, tuple[str, str, list[str]]] = {} - results: list[TitleRecord] = [] - with ThreadPoolExecutor(max_workers=max(1, workers)) as executor: - for slug, title in index.items(): - aliases = list(dict.fromkeys(alternatives.get(slug, []) or [title])) - future = executor.submit( - _crawl_title_job, - provider_key=provider_key, - slug=slug, - title=title, - aliases=aliases, - observer=observer, - title_timeout_seconds=title_timeout_seconds, - ) - futures[future] = (slug, title, aliases) - for future in as_completed(futures): - slug, title, aliases = futures[future] - try: - record = future.result() - except ProviderTitleCrawlTimeoutError as exc: - logger.warning( - "Provider catalog {}: title crawl timed out after {}s slug={} title={}: {}", - 
provider_key, - int(title_timeout_seconds), - slug, - title, - exc, - ) - record = _fallback_title_record( - provider_key=provider_key, - slug=slug, - title=title, - aliases=aliases, - ) - except Exception as exc: - logger.warning( - "Provider catalog {}: title crawl failed slug={} title={}: {}", - provider_key, - slug, - title, - exc, - ) - record = _fallback_title_record( - provider_key=provider_key, - slug=slug, - title=title, - aliases=aliases, - ) - results.append(record) - if observer is not None and observer.on_title_crawled is not None: - observer.on_title_crawled(record.slug) - results.sort(key=lambda item: item.slug) - return results + return _stream_aniworld_like_catalog( + provider_key=provider_key, + index=index, + alternatives=alternatives, + emit_title=emit_title, + observer=observer, + title_timeout_seconds=title_timeout_seconds, + ) + + +def crawl_provider_catalog( + provider_key: str, + *, + observer: CatalogCrawlObserver | None = None, +) -> list[TitleRecord]: + titles: list[TitleRecord] = [] + stream_provider_catalog( + provider_key, + emit_title=titles.append, + observer=observer, + ) + titles.sort(key=lambda item: item.slug) + return titles diff --git a/apps/api/app/config.py b/apps/api/app/config.py index d0c40c6c..708b7c12 100644 --- a/apps/api/app/config.py +++ b/apps/api/app/config.py @@ -66,6 +66,21 @@ def _as_non_negative_int(val: str | None, default: int) -> int: return parsed +def _as_non_negative_float(val: str | None, default: float) -> float: + """Parse *val* as a non-negative float, returning *default* on failure.""" + if val is None: + return default + try: + parsed = float(val.strip()) + except TypeError, ValueError: + logger.warning("Invalid non-negative float value {!r}; using {}", val, default) + return default + if parsed < 0: + logger.warning("Negative value {} is not allowed; using {}", parsed, default) + return default + return parsed + + # Always-on public IP monitor. 
PUBLIC_IP_CHECK_ENABLED = _as_bool(os.getenv("PUBLIC_IP_CHECK_ENABLED", None), False) PUBLIC_IP_CHECK_INTERVAL_MIN = int(os.getenv("PUBLIC_IP_CHECK_INTERVAL_MIN", "30") or 0) @@ -289,6 +304,35 @@ def _ensure_runtime_home() -> Path: 5, _as_non_negative_int(os.getenv("PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS"), 45), ) +PROVIDER_INDEX_QUEUE_SIZE = max( + 1, + _as_non_negative_int(os.getenv("PROVIDER_INDEX_QUEUE_SIZE"), 32), +) +PROVIDER_INDEX_WRITER_BATCH_SIZE = max( + 1, + _as_non_negative_int(os.getenv("PROVIDER_INDEX_WRITER_BATCH_SIZE"), 8), +) +PROVIDER_INDEX_WRITER_FLUSH_SECONDS = max( + 0.1, + _as_non_negative_float(os.getenv("PROVIDER_INDEX_WRITER_FLUSH_SECONDS"), 1.0), +) +PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT = min( + 100.0, + max( + 0.0, + _as_non_negative_float( + os.getenv("PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT"), + 20.0, + ), + ), +) +PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS = max( + 1.0, + _as_non_negative_float( + os.getenv("PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS"), + 15.0, + ), +) logger.debug( f"ANIWORLD_ALPHABET_HTML={ANIWORLD_ALPHABET_HTML}, ANIWORLD_ALPHABET_URL={ANIWORLD_ALPHABET_URL}" @@ -318,6 +362,14 @@ def _ensure_runtime_home() -> Path: "Provider index title timeout: {}s", PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, ) +logger.debug( + "Provider index writer: queue_size={} batch_size={} flush_seconds={} failure_threshold_percent={} backpressure_log_seconds={}", + PROVIDER_INDEX_QUEUE_SIZE, + PROVIDER_INDEX_WRITER_BATCH_SIZE, + PROVIDER_INDEX_WRITER_FLUSH_SECONDS, + PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT, + PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS, +) # TTL (Stunden) für Live-Index; 0 = nie neu laden (nur einmal pro Prozess) ANIWORLD_TITLES_REFRESH_HOURS = float(os.getenv("ANIWORLD_TITLES_REFRESH_HOURS", "24")) diff --git a/apps/api/app/db/migrations/versions/20260429_0005_provider_mapping_generations.py b/apps/api/app/db/migrations/versions/20260429_0005_provider_mapping_generations.py new file mode 100644 index 00000000..7b8f9471 --- /dev/null 
+++ b/apps/api/app/db/migrations/versions/20260429_0005_provider_mapping_generations.py @@ -0,0 +1,364 @@ +"""Make provider mappings generation-aware + +Revision ID: 20260429_0005 +Revises: 20260428_0004 +Create Date: 2026-04-29 00:00:00.000000 +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260429_0005" +down_revision = "20260428_0004" +branch_labels = None +depends_on = None + + +def _rebuild_provider_mapping_table( + *, + table_name: str, + create_sql: str, + copy_sql: str, + index_sql: list[str], +) -> None: + conn = op.get_bind() + inspector = sa.inspect(conn) + if not inspector.has_table(table_name): + return + columns = {column["name"] for column in inspector.get_columns(table_name)} + if "indexed_generation" in columns: + return + + temp_table = f"{table_name}_v2" + op.execute(sa.text(f"DROP TABLE IF EXISTS {temp_table}")) + op.execute(sa.text(create_sql)) + op.execute(sa.text(copy_sql)) + op.drop_table(table_name) + op.rename_table(temp_table, table_name) + for statement in index_sql: + op.execute(sa.text(statement)) + + +def upgrade() -> None: + _rebuild_provider_mapping_table( + table_name="providerseriesmapping", + create_sql=""" + CREATE TABLE providerseriesmapping_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + tvdb_id INTEGER NOT NULL, + indexed_generation VARCHAR NOT NULL, + confidence VARCHAR NOT NULL, + source VARCHAR NOT NULL, + rationale VARCHAR, + last_verified_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, tvdb_id, indexed_generation) + ) + """, + copy_sql=""" + INSERT INTO providerseriesmapping_v2 ( + provider, + slug, + tvdb_id, + indexed_generation, + confidence, + source, + rationale, + last_verified_at + ) + SELECT + mapping.provider, + mapping.slug, + mapping.tvdb_id, + COALESCE(status.latest_success_generation, status.current_generation, 'legacy'), + mapping.confidence, + mapping.source, + mapping.rationale, + mapping.last_verified_at + FROM 
providerseriesmapping AS mapping + LEFT JOIN providerindexstatus AS status + ON status.provider = mapping.provider + """, + index_sql=[ + "CREATE INDEX ix_providerseriesmapping_confidence ON providerseriesmapping (confidence)", + "CREATE INDEX ix_providerseriesmapping_source ON providerseriesmapping (source)", + "CREATE INDEX ix_providerseriesmapping_last_verified_at ON providerseriesmapping (last_verified_at)", + "CREATE INDEX ix_providerseriesmapping_indexed_generation ON providerseriesmapping (indexed_generation)", + ], + ) + _rebuild_provider_mapping_table( + table_name="providerepisodemapping", + create_sql=""" + CREATE TABLE providerepisodemapping_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + provider_season INTEGER NOT NULL, + provider_episode INTEGER NOT NULL, + tvdb_id INTEGER NOT NULL, + canonical_season INTEGER NOT NULL, + canonical_episode INTEGER NOT NULL, + indexed_generation VARCHAR NOT NULL, + confidence VARCHAR NOT NULL, + source VARCHAR NOT NULL, + rationale VARCHAR, + last_verified_at DATETIME NOT NULL, + PRIMARY KEY ( + provider, + slug, + provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode, + indexed_generation + ) + ) + """, + copy_sql=""" + INSERT INTO providerepisodemapping_v2 ( + provider, + slug, + provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode, + indexed_generation, + confidence, + source, + rationale, + last_verified_at + ) + SELECT + mapping.provider, + mapping.slug, + mapping.provider_season, + mapping.provider_episode, + mapping.tvdb_id, + mapping.canonical_season, + mapping.canonical_episode, + COALESCE(status.latest_success_generation, status.current_generation, 'legacy'), + mapping.confidence, + mapping.source, + mapping.rationale, + mapping.last_verified_at + FROM providerepisodemapping AS mapping + LEFT JOIN providerindexstatus AS status + ON status.provider = mapping.provider + """, + index_sql=[ + "CREATE INDEX 
ix_providerepisodemapping_confidence ON providerepisodemapping (confidence)", + "CREATE INDEX ix_providerepisodemapping_source ON providerepisodemapping (source)", + "CREATE INDEX ix_providerepisodemapping_last_verified_at ON providerepisodemapping (last_verified_at)", + "CREATE INDEX ix_providerepisodemapping_indexed_generation ON providerepisodemapping (indexed_generation)", + ], + ) + _rebuild_provider_mapping_table( + table_name="providermoviemapping", + create_sql=""" + CREATE TABLE providermoviemapping_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + tmdb_id INTEGER NOT NULL, + indexed_generation VARCHAR NOT NULL, + confidence VARCHAR NOT NULL, + source VARCHAR NOT NULL, + rationale VARCHAR, + last_verified_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, tmdb_id, indexed_generation) + ) + """, + copy_sql=""" + INSERT INTO providermoviemapping_v2 ( + provider, + slug, + tmdb_id, + indexed_generation, + confidence, + source, + rationale, + last_verified_at + ) + SELECT + mapping.provider, + mapping.slug, + mapping.tmdb_id, + COALESCE(status.latest_success_generation, status.current_generation, 'legacy'), + mapping.confidence, + mapping.source, + mapping.rationale, + mapping.last_verified_at + FROM providermoviemapping AS mapping + LEFT JOIN providerindexstatus AS status + ON status.provider = mapping.provider + """, + index_sql=[ + "CREATE INDEX ix_providermoviemapping_confidence ON providermoviemapping (confidence)", + "CREATE INDEX ix_providermoviemapping_source ON providermoviemapping (source)", + "CREATE INDEX ix_providermoviemapping_last_verified_at ON providermoviemapping (last_verified_at)", + "CREATE INDEX ix_providermoviemapping_indexed_generation ON providermoviemapping (indexed_generation)", + ], + ) + + +def downgrade() -> None: + conn = op.get_bind() + inspector = sa.inspect(conn) + + downgrade_specs = { + "providerseriesmapping": { + "create_sql": """ + CREATE TABLE providerseriesmapping_v2 ( + provider VARCHAR NOT NULL, + slug 
VARCHAR NOT NULL, + tvdb_id INTEGER NOT NULL, + confidence VARCHAR NOT NULL, + source VARCHAR NOT NULL, + rationale VARCHAR, + last_verified_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, tvdb_id) + ) + """, + "copy_sql": """ + INSERT INTO providerseriesmapping_v2 ( + provider, + slug, + tvdb_id, + confidence, + source, + rationale, + last_verified_at + ) + SELECT + provider, + slug, + tvdb_id, + confidence, + source, + rationale, + last_verified_at + FROM providerseriesmapping + """, + "indexes": [ + "CREATE INDEX ix_providerseriesmapping_confidence ON providerseriesmapping (confidence)", + "CREATE INDEX ix_providerseriesmapping_source ON providerseriesmapping (source)", + "CREATE INDEX ix_providerseriesmapping_last_verified_at ON providerseriesmapping (last_verified_at)", + ], + }, + "providerepisodemapping": { + "create_sql": """ + CREATE TABLE providerepisodemapping_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + provider_season INTEGER NOT NULL, + provider_episode INTEGER NOT NULL, + tvdb_id INTEGER NOT NULL, + canonical_season INTEGER NOT NULL, + canonical_episode INTEGER NOT NULL, + confidence VARCHAR NOT NULL, + source VARCHAR NOT NULL, + rationale VARCHAR, + last_verified_at DATETIME NOT NULL, + PRIMARY KEY ( + provider, + slug, + provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode + ) + ) + """, + "copy_sql": """ + INSERT INTO providerepisodemapping_v2 ( + provider, + slug, + provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode, + confidence, + source, + rationale, + last_verified_at + ) + SELECT + provider, + slug, + provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode, + confidence, + source, + rationale, + last_verified_at + FROM providerepisodemapping + """, + "indexes": [ + "CREATE INDEX ix_providerepisodemapping_confidence ON providerepisodemapping (confidence)", + "CREATE INDEX ix_providerepisodemapping_source ON 
providerepisodemapping (source)", + "CREATE INDEX ix_providerepisodemapping_last_verified_at ON providerepisodemapping (last_verified_at)", + ], + }, + "providermoviemapping": { + "create_sql": """ + CREATE TABLE providermoviemapping_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + tmdb_id INTEGER NOT NULL, + confidence VARCHAR NOT NULL, + source VARCHAR NOT NULL, + rationale VARCHAR, + last_verified_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, tmdb_id) + ) + """, + "copy_sql": """ + INSERT INTO providermoviemapping_v2 ( + provider, + slug, + tmdb_id, + confidence, + source, + rationale, + last_verified_at + ) + SELECT + provider, + slug, + tmdb_id, + confidence, + source, + rationale, + last_verified_at + FROM providermoviemapping + """, + "indexes": [ + "CREATE INDEX ix_providermoviemapping_confidence ON providermoviemapping (confidence)", + "CREATE INDEX ix_providermoviemapping_source ON providermoviemapping (source)", + "CREATE INDEX ix_providermoviemapping_last_verified_at ON providermoviemapping (last_verified_at)", + ], + }, + } + + for table_name, spec in downgrade_specs.items(): + if not inspector.has_table(table_name): + continue + columns = {column["name"] for column in inspector.get_columns(table_name)} + if "indexed_generation" not in columns: + continue + temp_table = f"{table_name}_v2" + op.execute(sa.text(f"DROP TABLE IF EXISTS {temp_table}")) + op.execute(sa.text(spec["create_sql"])) + op.execute(sa.text(spec["copy_sql"])) + op.drop_table(table_name) + op.rename_table(temp_table, table_name) + for statement in spec["indexes"]: + op.execute(sa.text(statement)) diff --git a/apps/api/app/db/models.py b/apps/api/app/db/models.py index 3e7b8ff2..1ddc6873 100644 --- a/apps/api/app/db/models.py +++ b/apps/api/app/db/models.py @@ -11,6 +11,7 @@ # Defer logger configuration to application startup from sqlmodel import SQLModel, Field, Session, create_engine, select, Column, JSON +from sqlalchemy import tuple_ from sqlalchemy.orm import 
registry as sa_registry from sqlalchemy.pool import NullPool @@ -288,6 +289,7 @@ class ProviderSeriesMapping(ModelBase, table=True): provider: str = Field(primary_key=True) slug: str = Field(primary_key=True) tvdb_id: int = Field(primary_key=True) + indexed_generation: str = Field(primary_key=True, index=True) confidence: str = Field(default="unresolved", index=True) source: str = Field(default="title_match", index=True) rationale: Optional[str] = None @@ -302,6 +304,7 @@ class ProviderEpisodeMapping(ModelBase, table=True): tvdb_id: int = Field(primary_key=True) canonical_season: int = Field(primary_key=True) canonical_episode: int = Field(primary_key=True) + indexed_generation: str = Field(primary_key=True, index=True) confidence: str = Field(default="unresolved", index=True) source: str = Field(default="numbering", index=True) rationale: Optional[str] = None @@ -312,6 +315,7 @@ class ProviderMovieMapping(ModelBase, table=True): provider: str = Field(primary_key=True) slug: str = Field(primary_key=True) tmdb_id: int = Field(primary_key=True) + indexed_generation: str = Field(primary_key=True, index=True) confidence: str = Field(default="unresolved", index=True) source: str = Field(default="title_year", index=True) rationale: Optional[str] = None @@ -720,22 +724,26 @@ def normalize_catalog_text(value: str) -> str: return normalized.lower().strip() +_UNSET = object() + + def upsert_provider_index_status( session: Session, *, provider: str, refresh_interval_hours: float, status: Optional[str] = None, - current_generation: Optional[str] = None, - latest_success_generation: Optional[str] = None, - latest_started_at: Optional[datetime] = None, - latest_completed_at: Optional[datetime] = None, - latest_success_at: Optional[datetime] = None, - next_refresh_after: Optional[datetime] = None, + current_generation: Optional[str] | object = _UNSET, + latest_success_generation: Optional[str] | object = _UNSET, + latest_started_at: Optional[datetime] | object = _UNSET, + 
latest_completed_at: Optional[datetime] | object = _UNSET, + latest_success_at: Optional[datetime] | object = _UNSET, + next_refresh_after: Optional[datetime] | object = _UNSET, bootstrap_completed: Optional[bool] = None, failure_count: Optional[int] = None, - last_error_summary: Optional[str] = None, - cursor_title_slug: Optional[str] = None, + last_error_summary: Optional[str] | object = _UNSET, + cursor_title_slug: Optional[str] | object = _UNSET, + commit: bool = True, ) -> ProviderIndexStatus: rec = session.get(ProviderIndexStatus, provider) if rec is None: @@ -746,30 +754,33 @@ def upsert_provider_index_status( rec.refresh_interval_hours = refresh_interval_hours if status is not None: rec.status = status - if current_generation is not None: + if current_generation is not _UNSET: rec.current_generation = current_generation - if latest_success_generation is not None: + if latest_success_generation is not _UNSET: rec.latest_success_generation = latest_success_generation - if latest_started_at is not None: + if latest_started_at is not _UNSET: rec.latest_started_at = latest_started_at - if latest_completed_at is not None: + if latest_completed_at is not _UNSET: rec.latest_completed_at = latest_completed_at - if latest_success_at is not None: + if latest_success_at is not _UNSET: rec.latest_success_at = latest_success_at - if next_refresh_after is not None: + if next_refresh_after is not _UNSET: rec.next_refresh_after = next_refresh_after if bootstrap_completed is not None: rec.bootstrap_completed = bootstrap_completed if failure_count is not None: rec.failure_count = failure_count - if last_error_summary is not None: + if last_error_summary is not _UNSET: rec.last_error_summary = last_error_summary - if cursor_title_slug is not None or status == "ready": + if cursor_title_slug is not _UNSET: rec.cursor_title_slug = cursor_title_slug + elif status == "ready": + rec.cursor_title_slug = None rec.updated_at = utcnow() session.add(rec) - session.commit() - 
session.refresh(rec) + if commit: + session.commit() + session.refresh(rec) return rec @@ -794,6 +805,7 @@ def upsert_provider_title_index_state( succeeded_at: Optional[datetime] = None, failure_count: Optional[int] = None, last_error_summary: Optional[str] = None, + commit: bool = True, ) -> ProviderTitleIndexState: rec = session.get(ProviderTitleIndexState, (provider, slug)) if rec is None: @@ -808,8 +820,9 @@ def upsert_provider_title_index_state( rec.last_error_summary = last_error_summary rec.updated_at = utcnow() session.add(rec) - session.commit() - session.refresh(rec) + if commit: + session.commit() + session.refresh(rec) return rec @@ -968,6 +981,24 @@ def prune_provider_generation( & (ProviderCatalogTitle.indexed_generation != keep_generation) ) ) + session.exec( + ProviderSeriesMapping.__table__.delete().where( + (ProviderSeriesMapping.provider == provider) + & (ProviderSeriesMapping.indexed_generation != keep_generation) + ) + ) + session.exec( + ProviderEpisodeMapping.__table__.delete().where( + (ProviderEpisodeMapping.provider == provider) + & (ProviderEpisodeMapping.indexed_generation != keep_generation) + ) + ) + session.exec( + ProviderMovieMapping.__table__.delete().where( + (ProviderMovieMapping.provider == provider) + & (ProviderMovieMapping.indexed_generation != keep_generation) + ) + ) session.commit() @@ -1001,6 +1032,24 @@ def delete_provider_generation( & (ProviderCatalogTitle.indexed_generation == generation) ) ) + session.exec( + ProviderSeriesMapping.__table__.delete().where( + (ProviderSeriesMapping.provider == provider) + & (ProviderSeriesMapping.indexed_generation == generation) + ) + ) + session.exec( + ProviderEpisodeMapping.__table__.delete().where( + (ProviderEpisodeMapping.provider == provider) + & (ProviderEpisodeMapping.indexed_generation == generation) + ) + ) + session.exec( + ProviderMovieMapping.__table__.delete().where( + (ProviderMovieMapping.provider == provider) + & (ProviderMovieMapping.indexed_generation == 
generation) + ) + ) session.commit() @@ -1025,11 +1074,13 @@ def replace_provider_series_mappings( provider: str, slug: str, mappings: List[dict[str, Any]], + indexed_generation: str, ) -> None: session.exec( ProviderSeriesMapping.__table__.delete().where( (ProviderSeriesMapping.provider == provider) & (ProviderSeriesMapping.slug == slug) + & (ProviderSeriesMapping.indexed_generation == indexed_generation) ) ) for mapping in mappings: @@ -1038,6 +1089,7 @@ def replace_provider_series_mappings( provider=provider, slug=slug, tvdb_id=int(mapping["tvdb_id"]), + indexed_generation=indexed_generation, confidence=str(mapping.get("confidence", "unresolved")), source=str(mapping.get("source", "title_match")), rationale=mapping.get("rationale"), @@ -1052,11 +1104,13 @@ def replace_provider_episode_mappings( provider: str, slug: str, mappings: List[dict[str, Any]], + indexed_generation: str, ) -> None: session.exec( ProviderEpisodeMapping.__table__.delete().where( (ProviderEpisodeMapping.provider == provider) & (ProviderEpisodeMapping.slug == slug) + & (ProviderEpisodeMapping.indexed_generation == indexed_generation) ) ) for mapping in mappings: @@ -1069,6 +1123,7 @@ def replace_provider_episode_mappings( tvdb_id=int(mapping["tvdb_id"]), canonical_season=int(mapping["canonical_season"]), canonical_episode=int(mapping["canonical_episode"]), + indexed_generation=indexed_generation, confidence=str(mapping.get("confidence", "unresolved")), source=str(mapping.get("source", "numbering")), rationale=mapping.get("rationale"), @@ -1083,11 +1138,13 @@ def replace_provider_movie_mappings( provider: str, slug: str, mappings: List[dict[str, Any]], + indexed_generation: str, ) -> None: session.exec( ProviderMovieMapping.__table__.delete().where( (ProviderMovieMapping.provider == provider) & (ProviderMovieMapping.slug == slug) + & (ProviderMovieMapping.indexed_generation == indexed_generation) ) ) for mapping in mappings: @@ -1096,6 +1153,7 @@ def replace_provider_movie_mappings( 
provider=provider, slug=slug, tmdb_id=int(mapping["tmdb_id"]), + indexed_generation=indexed_generation, confidence=str(mapping.get("confidence", "unresolved")), source=str(mapping.get("source", "title_year")), rationale=mapping.get("rationale"), @@ -1429,6 +1487,9 @@ def find_provider_episode_mappings_for_canonical_episode( canonical_episode: int, providers: List[str], ) -> List[ProviderEpisodeMapping]: + visible_generations = _visible_generation_map(session, providers=providers) + if not visible_generations: + return [] return list( session.exec( select(ProviderEpisodeMapping).where( @@ -1436,6 +1497,12 @@ def find_provider_episode_mappings_for_canonical_episode( & (ProviderEpisodeMapping.canonical_season == canonical_season) & (ProviderEpisodeMapping.canonical_episode == canonical_episode) & (ProviderEpisodeMapping.provider.in_(providers)) + & ( + tuple_( + ProviderEpisodeMapping.provider, + ProviderEpisodeMapping.indexed_generation, + ).in_(list(visible_generations.items())) + ) & ProviderEpisodeMapping.confidence.in_( ["confirmed", "high_confidence", "low_confidence"] ) @@ -1451,12 +1518,21 @@ def find_provider_episode_mappings_for_canonical_season( canonical_season: int, providers: List[str], ) -> List[ProviderEpisodeMapping]: + visible_generations = _visible_generation_map(session, providers=providers) + if not visible_generations: + return [] return list( session.exec( select(ProviderEpisodeMapping).where( (ProviderEpisodeMapping.tvdb_id == tvdb_id) & (ProviderEpisodeMapping.canonical_season == canonical_season) & (ProviderEpisodeMapping.provider.in_(providers)) + & ( + tuple_( + ProviderEpisodeMapping.provider, + ProviderEpisodeMapping.indexed_generation, + ).in_(list(visible_generations.items())) + ) & ProviderEpisodeMapping.confidence.in_( ["confirmed", "high_confidence", "low_confidence"] ) @@ -1473,12 +1549,19 @@ def find_provider_episode_mapping( provider_season: int, provider_episode: int, ) -> Optional[ProviderEpisodeMapping]: + status = 
session.get(ProviderIndexStatus, provider) + if status is None or not status.latest_success_generation: + return None return session.exec( select(ProviderEpisodeMapping).where( (ProviderEpisodeMapping.provider == provider) & (ProviderEpisodeMapping.slug == slug) & (ProviderEpisodeMapping.provider_season == provider_season) & (ProviderEpisodeMapping.provider_episode == provider_episode) + & ( + ProviderEpisodeMapping.indexed_generation + == status.latest_success_generation + ) & ProviderEpisodeMapping.confidence.in_( ["confirmed", "high_confidence", "low_confidence"] ) diff --git a/apps/api/tests/integration/api/torznab/test_api.py b/apps/api/tests/integration/api/torznab/test_api.py index f7181b1a..d7d388ef 100644 --- a/apps/api/tests/integration/api/torznab/test_api.py +++ b/apps/api/tests/integration/api/torznab/test_api.py @@ -100,6 +100,7 @@ def _seed_ready_tv_catalog( session, provider="aniworld.to", slug=slug, + indexed_generation=generation, mappings=[ { "tvdb_id": tvdb_id, @@ -113,6 +114,7 @@ def _seed_ready_tv_catalog( session, provider="aniworld.to", slug=slug, + indexed_generation=generation, mappings=[ { "provider_season": provider_season, diff --git a/apps/api/tests/integration/api/torznab/test_indexed_catalog.py b/apps/api/tests/integration/api/torznab/test_indexed_catalog.py index 524c49dd..5c3d0bf5 100644 --- a/apps/api/tests/integration/api/torznab/test_indexed_catalog.py +++ b/apps/api/tests/integration/api/torznab/test_indexed_catalog.py @@ -81,6 +81,7 @@ def _seed_ready_catalog() -> None: session, provider="aniworld.to", slug="kaguya-sama", + indexed_generation=generation, mappings=[ { "tvdb_id": 12345, @@ -94,6 +95,7 @@ def _seed_ready_catalog() -> None: session, provider="aniworld.to", slug="kaguya-sama", + indexed_generation=generation, mappings=[ { "provider_season": 1, diff --git a/apps/api/tests/integration/api/torznab/test_specials_mapping.py b/apps/api/tests/integration/api/torznab/test_specials_mapping.py index f6a07546..3100a9e9 100644 
--- a/apps/api/tests/integration/api/torznab/test_specials_mapping.py +++ b/apps/api/tests/integration/api/torznab/test_specials_mapping.py @@ -84,6 +84,7 @@ def _seed_special_mapping_catalog(*, languages: list[str]) -> None: session, provider="aniworld.to", slug="kaguya", + indexed_generation=generation, mappings=[ { "tvdb_id": 12345, @@ -97,6 +98,7 @@ def _seed_special_mapping_catalog(*, languages: list[str]) -> None: session, provider="aniworld.to", slug="kaguya", + indexed_generation=generation, mappings=[ { "provider_season": 0, diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py index bb04bdb0..14129d9c 100644 --- a/apps/api/tests/unit/catalog/test_indexer.py +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -1,4 +1,3 @@ -import time from datetime import datetime, timezone from types import SimpleNamespace @@ -20,56 +19,74 @@ def fake_run_due_once() -> None: assert calls == ["called"] -def test_catalog_discovery_logs_heartbeat(monkeypatch): - import app.catalog.indexer as indexer_module +def test_catalog_progress_tracks_crawl_and_persist_counts(): from app.catalog.indexer import ProviderCatalogIndexer - messages: list[str] = [] - - def fake_info(message: str, *args) -> None: - messages.append(message.format(*args)) - - def fake_crawl(_provider: str, *, observer=None) -> list[object]: - assert observer is not None - time.sleep(0.03) - return [] - - monkeypatch.setattr(indexer_module, "_DISCOVERY_HEARTBEAT_SECONDS", 0.01) - monkeypatch.setattr(indexer_module, "crawl_provider_catalog", fake_crawl) - monkeypatch.setattr(indexer_module.logger, "info", fake_info) - - titles = ProviderCatalogIndexer()._crawl_provider_catalog_with_heartbeat( - "aniworld.to" + indexer = ProviderCatalogIndexer() + indexer._set_progress( + "aniworld.to", + phase="crawling_titles", + total_titles=10, + reset_log_steps=True, + ) + indexer._advance_crawl_progress( + "aniworld.to", + current_slug="slug-1", + queue_depth=3, + ) + 
indexer._advance_failed_progress( + "aniworld.to", + current_slug="slug-2", + queue_depth=2, + ) + indexer._advance_persist_progress( + "aniworld.to", + current_slug="slug-1", + count=1, + queue_depth=1, ) - assert titles == [] - assert any("still discovering titles after" in message for message in messages) + assert indexer._get_total_titles("aniworld.to") == 10 + assert indexer._get_persisted_titles("aniworld.to") == 1 + assert indexer._get_failed_titles("aniworld.to") == 1 + assert indexer._get_writer_lag("aniworld.to") == 0 -def test_catalog_discovery_logs_title_crawl_counts(monkeypatch): - import app.catalog.indexer as indexer_module +def test_stale_generation_detection_handles_running_and_published_states(): from app.catalog.indexer import ProviderCatalogIndexer - messages: list[str] = [] - - def fake_info(message: str, *args) -> None: - messages.append(message.format(*args)) - - def fake_crawl(_provider: str, *, observer=None) -> list[object]: - assert observer is not None - observer.on_index_loaded(10) - observer.on_title_crawled("slug-1") - time.sleep(0.03) - return [] - - monkeypatch.setattr(indexer_module, "_DISCOVERY_HEARTBEAT_SECONDS", 0.01) - monkeypatch.setattr(indexer_module, "crawl_provider_catalog", fake_crawl) - monkeypatch.setattr(indexer_module.logger, "info", fake_info) - - ProviderCatalogIndexer()._crawl_provider_catalog_with_heartbeat("aniworld.to") + indexer = ProviderCatalogIndexer() - assert any("loaded title index with 10 titles" in message for message in messages) - assert any("crawling title details 1/10 (10.0%)" in message for message in messages) + assert ( + indexer._stale_generation( + SimpleNamespace( + status="running", + current_generation="gen-running", + latest_success_generation="gen-old", + ) + ) + == "gen-running" + ) + assert ( + indexer._stale_generation( + SimpleNamespace( + status="failed", + current_generation="gen-staging", + latest_success_generation="gen-old", + ) + ) + == "gen-staging" + ) + assert ( + 
indexer._stale_generation( + SimpleNamespace( + status="ready", + current_generation="gen-live", + latest_success_generation="gen-live", + ) + ) + is None + ) def test_catalog_recovers_interrupted_running_state(monkeypatch): @@ -77,12 +94,14 @@ def test_catalog_recovers_interrupted_running_state(monkeypatch): from app.catalog.indexer import ProviderCatalogIndexer updates: list[dict[str, object]] = [] + cleaned: list[tuple[str, str]] = [] warnings: list[str] = [] statuses = { "aniworld.to": SimpleNamespace( provider="aniworld.to", status="running", bootstrap_completed=False, + current_generation="staging-123", latest_started_at=None, latest_success_generation=None, next_refresh_after=None, @@ -94,6 +113,7 @@ def test_catalog_recovers_interrupted_running_state(monkeypatch): provider="megakino", status="ready", bootstrap_completed=True, + current_generation="abc123", latest_started_at=None, latest_success_generation="abc123", next_refresh_after=None, @@ -119,10 +139,16 @@ def fake_upsert_provider_index_status(_session, **kwargs): updates.append(kwargs) return None + def fake_delete_provider_generation(_session, *, provider: str, generation: str): + cleaned.append((provider, generation)) + monkeypatch.setattr(indexer_module, "Session", lambda _engine: FakeSession()) monkeypatch.setattr( indexer_module, "get_provider_index_status", fake_get_provider_index_status ) + monkeypatch.setattr( + indexer_module, "delete_provider_generation", fake_delete_provider_generation + ) monkeypatch.setattr( indexer_module, "upsert_provider_index_status", @@ -132,8 +158,12 @@ def fake_upsert_provider_index_status(_session, **kwargs): ProviderCatalogIndexer()._ensure_status_rows() - assert any("recovered interrupted run for aniworld.to" in item for item in warnings) + assert any( + "found interrupted staging generation for aniworld.to" in item + for item in warnings + ) assert any("Initial bootstrap required" in item for item in warnings) + assert cleaned == [("aniworld.to", 
"staging-123")] assert any( update.get("provider") == "aniworld.to" and update.get("status") == "pending" for update in updates diff --git a/internal/agents/api.md b/internal/agents/api.md index 46f677d5..aca058b7 100644 --- a/internal/agents/api.md +++ b/internal/agents/api.md @@ -8,7 +8,9 @@ ## Health Endpoint (`/health`) - Method: GET -- Response: JSON with `status`, `database`, `scheduler`, `download_dir`, `version`, `runtime` +- Response: JSON with `status` and `catalog` +- `catalog` includes bootstrap readiness plus per-provider crawl/persist counters, + queue depth, writer lag, current slug, and staging-vs-live generation state. ## Torznab Namespace (`/torznab/api`) diff --git a/internal/agents/architecture.md b/internal/agents/architecture.md index 08358acc..39cbd9a9 100644 --- a/internal/agents/architecture.md +++ b/internal/agents/architecture.md @@ -45,6 +45,10 @@ ## Scheduler & Background Services - Thread pool size controlled by `MAX_CONCURRENCY` (default 3). +- Provider catalog indexing now runs as a staged streaming pipeline: + crawler workers emit completed titles into a bounded queue, one writer thread + persists batches into SQLite, and generation promotion only happens after the + full refresh succeeds. - Cleanup thread deletes downloads older than `DOWNLOADS_TTL_HOURS`. - Public IP monitor runs only when `PUBLIC_IP_CHECK_ENABLED=true`. - Lifespan ensures graceful shutdown of scheduler, DB engine, and background threads. @@ -53,5 +57,7 @@ - Loguru configuration lives in `apps/api/app/utils/logger.py`. - `TerminalLogger` duplicates stdout/stderr to `data/terminal-YYYY-MM-DD.log`. -- `/health` endpoint provides liveness/readiness details. +- `/health` and `/health/catalog` expose provider bootstrap state plus crawl, + persistence, queue-depth, and staging-generation progress for the catalog + indexer. - Update notifier logs when new GitHub releases are available. 
diff --git a/internal/agents/configuration.md b/internal/agents/configuration.md index 295d004f..1fad19e7 100644 --- a/internal/agents/configuration.md +++ b/internal/agents/configuration.md @@ -11,6 +11,8 @@ AniBridge centralizes configuration in `apps/api/app/config.py`. Values are deri `PROVIDER_REDIRECT_RETRIES`, `PROVIDER_CHALLENGE_BACKOFF_SECONDS`, `MAX_CONCURRENCY`, `DOWNLOAD_RATE_LIMIT_BYTES_PER_SEC`, `DOWNLOADS_TTL_HOURS`, `CLEANUP_SCAN_INTERVAL_MIN` +- Provider catalog index: `PROVIDER_INDEX_*` refresh cadence, queue bounds, + writer batching, and failure-threshold controls for staged catalog refreshes - STRM: `STRM_FILES_MODE`, `STRM_PROXY_*` - Networking policy: external VPN/VPN-sidecar routing only + `PUBLIC_IP_CHECK_*` - Video-host order default: `VOE,Filemoon,Streamtape,Vidmoly,Doodstream,LoadX,Luluvdo,Vidoza` via `PROVIDER_ORDER`, mapped at runtime to `VIDEO_HOST_ORDER` @@ -46,59 +48,74 @@ AniBridge centralizes configuration in `apps/api/app/config.py`. Values are deri 17. `MEGAKINO_TITLES_REFRESH_HOURS` — Megakino refresh interval. 18. `MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN` — Megakino domain checker interval. 19. `CATALOG_SITES` — Enabled catalogue sites. -20. `SOURCE_TAG` — Release source tag (default `WEB`). -21. `RELEASE_GROUP` — Release group label (default `aniworld`). -22. `RELEASE_GROUP_ANIWORLD` — AniWorld release group override. -23. `RELEASE_GROUP_STO` — s.to release group override. -24. `PROVIDER_ORDER` — Comma-separated video-host priority input; mapped at runtime to `VIDEO_HOST_ORDER`. -25. `PROVIDER_REDIRECT_TIMEOUT_SECONDS` — Timeout for resolving catalogue redirect tokens into video-host URLs (default `12`). -26. `PROVIDER_REDIRECT_RETRIES` — Extra retry attempts for transient video-host redirect failures (default `2`). -27. `PROVIDER_CHALLENGE_BACKOFF_SECONDS` — Base cool-down for Turnstile challenge retries (default `300`). -28. `MAX_CONCURRENCY` — Thread pool size (default `3`). -29. 
`DOWNLOAD_RATE_LIMIT_BYTES_PER_SEC` — Per-download yt-dlp rate cap (`0` disables). -30. `INDEXER_NAME` — Torznab display name (default `AniBridge Torznab`). -31. `INDEXER_API_KEY` — Optional Torznab API key. -32. `TORZNAB_CAT_ANIME` — Category mapping (default `5070`). -33. `TORZNAB_CAT_MOVIE` — Movie category mapping (default `2000`). -34. `AVAILABILITY_TTL_HOURS` — Availability cache TTL (default `24`). -35. `TORZNAB_FAKE_SEEDERS` — Seeders in results (default `999`). -36. `TORZNAB_FAKE_LEECHERS` — Leechers in results (default `787`). -37. `TORZNAB_RETURN_TEST_RESULT` — Return test item (default `true`). -38. `TORZNAB_TEST_TITLE` — Test item title. -39. `TORZNAB_TEST_SLUG` — Test item slug. -40. `TORZNAB_TEST_SEASON` — Test season number. -41. `TORZNAB_TEST_EPISODE` — Test episode number. -42. `TORZNAB_TEST_LANGUAGE` — Test language label. -43. `TORZNAB_SEASON_SEARCH_MODE` — Season-search execution mode (`fast`/`strict`, default `fast`). -44. `TORZNAB_SEASON_SEARCH_MAX_EPISODES` — Season-search fallback probe ceiling (default `60`). -45. `TORZNAB_SEASON_SEARCH_MAX_CONSECUTIVE_MISSES` — Season-search fallback stop threshold (default `3`). -46. `DELETE_FILES_ON_TORRENT_DELETE` — Remove files on delete (default `true`). -47. `DOWNLOADS_TTL_HOURS` — TTL cleanup threshold (default `0`, disabled). -48. `CLEANUP_SCAN_INTERVAL_MIN` — Cleanup interval (default `30`). -49. `STRM_FILES_MODE` — STRM mode (`no`, `both`, `only`, default `no`). -50. `STRM_PROXY_MODE` — STRM proxy mode (`direct`, `proxy`, `redirect`, default `direct`). -51. `STRM_PUBLIC_BASE_URL` — Public base URL for STRM proxy URLs. -52. `STRM_PROXY_AUTH` — STRM proxy auth mode (`none`, `token`, `apikey`). -53. `STRM_PROXY_SECRET` — Shared secret for STRM proxy auth. -54. `STRM_PROXY_UPSTREAM_ALLOWLIST` — Comma-separated upstream host allowlist. -55. `STRM_PROXY_CACHE_TTL_SECONDS` — STRM URL cache TTL in seconds (default `0`). -56. 
`STRM_PROXY_TOKEN_TTL_SECONDS` — STRM proxy token TTL in seconds (default `900`). -57. `PROGRESS_FORCE_BAR` — Force progress bar (default `false`). -58. `PROGRESS_STEP_PERCENT` — Progress logging step (default `5`). -59. `ANIBRIDGE_UPDATE_CHECK` — Enable release polling (default `true`). -60. `ANIBRIDGE_GITHUB_TOKEN` — GitHub API token. -61. `ANIBRIDGE_GITHUB_OWNER` — GitHub owner (default `zzackllack`). -62. `ANIBRIDGE_GITHUB_REPO` — Repo name (default `AniBridge`). -63. `ANIBRIDGE_GHCR_IMAGE` — GHCR image slug (default `zzackllack/anibridge`). -64. `PUBLIC_IP_CHECK_ENABLED` — Enable periodic public IP logging (default `false`). -65. `PUBLIC_IP_CHECK_INTERVAL_MIN` — Public IP check interval minutes (default `30`). -66. `ANIBRIDGE_HOST` — Bind host. -67. `ANIBRIDGE_PORT` — Bind port. -68. `ANIBRIDGE_CORS_ORIGINS` — CORS origins. -69. `ANIBRIDGE_CORS_ALLOW_CREDENTIALS` — CORS credentials behavior. -70. `ANIBRIDGE_TEST_MODE` — Test-mode runtime toggle. -71. `PYTHONUNBUFFERED` — Set to `1` in Docker to keep logs flush. -72. `SONARR_*`, `PROWLARR_*` — Integration values documented in `docs/src/integrations/clients`. +20. `PROVIDER_INDEX_REFRESH_HOURS` — Default staged provider-index refresh cadence (default `24`). +21. `PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD` — AniWorld provider-index cadence override. +22. `PROVIDER_INDEX_REFRESH_HOURS_STO` — s.to provider-index cadence override. +23. `PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO` — megakino provider-index cadence override. +24. `PROVIDER_INDEX_SCHEDULER_POLL_SECONDS` — Scheduler poll interval for due refreshes (default `60`). +25. `PROVIDER_INDEX_GLOBAL_CONCURRENCY` — Max concurrent provider refreshes (default `1`). +26. `PROVIDER_INDEX_CONCURRENCY_ANIWORLD` — AniWorld title crawl worker count. +27. `PROVIDER_INDEX_CONCURRENCY_STO` — s.to title crawl worker count. +28. `PROVIDER_INDEX_CONCURRENCY_MEGAKINO` — megakino crawl worker count. +29. 
`PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS` — Soft title crawl timeout threshold (default `45`). +30. `PROVIDER_INDEX_QUEUE_SIZE` — Bounded title-result queue depth between crawlers and the SQLite writer (default `32`). +31. `PROVIDER_INDEX_WRITER_BATCH_SIZE` — SQLite writer commit batch size (default `8`). +32. `PROVIDER_INDEX_WRITER_FLUSH_SECONDS` — Max wait before the writer flushes a partial batch (default `1.0`). +33. `PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT` — Refresh abort threshold for failed title crawls (default `20`). +34. `PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS` — Minimum interval between repeated queue-backpressure logs (default `15`). +35. `SOURCE_TAG` — Release source tag (default `WEB`). +36. `RELEASE_GROUP` — Release group label (default `aniworld`). +37. `RELEASE_GROUP_ANIWORLD` — AniWorld release group override. +38. `RELEASE_GROUP_STO` — s.to release group override. +39. `PROVIDER_ORDER` — Comma-separated video-host priority input; mapped at runtime to `VIDEO_HOST_ORDER`. +40. `PROVIDER_REDIRECT_TIMEOUT_SECONDS` — Timeout for resolving catalogue redirect tokens into video-host URLs (default `12`). +41. `PROVIDER_REDIRECT_RETRIES` — Extra retry attempts for transient video-host redirect failures (default `2`). +42. `PROVIDER_CHALLENGE_BACKOFF_SECONDS` — Base cool-down for Turnstile challenge retries (default `300`). +43. `MAX_CONCURRENCY` — Thread pool size (default `3`). +44. `DOWNLOAD_RATE_LIMIT_BYTES_PER_SEC` — Per-download yt-dlp rate cap (`0` disables). +45. `INDEXER_NAME` — Torznab display name (default `AniBridge Torznab`). +46. `INDEXER_API_KEY` — Optional Torznab API key. +47. `TORZNAB_CAT_ANIME` — Category mapping (default `5070`). +48. `TORZNAB_CAT_MOVIE` — Movie category mapping (default `2000`). +49. `AVAILABILITY_TTL_HOURS` — Availability cache TTL (default `24`). +50. `TORZNAB_FAKE_SEEDERS` — Seeders in results (default `999`). +51. `TORZNAB_FAKE_LEECHERS` — Leechers in results (default `787`). +52. 
`TORZNAB_RETURN_TEST_RESULT` — Return test item (default `true`). +53. `TORZNAB_TEST_TITLE` — Test item title. +54. `TORZNAB_TEST_SLUG` — Test item slug. +55. `TORZNAB_TEST_SEASON` — Test season number. +56. `TORZNAB_TEST_EPISODE` — Test episode number. +57. `TORZNAB_TEST_LANGUAGE` — Test language label. +58. `TORZNAB_SEASON_SEARCH_MODE` — Season-search execution mode (`fast`/`strict`, default `fast`). +59. `TORZNAB_SEASON_SEARCH_MAX_EPISODES` — Season-search fallback probe ceiling (default `60`). +60. `TORZNAB_SEASON_SEARCH_MAX_CONSECUTIVE_MISSES` — Season-search fallback stop threshold (default `3`). +61. `DELETE_FILES_ON_TORRENT_DELETE` — Remove files on delete (default `true`). +62. `DOWNLOADS_TTL_HOURS` — TTL cleanup threshold (default `0`, disabled). +63. `CLEANUP_SCAN_INTERVAL_MIN` — Cleanup interval (default `30`). +64. `STRM_FILES_MODE` — STRM mode (`no`, `both`, `only`, default `no`). +65. `STRM_PROXY_MODE` — STRM proxy mode (`direct`, `proxy`, `redirect`, default `direct`). +66. `STRM_PUBLIC_BASE_URL` — Public base URL for STRM proxy URLs. +67. `STRM_PROXY_AUTH` — STRM proxy auth mode (`none`, `token`, `apikey`). +68. `STRM_PROXY_SECRET` — Shared secret for STRM proxy auth. +69. `STRM_PROXY_UPSTREAM_ALLOWLIST` — Comma-separated upstream host allowlist. +70. `STRM_PROXY_CACHE_TTL_SECONDS` — STRM URL cache TTL in seconds (default `0`). +71. `STRM_PROXY_TOKEN_TTL_SECONDS` — STRM proxy token TTL in seconds (default `900`). +72. `PROGRESS_FORCE_BAR` — Force progress bar (default `false`). +73. `PROGRESS_STEP_PERCENT` — Progress logging step (default `5`). +74. `ANIBRIDGE_UPDATE_CHECK` — Enable release polling (default `true`). +75. `ANIBRIDGE_GITHUB_TOKEN` — GitHub API token. +76. `ANIBRIDGE_GITHUB_OWNER` — GitHub owner (default `zzackllack`). +77. `ANIBRIDGE_GITHUB_REPO` — Repo name (default `AniBridge`). +78. `ANIBRIDGE_GHCR_IMAGE` — GHCR image slug (default `zzackllack/anibridge`). +79. 
`PUBLIC_IP_CHECK_ENABLED` — Enable periodic public IP logging (default `false`). +80. `PUBLIC_IP_CHECK_INTERVAL_MIN` — Public IP check interval minutes (default `30`). +81. `ANIBRIDGE_HOST` — Bind host. +82. `ANIBRIDGE_PORT` — Bind port. +83. `ANIBRIDGE_CORS_ORIGINS` — CORS origins. +84. `ANIBRIDGE_CORS_ALLOW_CREDENTIALS` — CORS credentials behavior. +85. `ANIBRIDGE_TEST_MODE` — Test-mode runtime toggle. +86. `PYTHONUNBUFFERED` — Set to `1` in Docker to keep log output unbuffered. +87. `SONARR_*`, `PROWLARR_*` — Integration values documented in `docs/src/integrations/clients`. ## Removed Legacy Proxy Variables From 9402e9704f3d059fd9bec3050f9c5466c3c3422a Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 20:53:10 +0200 Subject: [PATCH 21/45] docs(specs): performance considerations for provider catalog indexing --- .../03-performance.md | 1222 +++++++++++++++++ 1 file changed, 1222 insertions(+) create mode 100644 internal/specs/017-provider-catalog-index/03-performance.md diff --git a/internal/specs/017-provider-catalog-index/03-performance.md b/internal/specs/017-provider-catalog-index/03-performance.md new file mode 100644 index 00000000..ee4327c0 --- /dev/null +++ b/internal/specs/017-provider-catalog-index/03-performance.md @@ -0,0 +1,1222 @@ +# AniBridge Provider Catalog Indexing: Memory-Bounded Local Bootstrap and Progressive Enrichment Specification + +## 1. 
Purpose + +This specification defines the required redesign of AniBridge's provider catalog indexing system so that: + +- first startup is usable quickly; +- full provider catalog data is eventually persisted locally; +- memory stays bounded under realistic self-hosted deployments; +- no official prebuilt provider-derived database or JSON catalog is shipped by the project; +- request paths continue to read from the local database instead of doing uncontrolled live probing; +- SQLite remains viable as the default database backend; +- normal users do not need to manually tune concurrency, queue sizes, retry intervals, or caching settings. + +The core design principle is: + +> AniBridge must compute provider-derived catalog data locally, progressively, and with strict memory bounds. The app must never require loading provider-scale state into Python memory. + +This document focuses only on performance, memory, startup behavior, indexing architecture, and SQLite-safety. + +The following topics are explicitly out of scope for this task: + +- import/export of catalog snapshots; +- official prebuilt provider catalog snapshots; +- Uvicorn reload behavior; +- legal policy text changes; +- UI polish beyond minimal progress/readiness information needed for correct behavior. + +--- + +## 2. Background and Current Problem + +The current `provider-catalog-index` branch already moved from a previous "crawl everything then persist everything" design toward a streaming persistence model. + +However, during realistic first bootstrap runs the container can still grow to multiple gigabytes of RAM and may eventually stop or be killed by the container/runtime. + +The observed and discussed problems are: + +1. The catalog contains many titles: + - AniWorld has thousands of titles. + - `s.to` has more than ten thousand titles. + - Each title may expand into many episodes, languages, host hints, aliases, and canonical metadata mappings. + +2. 
Python object overhead is large: + - HTML responses; + - BeautifulSoup parse trees; + - provider library objects; + - `TitleRecord` dataclasses; + - nested episode/language objects; + - canonical metadata payloads; + - dictionaries/lists/strings; + - SQLAlchemy ORM identity map objects. + +3. A bounded result queue does not automatically bound total process memory because memory can also be retained by: + - active crawler workers; + - `Future` objects; + - global metadata caches; + - still-running timed-out workers; + - SQLAlchemy sessions; + - retry storms after failures; + - concurrent provider refreshes. + +4. SQLite write contention can happen when multiple provider writer paths or failure/status updates write concurrently. + +5. The first-run user experience is poor if a full crawl is required before the software can be used. + +6. Setting all concurrency to `1` is not an acceptable product solution. It is only an emergency workaround. + +7. Shipping a finished provider-derived database would improve UX but creates a worse legal/platform risk posture because the project would distribute a curated provider index as an official artifact. + +Therefore, AniBridge needs a local progressive indexing architecture that is fast enough for normal use, memory-bounded by default, and safe for SQLite. + +--- + +## 3. High-Level Product Goal + +On a fresh install, AniBridge should behave like this: + +1. Start the container. +2. Initialize the database. +3. Quickly build a lightweight provider title index: + - slug; + - title; + - aliases; + - provider-relative path; + - media type hint when available. +4. Mark the provider as searchable once the title index exists. +5. Allow the application to respond using database-backed catalog data. +6. Continue crawling expensive details in the background: + - seasons; + - episodes; + - available languages; + - host hints; + - canonical TVDB mappings; + - other normalized mapping data. +7. 
Never hold the full provider catalog in Python memory. +8. Keep RAM usage bounded below a strict target. + +Target resource goals: + +- Normal idle runtime: below 512 MB RAM. +- During provider indexing: preferably below 1 GB RAM. +- Absolute design target: no normal first-run indexing path should exceed 1 GB RAM. +- The implementation must avoid any algorithm where memory grows approximately linearly with total provider size. + +These targets are product requirements, not tuning suggestions. + +--- + +## 4. Legal/Risk-Driven Distribution Decision + +AniBridge must not ship or automatically download an official prebuilt provider-derived catalog database or JSON file as part of this task. + +Rationale: + +- Source code distribution and provider-derived catalog data distribution have different risk profiles. +- A prebuilt catalog would likely be considered an official project artifact. +- A prebuilt catalog could be attacked more easily through takedown or platform complaint processes because it is a concrete curated index. +- The project should avoid becoming the distributor of provider-derived operational metadata. + +Therefore: + +- An official provider-derived catalog snapshot must not be added. +- A GitHub Release catalog asset must not be required for first startup. +- Automatic download of a project-hosted provider catalog must not be implemented. +- A bundled provider-derived SQLite/JSON catalog must not be included in the Docker image. +- Local self-hosted computation remains the default. + +This does not prohibit normal database migrations, schema files, empty seed tables, or code-defined provider configuration. + +--- + +## 5. Non-Goals + +Do not implement the following in this task: + +1. Import/export of provider catalogs. +2. Official provider-derived database snapshots. +3. External hosted catalog update service. +4. Uvicorn reload changes. +5. Replacing SQLite with PostgreSQL as a requirement. +6. Requiring Redis, Celery, or another distributed job system. +7. 
A complex UI redesign. +8. Live probing as the normal search/request path. +9. Large-scale user-configurable indexing strategy UI. +10. Any feature that requires the user to manually understand queue sizes, concurrency, or SQLite locks. + +--- + +## 6. Core Architectural Decision + +The selected architecture is: + +```text +Local progressive indexing + +Phase A: lightweight provider title index + -> fast + -> memory bounded + -> enables basic catalog search/readiness + +Phase B: provider detail enrichment + -> background + -> bounded concurrency + -> writes episodes/languages/host hints + +Phase C: canonical metadata enrichment + -> background + -> bounded concurrency + -> DB-backed or bounded cache only + -> writes TVDB/canonical mappings + +Request path: + -> database only + -> no uncontrolled live provider crawl fallback + -> may trigger explicit targeted warm-up only when designed as a DB-writing indexing job +``` + +Important: + +* The system must not wait for Phase B and Phase C to finish before the app is considered basically usable. +* The request path must not directly perform expensive live crawling as a hidden fallback. +* Any targeted on-demand indexing must be explicit, bounded, persisted, and visible as an indexing job. + +--- + +## 7. Definitions + +### Provider + +A catalog source such as: + +* `aniworld.to` +* `s.to` +* `megakino` + +### Title Index + +The lightweight list of provider titles. + +Contains only cheap title-level metadata: + +* provider key; +* slug; +* display title; +* normalized title; +* aliases; +* normalized aliases; +* provider-relative path; +* media type hint if known; +* generation; +* indexed timestamp. + +This phase must not crawl every title detail page. + +### Detail Index + +The expanded provider-level metadata for a title: + +* seasons; +* episodes; +* episode-relative paths; +* episode titles when available; +* available languages; +* host hints; +* media type hints; +* detail crawl state. 
+ +### Canonical Index + +Normalized mappings to external canonical metadata, for example: + +* TVDB series ID; +* canonical episode mapping; +* confidence; +* source; +* rationale. + +### Generation + +A version identifier for a consistent indexing pass. + +AniBridge may continue using the existing provider generation model, but the implementation must support partial/progressive stages without exposing inconsistent full-catalog state as complete. + +### Bootstrap Readiness + +There are multiple readiness levels: + +1. `title_ready` + + * The provider title index exists. + * Basic title search can work from DB. + +2. `detail_ready` + + * A meaningful detail crawl has completed for the provider. + +3. `canonical_ready` + + * Canonical mapping enrichment has completed or reached a configured baseline. + +4. `full_ready` + + * Detail and canonical enrichment have completed for the current provider generation. + +The app must not treat "title ready" and "full ready" as the same thing. + +--- + +## 8. Required Startup Behavior + +On application startup: + +1. Apply database migrations. +2. Ensure provider indexing status rows/stage rows exist. +3. Detect interrupted indexing work. +4. Clean up abandoned staging generations if necessary. +5. Start the provider catalog scheduler. +6. Schedule title index bootstrap for any provider that is not `title_ready`. +7. Schedule detail enrichment for title-ready providers that are not detail-ready. +8. Schedule canonical enrichment for titles/details that are not canonical-ready. +9. Do not block the web server until full catalog indexing is complete. + +The app must become reachable even if provider indexing is still running. + +Health/readiness responses must clearly distinguish: + +```text +app_ready: true +catalog_title_ready: true/false +catalog_detail_ready: true/false +catalog_canonical_ready: true/false +catalog_full_ready: true/false +provider phases... +``` + +--- + +## 9. 
Required Request Behavior + +### 9.1 Search and Torznab-like Query Behavior + +Request handlers must read from the database. + +They must not perform uncontrolled live crawling of provider title pages or provider episode pages during normal search. + +If the title index is not ready: + +* Return a clear "catalog title index is still initializing" response. +* Do not trigger large live crawling inside the request. + +If a title is found but details are not indexed yet: + +* Return only DB-backed data that is available; or +* Return a clear "details are still being indexed" message; or +* Optionally enqueue a bounded targeted warm-up job if that endpoint behavior is explicitly implemented. + +The request must not block for a full provider crawl. + +### 9.2 Optional Targeted Warm-Up + +A targeted warm-up feature may be implemented because it improves first-run usability without requiring a global prebuilt DB. + +If implemented, it must follow these rules: + +* It must be explicit. +* It must index one title or a small bounded set of titles. +* It must write results to the DB. +* It must use the same memory-bounded crawler pipeline as background indexing. +* It must not bypass persistence. +* It must not become an uncontrolled live-probing fallback for every request. +* It must expose job/progress state or at least clear pending/completed behavior. +* It must obey concurrency limits. + +Example behavior: + +```text +User searches title. +Title exists in DB title index. +Details missing. +System may enqueue targeted detail indexing for that title. +Request returns "details indexing queued/in progress" instead of doing full live work inline. +``` + +--- + +## 10. Indexing Pipeline + +The provider indexing pipeline must be split into stages. + +### 10.1 Stage A: Provider Title Index Bootstrap + +Purpose: + +* Quickly make provider titles searchable from DB. +* Avoid expensive per-title crawling. +* Avoid canonical metadata fetching. +* Avoid episode/detail expansion. 
+ +Input: + +* Provider alphabet/index page. +* Local provider-specific title index source if configured. + +Output: + +* `ProviderCatalogTitle` rows. +* `ProviderCatalogAlias` rows. +* Stage status updated to `title_ready`. + +Hard requirements: + +* Must not construct one huge list of expanded `TitleRecord` objects. +* Must not crawl every episode page. +* Must not query SkyHook/TVDB/canonical metadata. +* Must write to SQLite in small batches. +* Must use direct SQL or short-lived SQLAlchemy sessions to avoid large identity maps. +* Must commit frequently. +* Must be safe to restart. + +Recommended default: + +* Title index bootstrap may run quickly and with low concurrency because it is not the bottleneck. +* It should complete in seconds to a few minutes, not hours. + +### 10.2 Stage B: Provider Detail Enrichment + +Purpose: + +* Crawl title detail pages. +* Persist episode/language/host-hint data. + +Input: + +* Title rows from the DB that are missing detail enrichment or are stale. + +Output: + +* `ProviderCatalogEpisode` rows. +* `ProviderEpisodeLanguage` rows. +* `ProviderTitleIndexState` detail success/failure state. +* Updated detail stage progress. + +Hard requirements: + +* Must process titles incrementally. +* Must never hold the entire provider detail catalog in memory. +* Must never accumulate a provider-sized Python list of detail results. +* Must use bounded title crawl concurrency. +* Must use a bounded queue or direct row-command pipeline. +* Must write to SQLite continuously. +* Must drop HTML/Soup/provider objects as soon as the title is persisted. +* Must tolerate per-title failures. +* Must not fail the whole provider because one title fails unless failure rate exceeds a configured threshold. +* Must respect retry backoff. + +### 10.3 Stage C: Canonical Metadata Enrichment + +Purpose: + +* Resolve titles/episodes to canonical IDs/mappings. +* Write canonical series and episode mapping rows. + +Input: + +* Provider titles/details from DB. 
+ +Output: + +* `CanonicalSeries` rows. +* `CanonicalEpisode` rows. +* `ProviderSeriesMapping` rows. +* `ProviderEpisodeMapping` rows. +* `ProviderMovieMapping` rows if applicable. + +Hard requirements: + +* Must not use unbounded in-memory caches. +* Must not keep thousands of full show payloads in process-global dictionaries. +* Must use either: + + * a DB-backed canonical metadata cache; or + * a strict in-memory LRU/TTL cache with a hard max size. +* Must have low bounded concurrency by default. +* Must write results incrementally. +* Must tolerate failed canonical lookups. +* Must not block title readiness. +* Must not block detail indexing unless a specific mapping is required for that operation. + +Recommended default: + +* Canonical enrichment should be slower and safer than provider title/detail crawling. +* Use conservative concurrency because external metadata APIs can be slow and responses can be large. + +--- + +## 11. Memory-Bounding Requirements + +The implementation must satisfy all of the following. + +### 11.1 No Provider-Sized In-Memory Results + +Forbidden: + +```python +all_titles = crawl_entire_provider() +persist(all_titles) +``` + +Forbidden: + +```python +titles: list[TitleRecord] = [] +for title in provider: + titles.append(crawl_title(title)) +``` + +Required: + +```text +for each title: + crawl title + normalize to compact rows + write rows to DB + drop temporary objects +``` + +### 11.2 Bounded Queues + +Any queue between crawler workers and persistence must have a fixed max size. + +Default queue size should be small enough to keep RAM bounded. + +Recommended defaults: + +```env +PROVIDER_INDEX_QUEUE_SIZE=8 +PROVIDER_INDEX_WRITER_BATCH_SIZE=32 +``` + +If the queue is full: + +* crawler workers must block; +* logs should indicate backpressure at a rate-limited interval; +* the system must not create an unbounded fallback list. 
+ +### 11.3 Compact Row Commands Preferred + +The queue should not contain large fully expanded objects if avoidable. + +Preferred queue item style: + +```text +PersistTitleDetailCommand + provider + slug + title fields + aliases + compact episode rows + compact language rows + compact mapping rows +``` + +Avoid queueing: + +* BeautifulSoup objects; +* raw HTML; +* HTTP responses; +* provider library model objects; +* full external API payloads; +* unnecessary canonical raw payloads. + +### 11.4 Drop Temporary Objects + +After each title is persisted: + +* raw HTML references must be released; +* BeautifulSoup references must be released; +* provider library objects must not be stored globally; +* large canonical payloads must be reduced to compact DB rows; +* SQLAlchemy sessions must be closed; +* batch lists must be cleared. + +### 11.5 Bounded Metadata Cache + +Process-global metadata caches must be bounded. + +Forbidden: + +```python +_search_cache = {} +_show_cache = {} +``` + +unless there is an enforced max size and eviction. + +Acceptable: + +```python +_search_cache = TTLCache(maxsize=512, ttl=3600) +_show_cache = TTLCache(maxsize=256, ttl=3600) +``` + +Better: + +* DB-backed canonical cache table; +* tiny in-memory LRU hot cache; +* compact cached payloads only. + +Cache requirements: + +* hard max entry count; +* TTL; +* no full provider-sized retention; +* no full raw API response retention if only a subset is needed; +* clear unit tests for max-size behavior. + +### 11.6 SQLAlchemy Session Discipline + +Persistence code must avoid long-lived sessions that accumulate thousands of ORM objects. 
+ +Required: + +* short sessions per batch; +* commit frequently; +* clear/close session after each batch; +* avoid unnecessary `select(...).all()` before deletes; +* prefer direct delete statements for replacing child rows; +* consider `session.expire_all()` or new session per batch if identity map growth is observed; +* avoid query-triggered autoflush surprises by structuring transactions carefully. + +--- + +## 12. SQLite Safety Requirements + +SQLite is the default database and must work safely. + +### 12.1 WAL and Busy Timeout + +The SQLite engine must enable: + +```sql +PRAGMA journal_mode=WAL; +PRAGMA synchronous=NORMAL; +PRAGMA busy_timeout=30000; +``` + +Also configure the DB connection timeout, for example: + +```python +connect_args={ + "check_same_thread": False, + "timeout": 30, +} +``` + +### 12.2 Single-Writer Discipline + +Provider indexing writes must be serialized. + +Acceptable implementation options: + +1. A single global DB writer queue for catalog indexing. +2. A global provider-index DB write lock. +3. A write coordinator that ensures only one catalog-index write transaction runs at a time. + +The implementation must ensure that these write paths do not run concurrently against SQLite: + +* title index batch writes; +* detail enrichment batch writes; +* canonical enrichment writes; +* provider status writes; +* title failure-state writes; +* generation cleanup/promotion writes. + +If using a global write lock, all catalog indexing writes must use it. + +### 12.3 Provider-Level Concurrency Defaults + +Default provider-level concurrency must be: + +```env +PROVIDER_INDEX_GLOBAL_CONCURRENCY=1 +``` + +This means: + +* only one provider refresh/enrichment stage should run at a time by default; +* inside that provider, title-level concurrency may still be greater than one. + +This default protects SQLite from cross-provider write contention while preserving useful crawl speed. 
+ +### 12.4 Title-Level Concurrency Defaults + +Recommended defaults: + +```env +PROVIDER_INDEX_CONCURRENCY_ANIWORLD=4 +PROVIDER_INDEX_CONCURRENCY_STO=4 +PROVIDER_INDEX_CONCURRENCY_MEGAKINO=2 +``` + +These are product defaults, not emergency values. + +The implementation must allow environment overrides, but the defaults must be safe for normal self-hosting. + +### 12.5 Retry Backoff Must Be Respected + +A failed provider or stage must not be retried immediately if `next_refresh_after` or equivalent retry timestamp is in the future. + +Required due logic: + +```python +if status is running: + return False + +if next_refresh_after exists and next_refresh_after > now: + return False + +otherwise: + return due according to status/stage rules +``` + +Forbidden behavior: + +```python +if latest_success_at is None: + return True +``` + +when a retry timestamp exists. + +This is critical to prevent first-bootstrap retry storms. + +--- + +## 13. Worker Timeout and Cancellation Requirements + +The current behavior of marking a title as timed out while leaving the underlying worker running is dangerous if the provider is retried quickly. + +Required behavior: + +1. All HTTP calls used by provider indexing must have hard timeouts. +2. A title timeout must prevent new work from piling up. +3. If a provider/stage fails, the scheduler must not immediately start another run while old workers may still be active. +4. The system must track active provider/stage workers and avoid duplicate runs. +5. `executor.shutdown(wait=False)` must not be used in a way that allows repeated retries while old threads continue consuming memory. +6. If Python threads cannot be safely stopped, the retry/backoff logic must account for that and wait long enough before another run. +7. Timed-out title work must not retain large result objects. + +Recommended implementation: + +* Prefer bounded waits and clean draining. +* Do not submit new futures after writer failure. +* Cancel pending futures. 
+* Let running futures finish within timeout. +* Mark provider/stage as failed with retry backoff if shutdown is incomplete. +* Do not reschedule until retry backoff has elapsed. + +--- + +## 14. Scheduler Requirements + +The scheduler must support staged indexing. + +### 14.1 Stage Ordering + +For each provider: + +1. Ensure title index exists. +2. Then schedule detail enrichment. +3. Then schedule canonical enrichment. + +Do not run expensive detail/canonical enrichment for a provider whose title index is missing. + +### 14.2 Fairness + +Default behavior should avoid one provider starving all others forever. + +Recommended: + +* Run one provider/stage at a time by default. +* Choose due work by priority: + + 1. missing title index; + 2. targeted warm-up jobs; + 3. missing detail enrichment; + 4. missing canonical enrichment; + 5. scheduled refreshes. + +### 14.3 Retry Backoff + +Each stage must have a retry timestamp after failure. + +Failures must not cause immediate retry loops. + +### 14.4 Progress State + +The scheduler must expose progress per provider and stage: + +* provider; +* stage; +* status; +* total titles when known; +* queued titles; +* active workers; +* completed titles; +* failed titles; +* persisted rows or persisted titles; +* current/recent slug; +* queue depth; +* writer lag; +* last error; +* next retry time; +* whether title index is ready; +* whether details are complete; +* whether canonical mappings are complete. + +--- + +## 15. Database Schema Requirements + +The exact schema may reuse existing models where appropriate, but the final system must be able to represent the following states clearly. + +### 15.1 Provider Stage Status + +There must be durable state for each provider and indexing stage. 
+ +Required logical fields: + +```text +provider +stage +status +generation +latest_success_generation +started_at +completed_at +latest_success_at +next_retry_after +total_items +completed_items +failed_items +cursor_slug +last_error_summary +updated_at +``` + +Stages should include at least: + +```text +title_index +detail_enrichment +canonical_enrichment +``` + +This may be implemented as: + +* a new `ProviderIndexStageStatus` table; or +* carefully extended existing provider status tables. + +The implementation must avoid ambiguous status fields where `bootstrap_completed` means different things in different contexts. + +### 15.2 Title Detail State + +There must be durable state per provider title for detail indexing. + +Required logical fields: + +```text +provider +slug +detail_status +detail_attempted_at +detail_success_at +detail_failure_count +detail_last_error_summary +canonical_status +canonical_attempted_at +canonical_success_at +canonical_failure_count +canonical_last_error_summary +updated_at +``` + +This may extend the existing `ProviderTitleIndexState` table. + +### 15.3 Canonical Metadata Cache + +If canonical metadata is cached in DB, use tables with compact payloads. + +Required logical fields: + +```text +cache_key +source +payload_compact_json +created_at +expires_at +last_used_at +``` + +The payload must be compact and contain only fields required by AniBridge mapping. + +Do not store unnecessarily large raw responses. + +### 15.4 Generation Visibility + +The request path must only serve rows from a generation considered visible/active for the relevant data type. + +If title index generation is ready but details are not complete: + +* title search may use title rows; +* detail-dependent endpoints must know details may be incomplete. + +Do not mark a provider as fully ready just because title rows exist. + +--- + +## 16. 
Persistence Requirements + +### 16.1 Replace Rows Efficiently + +For per-title child rows such as aliases, episodes, languages, and mappings: + +* delete old rows for that provider/title/stage; +* insert new rows; +* do not first load all old rows into Python unless needed. + +Avoid: + +```python +session.exec(select(Child).where(...)).all() +session.exec(delete(Child).where(...)) +``` + +Prefer: + +```python +session.exec(delete(Child).where(...)) +``` + +### 16.2 Batch Size + +Default writer batch size: + +```env +PROVIDER_INDEX_WRITER_BATCH_SIZE=32 +``` + +Rules: + +* batch size must be configurable; +* batch must be small enough to avoid large memory spikes; +* batch must be large enough to avoid one transaction per tiny row; +* writer must flush by size and by time. + +### 16.3 Failure Recording + +Title failures must be recorded durably. + +However, failure recording must not bypass SQLite write serialization. + +All failure-state writes must go through the same write coordinator or lock. + +### 16.4 Promotion + +For stages that use generations: + +* write into staging generation; +* mark stage generation successful only after successful completion; +* do not expose partial full-stage state as complete; +* if refresh fails, keep previous successful generation visible; +* clean up abandoned staging generation on restart or next run. + +For progressive detail enrichment, per-title successful writes may become visible if the API clearly treats detail completeness as per-title/progressive. This must not falsely report provider-wide `detail_ready`. + +--- + +## 17. Configuration Defaults + +The following default values must be safe for normal users. 
+ +```env +PROVIDER_INDEX_GLOBAL_CONCURRENCY=1 + +PROVIDER_INDEX_CONCURRENCY_ANIWORLD=4 +PROVIDER_INDEX_CONCURRENCY_STO=4 +PROVIDER_INDEX_CONCURRENCY_MEGAKINO=2 + +PROVIDER_INDEX_QUEUE_SIZE=8 +PROVIDER_INDEX_WRITER_BATCH_SIZE=32 +PROVIDER_INDEX_WRITER_FLUSH_SECONDS=1.0 + +PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS=45 +PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT=20.0 +PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS=15.0 + +CANONICAL_INDEX_CONCURRENCY=2 +CANONICAL_CACHE_MEMORY_MAX_SEARCH=512 +CANONICAL_CACHE_MEMORY_MAX_SHOW=256 +CANONICAL_CACHE_TTL_SECONDS=3600 +``` + +If exact variable names differ, implement equivalent settings and document them. + +Invalid environment values must be sanitized. + +For example: + +* concurrency less than 1 becomes 1; +* queue size less than 1 becomes 1; +* negative timeout uses default; +* invalid float/int logs warning and uses default. + +--- + +## 18. Observability Requirements + +Logs must make indexing behavior understandable without being spammy. + +Required log events: + +1. Scheduler startup with effective safe defaults. +2. Provider title index start. +3. Provider title index total loaded. +4. Title index batch persisted. +5. Title index ready. +6. Detail enrichment start. +7. Detail enrichment progress heartbeat. +8. Detail enrichment batch persisted. +9. Queue backpressure warning, rate-limited. +10. Per-title failure warning, rate-limited or summarized if noisy. +11. Canonical enrichment start. +12. Canonical enrichment progress. +13. Canonical cache hit/miss summary, not every request at info level. +14. Stage completed. +15. Stage failed with retry timestamp. +16. Stale staging generation cleanup. +17. SQLite lock retry/busy warning if encountered. +18. Memory budget warning if optional memory instrumentation is implemented. + +Health endpoint should expose provider/stage progress. + +--- + +## 19. Acceptance Criteria + +The implementation is complete only if all criteria below are satisfied. 
+ +### 19.1 Functional + +* Fresh install starts without a provider-derived snapshot. +* DB schema migrates successfully. +* Title index stage runs before detail/canonical enrichment. +* Title index stage writes searchable provider titles. +* App can report partial catalog readiness. +* Detail enrichment runs in background. +* Canonical enrichment runs in background or as a bounded follow-up stage. +* Request handlers read from DB. +* No normal request triggers a full provider crawl. +* Failed stages respect retry backoff. +* Interrupted staging generations are cleaned up or handled explicitly. + +### 19.2 Memory + +* No provider-scale list of expanded title records exists. +* Global canonical metadata caches are bounded or DB-backed. +* Queue sizes are bounded. +* SQLAlchemy sessions do not retain provider-scale identity maps. +* Large parse objects are not stored beyond a single title's processing. +* Default indexing should be designed to stay below 1 GB RAM. + +### 19.3 SQLite + +* WAL mode is enabled. +* Busy timeout is configured. +* Provider indexing writes are serialized. +* Cross-provider writer contention is avoided by default. +* `database is locked` should not be normal during first bootstrap. +* If SQLite lock contention still occurs, it must retry or fail gracefully with backoff, not start a retry storm. + +### 19.4 User Experience + +* Normal users do not need to tune concurrency. +* First startup does not require waiting for full provider detail/canonical enrichment. +* Basic title catalog readiness happens quickly. +* Progress is visible. +* Full enrichment continues automatically. + +### 19.5 Legal/Risk Boundary + +* No official provider-derived catalog DB/JSON is shipped. +* No automatic download of project-hosted provider-derived catalog data exists. +* Local computation remains the default. + +--- + +## 20. 
Suggested Implementation Plan + +### Step 1: Fix scheduler retry logic + +* Ensure retry timestamps are respected even during first bootstrap. +* Remove logic where `latest_success_at is None` forces immediate due status despite retry backoff. + +### Step 2: Add SQLite write safety + +* Add WAL, synchronous NORMAL, busy timeout. +* Add a global catalog-index write coordinator or lock. +* Ensure all catalog indexing writes use it. + +### Step 3: Introduce explicit stage status + +* Add or extend DB models to represent: + + * title index status; + * detail enrichment status; + * canonical enrichment status. +* Add migrations. +* Update health endpoint. + +### Step 4: Split title indexing from detail crawling + +* Implement fast provider title index stage. +* Persist titles/aliases only. +* Mark title-ready separately from full-ready. + +### Step 5: Rework detail enrichment to operate from DB rows + +* Query due title rows in small chunks. +* Crawl only those titles. +* Persist each result or small batch. +* Drop temporary objects. + +### Step 6: Bound or persist canonical caches + +* Replace unbounded dict caches. +* Prefer DB-backed compact canonical cache. +* If DB-backed cache is too large for this task, use strict TTL LRU cache with hard max sizes. + +### Step 7: Split canonical enrichment from provider detail crawl + +* Avoid canonical API calls during initial title bootstrap. +* Prefer canonical enrichment after details are persisted. +* Make it independently bounded and retryable. + +### Step 8: Add progress and tests + +* Add unit tests for: + + * scheduler due/backoff logic; + * bounded cache max size; + * no immediate retry after failure; + * title stage does not call detail crawler; + * SQLite write coordinator is used. +* Add integration tests for: + + * fresh DB startup state; + * title-ready before full-ready; + * failed provider retry backoff; + * detail enrichment persists incrementally. + +--- + +## 21. 
Explicit Implementation Constraints + +The implementation must not: + +* add official provider-derived catalog assets; +* require users to configure low concurrency manually; +* block startup until all provider details are indexed; +* use unbounded global caches; +* use unbounded queues; +* store raw HTML or BeautifulSoup objects in queues; +* allow multiple provider writers to fight over SQLite by default; +* retry failed bootstrap stages immediately; +* perform full provider live crawling inside request handlers; +* mark full catalog readiness when only title index readiness exists. + +--- + +## 22. Desired Final Behavior Example + +Fresh install: + +```text +Application startup. +Database migrations complete. +Provider catalog scheduler started. +aniworld.to title_index: running +s.to title_index: pending +megakino title_index: pending +Application HTTP server ready. +``` + +After title index for one provider: + +```text +aniworld.to title_index: ready, 2421 titles +aniworld.to detail_enrichment: running, 37/2421 titles +catalog_title_ready=true +catalog_full_ready=false +``` + +User searches: + +```text +Search query reads title DB. +Matching title is found. +If detail rows exist, return detail-backed result. +If detail rows do not exist, return clear "details indexing" state or enqueue explicit targeted warm-up. +``` + +Background continues: + +```text +aniworld.to detail_enrichment: running +queue_depth=3 +writer_lag=2 +memory remains bounded +``` + +Failure: + +```text +s.to detail_enrichment failed: network/provider issue +next_retry_after=... +scheduler does not retry before next_retry_after +previous visible data remains available +``` + +Completion: + +```text +aniworld.to title_index: ready +aniworld.to detail_enrichment: ready +aniworld.to canonical_enrichment: ready +aniworld.to full_ready=true +``` + +--- + +## 23. Summary + +The correct solution is not to lower all concurrency to one and not to ship a prebuilt provider catalog. 
+ +The correct solution is: + +```text +local progressive indexing ++ fast title index bootstrap ++ background detail enrichment ++ background canonical enrichment ++ strict memory bounds ++ bounded caches ++ SQLite single-writer discipline ++ WAL/busy timeout ++ retry backoff ++ DB-only request path +``` + +This preserves the self-hosted legal/risk boundary while making AniBridge practical for normal users. From b1db107e2486aaf0b1e7b6dd34e0ea665d76ef72 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 22:08:13 +0200 Subject: [PATCH 22/45] refactor(catalog): split provider indexing into staged bounded workers Redesign provider catalog indexing into durable title, detail, and canonical stages with DB-backed request reads. Add SQLite-safe serialized catalog writes, WAL/busy-timeout configuration, bounded canonical caches, and retry backoff that respects future retry timestamps. Update defaults for safe self-hosted concurrency and add targeted coverage for staged readiness, incremental persistence, cache bounds, and scheduler backoff. 
--- apps/api/.env.example | 32 +- apps/api/app/catalog/indexer.py | 1617 +++++++++++------ apps/api/app/catalog/metadata.py | 119 +- apps/api/app/catalog/providers.py | 404 ++-- apps/api/app/config.py | 33 +- .../20260429_0006_provider_index_stages.py | 195 ++ apps/api/app/db/models.py | 150 +- apps/api/tests/unit/catalog/test_indexer.py | 223 +++ apps/api/tests/unit/catalog/test_metadata.py | 41 + 9 files changed, 1880 insertions(+), 934 deletions(-) create mode 100644 apps/api/app/db/migrations/versions/20260429_0006_provider_index_stages.py create mode 100644 apps/api/tests/unit/catalog/test_metadata.py diff --git a/apps/api/.env.example b/apps/api/.env.example index 0d8d66a8..bf527da9 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -98,23 +98,23 @@ PROVIDER_INDEX_SCHEDULER_POLL_SECONDS=60 # Default: 1 PROVIDER_INDEX_GLOBAL_CONCURRENCY=1 # What: Per-provider crawl worker count for AniWorld title refreshes -# Default: 1 -PROVIDER_INDEX_CONCURRENCY_ANIWORLD=1 +# Default: 4 +PROVIDER_INDEX_CONCURRENCY_ANIWORLD=4 # What: Per-provider crawl worker count for s.to title refreshes -# Default: 1 -PROVIDER_INDEX_CONCURRENCY_STO=1 +# Default: 4 +PROVIDER_INDEX_CONCURRENCY_STO=4 # What: Per-provider crawl worker count for megakino title refreshes -# Default: 1 -PROVIDER_INDEX_CONCURRENCY_MEGAKINO=1 +# Default: 2 +PROVIDER_INDEX_CONCURRENCY_MEGAKINO=2 # What: Hard timeout in seconds for one provider title crawl # Default: 45 PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS=45 # What: Maximum number of completed title payloads allowed to wait in memory for DB persistence -# Default: 32 -PROVIDER_INDEX_QUEUE_SIZE=32 -# What: Max number of title payloads committed per SQLite writer batch # Default: 8 -PROVIDER_INDEX_WRITER_BATCH_SIZE=8 +PROVIDER_INDEX_QUEUE_SIZE=8 +# What: Max number of title payloads committed per SQLite writer batch +# Default: 32 +PROVIDER_INDEX_WRITER_BATCH_SIZE=32 # What: Max seconds the writer may hold a partial batch before forcing a commit # 
Default: 1.0 PROVIDER_INDEX_WRITER_FLUSH_SECONDS=1.0 @@ -124,6 +124,18 @@ PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT=20 # What: Minimum seconds between repeated queue-backpressure log lines while crawlers are blocked on persistence # Default: 15 PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS=15 +# What: Maximum parallel canonical metadata lookups per provider stage run +# Default: 2 +CANONICAL_INDEX_CONCURRENCY=2 +# What: Max hot in-memory canonical search cache entries +# Default: 512 +CANONICAL_CACHE_MEMORY_MAX_SEARCH=512 +# What: Max hot in-memory canonical show cache entries +# Default: 256 +CANONICAL_CACHE_MEMORY_MAX_SHOW=256 +# What: TTL in seconds for hot in-memory canonical cache entries +# Default: 3600 +CANONICAL_CACHE_TTL_SECONDS=3600 # What: Domain check interval in minutes (0 disables background checks) # Default: 100 diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index 6c57d2d6..96662899 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -1,23 +1,30 @@ from __future__ import annotations +from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait from dataclasses import dataclass from datetime import timedelta -from queue import Empty, Full, Queue +from queue import Empty, Queue from threading import Event, Lock, Semaphore, Thread from time import monotonic from uuid import uuid4 from loguru import logger -from sqlmodel import Session +from sqlmodel import Session, select from app.catalog.exceptions import CatalogNotReadyError from app.catalog.providers import ( + CanonicalPayload, CatalogCrawlObserver, + EpisodeLanguageRecord, + EpisodeRecord, TitleRecord, - stream_provider_catalog, + crawl_provider_title_detail, + load_provider_title_index, + resolve_provider_canonical, ) from app.config import ( ANIBRIDGE_TEST_MODE, + CANONICAL_INDEX_CONCURRENCY, CATALOG_SITES_LIST, CATALOG_SITE_CONFIGS, PROGRESS_STEP_PERCENT, @@ -26,15 +33,23 @@ PROVIDER_INDEX_GLOBAL_CONCURRENCY, 
PROVIDER_INDEX_QUEUE_SIZE, PROVIDER_INDEX_SCHEDULER_POLL_SECONDS, + PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, PROVIDER_INDEX_WRITER_BATCH_SIZE, PROVIDER_INDEX_WRITER_FLUSH_SECONDS, ) from app.db import ( + ProviderCatalogAlias, + ProviderCatalogEpisode, + ProviderCatalogTitle, + ProviderEpisodeLanguage, + ProviderIndexStatus, + ProviderTitleIndexState, as_aware_utc, delete_provider_generation, engine, get_provider_index_status, is_catalog_bootstrap_ready, + is_provider_fully_ready, list_provider_index_statuses, prune_provider_generation, replace_canonical_episodes, @@ -53,14 +68,16 @@ _INDEXER: "ProviderCatalogIndexer | None" = None _INDEXER_LOCK = Lock() -_UNSET = object() _QUEUE_SENTINEL = object() +_UNSET = object() +_STAGES = ("title_index", "detail_enrichment", "canonical_enrichment") @dataclass(slots=True) class ProviderCatalogProgress: provider: str phase: str = "pending" + stage: str = "title_index" crawled_titles: int = 0 persisted_titles: int = 0 failed_titles: int = 0 @@ -99,6 +116,19 @@ def crawl_percent(self) -> float | None: return round(max(0.0, min(100.0, completed / self.total_titles * 100.0)), 1) +class CatalogIndexWriteCoordinator: + def __init__(self) -> None: + self._lock = Lock() + + def run(self, callback): + with self._lock: + with Session(engine) as session: + result = callback(session) + if hasattr(session, "commit"): + session.commit() + return result + + def get_catalog_indexer() -> "ProviderCatalogIndexer": global _INDEXER with _INDEXER_LOCK: @@ -120,18 +150,15 @@ def get_catalog_readiness_error() -> str | None: by_provider = {item["provider"]: item for item in snapshot.get("providers", [])} for provider in CATALOG_SITES_LIST: status = get_provider_index_status(session, provider=provider) - if status is None or not status.bootstrap_completed: + search_ready = bool( + status is not None + and (status.title_index_status == "ready" or status.bootstrap_completed) + ) + if status is None or not search_ready: progress = by_provider.get(provider, 
{}) - processed = progress.get("processed_titles") - total = progress.get("total_titles") - percent = progress.get("progress_percent") - phase = progress.get("phase") or "pending" - if isinstance(processed, int) and isinstance(total, int) and total > 0: - pending.append( - f"{provider} ({processed}/{total}, {percent:.1f}%, {phase})" - ) - else: - pending.append(f"{provider} ({phase})") + pending.append( + f"{provider} ({progress.get('title_index_status') or 'pending'})" + ) if not pending: return None return ( @@ -156,6 +183,7 @@ def __init__(self) -> None: self._progress: dict[str, ProviderCatalogProgress] = {} self._workers_lock = Lock() self._workers: dict[str, Thread] = {} + self._writer = CatalogIndexWriteCoordinator() def start(self) -> None: self._ensure_status_rows() @@ -164,12 +192,6 @@ def start(self) -> None: return if self._thread is not None and self._thread.is_alive(): return - logger.info( - "Provider catalog scheduler starting: poll={}s global_concurrency={} providers={}", - PROVIDER_INDEX_SCHEDULER_POLL_SECONDS, - PROVIDER_INDEX_GLOBAL_CONCURRENCY, - ", ".join(CATALOG_SITES_LIST), - ) self._thread = Thread( target=self._run_loop, name="provider-catalog-indexer", @@ -189,63 +211,17 @@ def stop(self) -> None: def run_due_once(self) -> None: with Session(engine) as session: statuses = list_provider_index_statuses(session) - if not statuses: - logger.warning("Provider catalog scheduler: no provider status rows found") - return - logger.debug( - "Provider catalog scheduler pass: bootstrap_ready={} providers={}", - self._is_bootstrap_ready(), - ", ".join( - f"{status.provider}={status.status}" - for status in sorted(statuses, key=lambda item: item.provider) - ), - ) for status in statuses: if self._is_due(status): - logger.info( - "Provider catalog scheduler: {} is due (status={} bootstrap_completed={} next_refresh_after={} latest_success_at={})", - status.provider, - status.status, - status.bootstrap_completed, - status.next_refresh_after.isoformat() - 
if status.next_refresh_after is not None - else None, - status.latest_success_at.isoformat() - if status.latest_success_at is not None - else None, - ) self.refresh_provider(status.provider) - else: - logger.debug( - "Provider catalog scheduler: {} not due (status={} bootstrap_completed={} next_refresh_after={} latest_success_at={})", - status.provider, - status.status, - status.bootstrap_completed, - status.next_refresh_after.isoformat() - if status.next_refresh_after is not None - else None, - status.latest_success_at.isoformat() - if status.latest_success_at is not None - else None, - ) def refresh_provider(self, provider: str) -> None: with self._workers_lock: existing = self._workers.get(provider) if existing is not None and existing.is_alive(): - logger.debug( - "Provider catalog scheduler: {} already running in worker {}", - provider, - existing.name, - ) return if not self._active.acquire(blocking=False): - logger.warning( - "Provider catalog scheduler: concurrency exhausted, skipping {} for now", - provider, - ) return - logger.info("Provider catalog scheduler: starting refresh for {}", provider) worker = Thread( target=self._run_provider_refresh, name=f"provider-index-{provider}", @@ -270,6 +246,7 @@ def get_progress_snapshot(self) -> dict[str, object]: provider: ProviderCatalogProgress( provider=snapshot.provider, phase=snapshot.phase, + stage=snapshot.stage, crawled_titles=snapshot.crawled_titles, persisted_titles=snapshot.persisted_titles, failed_titles=snapshot.failed_titles, @@ -285,25 +262,39 @@ def get_progress_snapshot(self) -> dict[str, object]: for provider in CATALOG_SITES_LIST: status = statuses.get(provider) progress = runtime.get(provider, ProviderCatalogProgress(provider=provider)) - phase = progress.phase - if phase == "pending" and status is not None: - phase = status.status - latest_success_generation = ( - status.latest_success_generation if status is not None else None - ) - current_generation = ( - status.current_generation if status 
is not None else None - ) providers.append( { "provider": provider, "status": status.status if status is not None else "pending", + "active_stage": status.active_stage if status is not None else None, "bootstrap_completed": ( bool(status.bootstrap_completed) if status is not None else False ), - "phase": phase, + "title_index_status": ( + status.title_index_status if status is not None else "pending" + ), + "detail_enrichment_status": ( + status.detail_enrichment_status + if status is not None + else "pending" + ), + "canonical_enrichment_status": ( + status.canonical_enrichment_status + if status is not None + else "pending" + ), + "search_ready": bool( + status is not None + and ( + status.title_index_status == "ready" + or status.bootstrap_completed + ) + ), + "full_ready": is_provider_fully_ready(status), + "phase": progress.phase, + "stage": progress.stage, "processed_titles": progress.processed_titles, "crawled_titles": progress.crawled_titles, "persisted_titles": progress.persisted_titles, @@ -314,17 +305,14 @@ def get_progress_snapshot(self) -> dict[str, object]: "queue_depth": progress.queue_depth, "writer_lag_titles": progress.writer_lag_titles, "current_slug": progress.current_slug or None, - "serving_previous_generation": bool( - status is not None - and status.status == "running" - and latest_success_generation - and current_generation - and current_generation != latest_success_generation + "latest_success_generation": ( + status.latest_success_generation if status is not None else None ), - "latest_success_generation": latest_success_generation, "staging_generation": ( - current_generation - if current_generation != latest_success_generation + status.current_generation + if status is not None + and status.current_generation + != status.latest_success_generation else None ), "last_error_summary": ( @@ -340,11 +328,31 @@ def get_progress_snapshot(self) -> dict[str, object]: if status is not None and status.latest_completed_at is not None else None ), + 
"title_index_ready_at": ( + status.title_index_ready_at.isoformat() + if status is not None + and status.title_index_ready_at is not None + else None + ), + "detail_ready_at": ( + status.detail_ready_at.isoformat() + if status is not None and status.detail_ready_at is not None + else None + ), + "canonical_ready_at": ( + status.canonical_ready_at.isoformat() + if status is not None and status.canonical_ready_at is not None + else None + ), } ) return { "bootstrap_ready": bootstrap_ready, "bootstrapping": not bootstrap_ready, + "full_ready": all( + is_provider_fully_ready(statuses.get(provider)) + for provider in CATALOG_SITES_LIST + ), "providers": providers, } @@ -366,91 +374,165 @@ def _run_provider_refresh(self, provider: str) -> None: self._workers.pop(provider, None) def _ensure_status_rows(self) -> None: - with Session(engine) as session: - for provider in CATALOG_SITES_LIST: - self._set_progress(provider, phase="pending") - hours = float( - CATALOG_SITE_CONFIGS.get(provider, {}).get( - "provider_index_refresh_hours", 24.0 - ) - ) + for provider in CATALOG_SITES_LIST: + self._set_progress(provider, phase="pending", stage="title_index") + hours = self._refresh_interval_hours(provider) + with Session(engine) as session: status = get_provider_index_status(session, provider=provider) - if status is None: - logger.warning( - "Provider catalog bootstrap: no persisted index state for {}. Initial bootstrap required.", - provider, - ) - upsert_provider_index_status( - session, - provider=provider, - refresh_interval_hours=hours, - status="pending", - bootstrap_completed=False, - next_refresh_after=None, - ) - continue - stale_generation = self._stale_generation(status) - if stale_generation is not None: - logger.warning( - "Provider catalog bootstrap: found interrupted staging generation for {} generation={} status={} cursor_slug={}. 
Cleaning it up before retry.", - provider, - stale_generation, - status.status, - status.cursor_title_slug or None, - ) - delete_provider_generation( - session, - provider=provider, - generation=stale_generation, - ) - upsert_provider_index_status( - session, - provider=provider, - refresh_interval_hours=hours, - status="pending", - current_generation=None, - latest_completed_at=utcnow(), - next_refresh_after=None, - failure_count=status.failure_count + 1, - last_error_summary="Interrupted staging generation cleaned up after restart.", + if status is None: + logger.warning( + "Provider catalog bootstrap: no persisted index state for {}. Initial bootstrap required.", + provider, + ) + self._writer.run( + lambda session, provider=provider, hours=hours: ( + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=hours, + status="pending", + title_index_status="pending", + detail_enrichment_status="pending", + canonical_enrichment_status="pending", + bootstrap_completed=False, + commit=False, + ) ) - continue - logger.debug( - "Provider catalog bootstrap: loaded persisted state for {} status={} bootstrap_completed={} latest_success_generation={} next_refresh_after={}", + ) + continue + stale_generation = self._stale_generation(status) + if stale_generation is not None: + logger.warning( + "Provider catalog bootstrap: found interrupted staging generation for {} generation={} status={} cursor_slug={}. 
Cleaning it up before retry.", provider, + stale_generation, status.status, - status.bootstrap_completed, - status.latest_success_generation, - status.next_refresh_after.isoformat() - if status.next_refresh_after is not None - else None, + getattr(status, "cursor_title_slug", None) or None, + ) + self._writer.run( + lambda session, provider=provider, generation=stale_generation, hours=hours: ( + self._cleanup_stale_generation( + session, + provider=provider, + generation=generation, + refresh_interval_hours=hours, + ) + ) ) - def _is_due(self, status) -> bool: - if status.status == "running": + def _cleanup_stale_generation( + self, + session: Session, + *, + provider: str, + generation: str, + refresh_interval_hours: float, + ) -> None: + delete_provider_generation(session, provider=provider, generation=generation) + current = get_provider_index_status(session, provider=provider) + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + status="pending", + active_stage=None, + current_generation=None, + latest_completed_at=utcnow(), + title_index_status="pending", + detail_enrichment_status="pending", + canonical_enrichment_status="pending", + failure_count=0 if current is None else current.failure_count + 1, + last_error_summary="Interrupted staging generation cleaned up after restart.", + commit=False, + ) + + def _is_due(self, status: ProviderIndexStatus) -> bool: + return self._pick_due_stage(status) is not None + + def _stage_due( + self, + *, + stage_status: str, + retry_after, + refresh_after=_UNSET, + ) -> bool: + if stage_status == "running": + return False + now = utcnow() + if retry_after is not None and as_aware_utc(retry_after) > now: return False - if status.latest_success_at is None: + if refresh_after is not _UNSET and refresh_after is not None: + if as_aware_utc(refresh_after) > now: + return False + if stage_status in {"pending", "failed"}: return True - if status.next_refresh_after is 
None: + if refresh_after is _UNSET: + return False + if refresh_after is None: return True - return as_aware_utc(status.next_refresh_after) <= utcnow() + return as_aware_utc(refresh_after) <= now + + def _pick_due_stage(self, status: ProviderIndexStatus) -> str | None: + if getattr(status, "status", None) == "running": + return None + latest_success_generation = getattr(status, "latest_success_generation", None) + title_index_status = getattr(status, "title_index_status", None) + if title_index_status is None: + title_index_status = "ready" if latest_success_generation else "pending" + if not latest_success_generation or title_index_status != "ready": + if self._stage_due( + stage_status=title_index_status, + retry_after=getattr(status, "title_index_next_retry_after", None), + refresh_after=getattr(status, "next_refresh_after", None), + ): + return "title_index" + return None + if self._detail_stage_has_due_work(status.provider): + if self._stage_due( + stage_status=getattr(status, "detail_enrichment_status", "pending"), + retry_after=getattr(status, "detail_next_retry_after", None), + ): + return "detail_enrichment" + if self._canonical_stage_has_due_work(status.provider): + if self._stage_due( + stage_status=getattr(status, "canonical_enrichment_status", "pending"), + retry_after=getattr(status, "canonical_next_retry_after", None), + ): + return "canonical_enrichment" + if self._stage_due( + stage_status=title_index_status, + retry_after=getattr(status, "title_index_next_retry_after", None), + refresh_after=getattr(status, "next_refresh_after", None), + ): + return "title_index" + return None def _refresh_provider(self, provider: str) -> None: - refresh_interval_hours = float( - CATALOG_SITE_CONFIGS.get(provider, {}).get( - "provider_index_refresh_hours", 24.0 - ) - ) + with Session(engine) as session: + status = get_provider_index_status(session, provider=provider) + if status is None: + return + stage = self._pick_due_stage(status) + if stage is None: + return 
+ if stage == "title_index": + self._run_title_index_stage(provider) + return + if stage == "detail_enrichment": + self._run_detail_enrichment_stage(provider) + return + if stage == "canonical_enrichment": + self._run_canonical_enrichment_stage(provider) + + def _run_title_index_stage(self, provider: str) -> None: + refresh_interval_hours = self._refresh_interval_hours(provider) generation = uuid4().hex - reporter: ProgressReporter | None = None queue: Queue[TitleRecord | object] = Queue(maxsize=PROVIDER_INDEX_QUEUE_SIZE) writer_failure: list[BaseException] = [] - state_lock = Lock() - failed_titles = 0 - completed_titles = 0 self._set_progress( provider, - phase="discovering_titles", + phase="title_index", + stage="title_index", crawled_titles=0, persisted_titles=0, failed_titles=0, @@ -459,46 +541,31 @@ def _refresh_provider(self, provider: str) -> None: queue_depth=0, reset_log_steps=True, ) - logger.info("Provider catalog {}: discovering titles", provider) - - with Session(engine) as session: - current = get_provider_index_status(session, provider=provider) - if current is not None: - stale_generation = self._stale_generation(current) - if stale_generation is not None: - delete_provider_generation( - session, - provider=provider, - generation=stale_generation, - ) - upsert_provider_index_status( + self._writer.run( + lambda session: upsert_provider_index_status( session, provider=provider, refresh_interval_hours=refresh_interval_hours, status="running", + active_stage="title_index", current_generation=generation, latest_started_at=utcnow(), latest_completed_at=None, - next_refresh_after=None, + title_index_status="running", + title_index_next_retry_after=None, last_error_summary="", cursor_title_slug="", + commit=False, ) + ) reporter = ProgressReporter( label=f"Catalog {provider}", unit="title", unit_scale=False, ) - reporter.update( - ProgressSnapshot( - downloaded=0, - total=None, - status="discovering_titles", - ) - ) - writer = Thread( - 
target=self._writer_loop, + target=self._title_index_writer_loop, name=f"provider-index-writer-{provider}", args=( provider, @@ -512,230 +579,153 @@ def _refresh_provider(self, provider: str) -> None: ) writer.start() - def emit_title(record: TitleRecord) -> None: - last_backpressure_log = 0.0 - while True: - if writer_failure: - raise RuntimeError( - f"writer failed for {provider}: {writer_failure[0]}" - ) - try: - queue.put(record, timeout=1.0) - except Full: - depth = queue.qsize() - self._set_progress(provider, queue_depth=depth) - now = monotonic() - if ( - now - last_backpressure_log - >= PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS - ): - logger.warning( - "Provider catalog {}: writer backpressure queue_depth={} lag_titles={}", - provider, - depth, - self._get_writer_lag(provider), - ) - last_backpressure_log = now - continue - self._set_progress(provider, queue_depth=queue.qsize()) - return - def on_index_loaded(total_titles: int) -> None: self._set_progress( provider, - phase="crawling_titles", + phase="title_index", + stage="title_index", total_titles=total_titles, - current_slug="", - queue_depth=queue.qsize(), reset_log_steps=True, ) - logger.info( - "Provider catalog {}: loaded title index with {} titles", - provider, - total_titles, - ) reporter.update( - ProgressSnapshot( - downloaded=0, - total=total_titles, - status="crawling_titles", - ) - ) - - def on_title_started(slug: str) -> None: - self._set_progress(provider, phase="crawling_titles", current_slug=slug) - - def on_title_crawled(slug: str) -> None: - nonlocal completed_titles - with state_lock: - completed_titles += 1 - self._advance_crawl_progress( - provider, - current_slug=slug, - queue_depth=queue.qsize(), + ProgressSnapshot(downloaded=0, total=total_titles, status="title_index") ) - def on_title_failed(slug: str, reason: str) -> None: - nonlocal completed_titles, failed_titles - with state_lock: - completed_titles += 1 - failed_titles += 1 - failure_count = failed_titles - 
self._record_title_failure(provider, slug, reason) - self._advance_failed_progress( - provider, - current_slug=slug, - queue_depth=queue.qsize(), - ) - total_titles = self._get_total_titles(provider) - if total_titles and total_titles > 0: - failure_rate = failure_count / total_titles * 100.0 - if failure_rate > PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT: - raise RuntimeError( - "provider refresh failure threshold exceeded: " - f"{failure_count}/{total_titles} titles failed " - f"({failure_rate:.1f}% > {PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT:.1f}%)" - ) - - observer = CatalogCrawlObserver( - on_index_loaded=on_index_loaded, - on_title_started=on_title_started, - on_title_crawled=on_title_crawled, - on_title_failed=on_title_failed, - ) + observer = CatalogCrawlObserver(on_index_loaded=on_index_loaded) try: - summary = stream_provider_catalog( - provider, - emit_title=emit_title, - observer=observer, - ) - if writer_failure: - raise RuntimeError(f"writer failed for {provider}: {writer_failure[0]}") - if summary.discovered_titles == 0: - logger.warning("Provider catalog {}: discovered zero titles", provider) - except Exception as exc: - logger.exception( - "Provider catalog refresh failed for {}: {}", provider, exc - ) + rows = load_provider_title_index(provider, observer=observer) + if self._get_total_titles(provider) is None: + on_index_loaded(len(rows)) + for row in rows: + self._enqueue_title_record(provider, queue, row, writer_failure) + self._advance_crawl_progress( + provider, + current_slug=row.slug, + queue_depth=queue.qsize(), + ) queue.put(_QUEUE_SENTINEL) writer.join(timeout=30) - if reporter is not None: - reporter.close() + if writer_failure: + raise RuntimeError(str(writer_failure[0])) completed_at = utcnow() - with Session(engine) as session: - delete_provider_generation( + self._writer.run( + lambda session: self._finish_title_index_success( session, provider=provider, generation=generation, - ) - current = get_provider_index_status(session, 
provider=provider) - failure_count = 1 if current is None else current.failure_count + 1 - upsert_provider_index_status( - session, - provider=provider, refresh_interval_hours=refresh_interval_hours, - status="failed", - current_generation=None, - latest_completed_at=completed_at, - next_refresh_after=completed_at - + timedelta(hours=refresh_interval_hours), - failure_count=failure_count, - last_error_summary=str(exc)[:500], + completed_at=completed_at, ) + ) self._set_progress( provider, - phase="failed", + phase="title_index_ready", + stage="title_index", queue_depth=0, current_slug="", ) - return - - queue.put(_QUEUE_SENTINEL) - writer.join() - if writer_failure: - exc = RuntimeError(f"writer failed for {provider}: {writer_failure[0]}") + except Exception as exc: logger.exception( - "Provider catalog refresh failed for {}: {}", provider, exc + "Provider catalog title index failed for {}: {}", provider, exc ) - if reporter is not None: - reporter.close() + queue.put(_QUEUE_SENTINEL) + writer.join(timeout=5) completed_at = utcnow() - with Session(engine) as session: - delete_provider_generation( + self._writer.run( + lambda session: self._finish_title_index_failure( session, provider=provider, generation=generation, - ) - current = get_provider_index_status(session, provider=provider) - failure_count = 1 if current is None else current.failure_count + 1 - upsert_provider_index_status( - session, - provider=provider, refresh_interval_hours=refresh_interval_hours, - status="failed", - current_generation=None, - latest_completed_at=completed_at, - next_refresh_after=completed_at - + timedelta(hours=refresh_interval_hours), - failure_count=failure_count, - last_error_summary=str(exc)[:500], + completed_at=completed_at, + error=str(exc), ) + ) self._set_progress( provider, phase="failed", + stage="title_index", queue_depth=0, current_slug="", ) - return + finally: + reporter.close() - completed_at = utcnow() - with Session(engine) as session: - 
prune_provider_generation( - session, - provider=provider, - keep_generation=generation, - ) - upsert_provider_index_status( - session, - provider=provider, - refresh_interval_hours=refresh_interval_hours, - status="ready", - current_generation=generation, - latest_success_generation=generation, - latest_completed_at=completed_at, - latest_success_at=completed_at, - next_refresh_after=completed_at - + timedelta(hours=refresh_interval_hours), - bootstrap_completed=True, - failure_count=0, - last_error_summary="", - cursor_title_slug="", - ) - self._set_progress( - provider, - phase="ready", - queue_depth=0, - current_slug="", + def _finish_title_index_success( + self, + session: Session, + *, + provider: str, + generation: str, + refresh_interval_hours: float, + completed_at, + ) -> None: + prune_provider_generation( + session, provider=provider, keep_generation=generation ) - logger.info( - "Provider catalog {}: promoted staging generation {} persisted={}/{} failed={}", - provider, - generation, - self._get_persisted_titles(provider), - self._get_total_titles(provider) or 0, - self._get_failed_titles(provider), + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + status="partial", + active_stage=None, + current_generation=generation, + latest_success_generation=generation, + latest_completed_at=completed_at, + latest_success_at=completed_at, + next_refresh_after=completed_at + timedelta(hours=refresh_interval_hours), + bootstrap_completed=True, + title_index_status="ready", + title_index_ready_at=completed_at, + title_index_next_retry_after=None, + detail_enrichment_status="pending", + detail_ready_at=None, + detail_next_retry_after=None, + canonical_enrichment_status="pending", + canonical_ready_at=None, + canonical_next_retry_after=None, + failure_count=0, + last_error_summary="", + cursor_title_slug="", + commit=False, ) - if reporter is not None: - reporter.close() - def _writer_loop( + def 
_finish_title_index_failure( + self, + session: Session, + *, + provider: str, + generation: str, + refresh_interval_hours: float, + completed_at, + error: str, + ) -> None: + delete_provider_generation(session, provider=provider, generation=generation) + current = get_provider_index_status(session, provider=provider) + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + status="failed", + active_stage=None, + current_generation=None, + latest_completed_at=completed_at, + title_index_status="failed", + title_index_next_retry_after=completed_at + + timedelta(hours=refresh_interval_hours), + failure_count=1 if current is None else current.failure_count + 1, + last_error_summary=error[:500], + commit=False, + ) + + def _title_index_writer_loop( self, provider: str, generation: str, refresh_interval_hours: float, queue: Queue[TitleRecord | object], - reporter: ProgressReporter | None, + reporter: ProgressReporter, writer_failure: list[BaseException], ) -> None: batch: list[TitleRecord] = [] @@ -752,11 +742,11 @@ def _writer_loop( item = None if item is None: if batch: - self._flush_writer_batch( - provider, - generation, - refresh_interval_hours, - batch, + self._flush_title_index_batch( + provider=provider, + generation=generation, + refresh_interval_hours=refresh_interval_hours, + batch=batch, queue_depth=queue.qsize(), reporter=reporter, ) @@ -765,22 +755,22 @@ def _writer_loop( continue if item is _QUEUE_SENTINEL: if batch: - self._flush_writer_batch( - provider, - generation, - refresh_interval_hours, - batch, + self._flush_title_index_batch( + provider=provider, + generation=generation, + refresh_interval_hours=refresh_interval_hours, + batch=batch, queue_depth=queue.qsize(), reporter=reporter, ) return batch.append(item) if len(batch) >= PROVIDER_INDEX_WRITER_BATCH_SIZE: - self._flush_writer_batch( - provider, - generation, - refresh_interval_hours, - batch, + self._flush_title_index_batch( + 
provider=provider, + generation=generation, + refresh_interval_hours=refresh_interval_hours, + batch=batch, queue_depth=queue.qsize(), reporter=reporter, ) @@ -788,22 +778,22 @@ def _writer_loop( last_flush_at = monotonic() except BaseException as exc: writer_failure.append(exc) - logger.exception("Provider catalog writer failed for {}: {}", provider, exc) - def _flush_writer_batch( + def _flush_title_index_batch( self, + *, provider: str, generation: str, refresh_interval_hours: float, batch: list[TitleRecord], - *, queue_depth: int, - reporter: ProgressReporter | None, + reporter: ProgressReporter, ) -> None: if not batch: return last_slug = batch[-1].slug - with Session(engine) as session: + + def _persist(session: Session) -> None: for record in batch: now = utcnow() upsert_provider_title_index_state( @@ -811,223 +801,664 @@ def _flush_writer_batch( provider=provider, slug=record.slug, attempted_at=now, + succeeded_at=now, + failure_count=0, + last_error_summary="", + detail_status="pending", + detail_next_retry_after=None, + detail_failure_count=0, + detail_last_error_summary=None, + canonical_status="pending", + canonical_next_retry_after=None, + canonical_failure_count=0, + canonical_last_error_summary=None, commit=False, ) - self._persist_title_record( + replace_provider_catalog_title( session, - record=record, + provider=record.provider, + slug=record.slug, + title=record.title, + media_type_hint=record.media_type_hint, + relative_path=record.relative_path, indexed_generation=generation, ) - upsert_provider_title_index_state( + replace_provider_catalog_aliases( session, - provider=provider, + provider=record.provider, slug=record.slug, - succeeded_at=now, - failure_count=0, - last_error_summary="", - commit=False, + aliases=record.aliases, + indexed_generation=generation, ) upsert_provider_index_status( session, provider=provider, refresh_interval_hours=refresh_interval_hours, status="running", + active_stage="title_index", current_generation=generation, 
cursor_title_slug=last_slug, last_error_summary="", commit=False, ) - session.commit() + + self._writer.run(_persist) self._advance_persist_progress( provider, current_slug=last_slug, count=len(batch), queue_depth=queue_depth, ) - persisted = self._get_persisted_titles(provider) - total_titles = self._get_total_titles(provider) - if reporter is not None: - reporter.update( - ProgressSnapshot( - downloaded=persisted, - total=total_titles, - status="persisting_titles", - ) + reporter.update( + ProgressSnapshot( + downloaded=self._get_persisted_titles(provider), + total=self._get_total_titles(provider), + status="title_index", ) - logger.info( - "Provider catalog {}: persisted batch size={} persisted={}/{} queue_depth={} writer_lag={}", + ) + + def _run_detail_enrichment_stage(self, provider: str) -> None: + self._run_row_stage( + provider=provider, + stage="detail_enrichment", + concurrency=self._provider_concurrency(provider), + ) + + def _run_canonical_enrichment_stage(self, provider: str) -> None: + self._run_row_stage( + provider=provider, + stage="canonical_enrichment", + concurrency=CANONICAL_INDEX_CONCURRENCY, + ) + + def _run_row_stage(self, *, provider: str, stage: str, concurrency: int) -> None: + refresh_interval_hours = self._refresh_interval_hours(provider) + self._mark_stage_running( + provider=provider, + stage=stage, + refresh_interval_hours=refresh_interval_hours, + ) + total_titles = self._count_visible_titles(provider) + self._set_progress( provider, - len(batch), - persisted, - total_titles or 0, - queue_depth, - self._get_writer_lag(provider), + phase=stage, + stage=stage, + crawled_titles=0, + persisted_titles=0, + failed_titles=0, + total_titles=total_titles, + current_slug="", + queue_depth=0, + reset_log_steps=True, ) + failure_limit = max( + 1, + int( + max(1, total_titles) * PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT / 100.0 + ), + ) + failure_count = 0 + while not self._stop_event.is_set(): + due_rows = self._load_due_stage_rows( + 
provider=provider, + stage=stage, + limit=max(1, concurrency * 2), + ) + if not due_rows: + completed_at = utcnow() + self._writer.run( + lambda session: self._mark_stage_ready( + session, + provider=provider, + stage=stage, + refresh_interval_hours=refresh_interval_hours, + completed_at=completed_at, + ) + ) + self._set_progress( + provider, + phase=f"{stage}_ready", + stage=stage, + current_slug="", + ) + return + executor = ThreadPoolExecutor(max_workers=max(1, concurrency)) + pending: dict[ + Future, tuple[ProviderCatalogTitle, ProviderTitleIndexState] + ] = {} + try: + for title_row, state in due_rows: + pending[ + executor.submit( + self._run_stage_job, + provider=provider, + stage=stage, + title_row=title_row, + ) + ] = (title_row, state) + while pending: + done, not_done = wait( + pending.keys(), + timeout=1.0, + return_when=FIRST_COMPLETED, + ) + if not done: + continue + for future in done: + title_row, state = pending.pop(future) + try: + payload = future.result() + self._persist_stage_success( + provider=provider, + stage=stage, + title_row=title_row, + payload=payload, + ) + self._advance_persist_progress( + provider, + current_slug=title_row.slug, + count=1, + queue_depth=len(not_done), + ) + except Exception as exc: + failure_count += 1 + self._persist_stage_failure( + provider=provider, + stage=stage, + title_row=title_row, + state=state, + error=str(exc), + ) + self._advance_failed_progress( + provider, + current_slug=title_row.slug, + queue_depth=len(not_done), + ) + if failure_count >= failure_limit: + for remaining in not_done: + remaining.cancel() + completed_at = utcnow() + self._writer.run( + lambda session, error=str(exc): ( + self._mark_stage_failed( + session, + provider=provider, + stage=stage, + refresh_interval_hours=refresh_interval_hours, + completed_at=completed_at, + error=error, + ) + ) + ) + return + finally: + executor.shutdown(wait=False, cancel_futures=True) - def _persist_title_record( + def _run_stage_job( self, - session: 
Session, *, - record: TitleRecord, - indexed_generation: str, - ) -> None: - replace_provider_catalog_title( - session, - provider=record.provider, - slug=record.slug, - title=record.title, - media_type_hint=record.media_type_hint, - relative_path=record.relative_path, - indexed_generation=indexed_generation, + provider: str, + stage: str, + title_row: ProviderCatalogTitle, + ): + self._advance_crawl_progress( + provider, + current_slug=title_row.slug, + queue_depth=0, ) - replace_provider_catalog_aliases( - session, - provider=record.provider, - slug=record.slug, - aliases=record.aliases, - indexed_generation=indexed_generation, + aliases = self._load_aliases(provider=provider, slug=title_row.slug) + if stage == "detail_enrichment": + return crawl_provider_title_detail( + provider_key=provider, + slug=title_row.slug, + title=title_row.title, + aliases=aliases, + timeout_seconds=float( + CATALOG_SITE_CONFIGS[provider].get( + "provider_index_title_timeout_seconds", + PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, + ) + ), + ) + episodes = self._load_episode_records(provider=provider, slug=title_row.slug) + return resolve_provider_canonical( + provider_key=provider, + slug=title_row.slug, + title=title_row.title, + aliases=aliases, + media_type_hint=title_row.media_type_hint, + episodes=episodes, ) - replace_provider_catalog_episodes( - session, - provider=record.provider, - slug=record.slug, - episodes=[ - { - "season": episode.season, - "episode": episode.episode, - "relative_path": episode.relative_path, - "title_primary": episode.title_primary, - "title_secondary": episode.title_secondary, - "media_type_hint": episode.media_type_hint, - "languages": [ + + def _persist_stage_success( + self, + *, + provider: str, + stage: str, + title_row: ProviderCatalogTitle, + payload, + ) -> None: + generation = self._visible_generation(provider) + now = utcnow() + + def _persist(session: Session) -> None: + if stage == "detail_enrichment": + detail_record: TitleRecord = payload + 
replace_provider_catalog_title( + session, + provider=provider, + slug=title_row.slug, + title=detail_record.title, + media_type_hint=detail_record.media_type_hint, + relative_path=detail_record.relative_path, + indexed_generation=generation, + ) + replace_provider_catalog_episodes( + session, + provider=provider, + slug=title_row.slug, + episodes=[ { - "language": language.language, - "host_hints": language.host_hints, + "season": episode.season, + "episode": episode.episode, + "relative_path": episode.relative_path, + "title_primary": episode.title_primary, + "title_secondary": episode.title_secondary, + "media_type_hint": episode.media_type_hint, + "languages": [ + { + "language": language.language, + "host_hints": language.host_hints, + } + for language in episode.languages + ], } - for language in episode.languages + for episode in detail_record.episodes ], - } - for episode in record.episodes - ], - indexed_generation=indexed_generation, - ) - if record.canonical.series is not None: - series = record.canonical.series - upsert_canonical_series( - session, - tvdb_id=int(series["tvdb_id"]), - title=str(series["title"]), - tmdb_id=series.get("tmdb_id"), - imdb_id=series.get("imdb_id"), - tvmaze_id=series.get("tvmaze_id"), - anilist_id=series.get("anilist_id"), - mal_id=series.get("mal_id"), - aliases=list(series.get("aliases") or []), - ) - replace_canonical_episodes( - session, - tvdb_id=int(series["tvdb_id"]), - episodes=record.canonical.episodes, + indexed_generation=generation, + ) + upsert_provider_title_index_state( + session, + provider=provider, + slug=title_row.slug, + detail_status="ready", + detail_attempted_at=now, + detail_succeeded_at=now, + detail_next_retry_after=None, + detail_failure_count=0, + detail_last_error_summary=None, + commit=False, + ) + else: + canonical: CanonicalPayload = payload + if canonical.series is not None: + series = canonical.series + upsert_canonical_series( + session, + tvdb_id=int(series["tvdb_id"]), + 
title=str(series["title"]), + tmdb_id=series.get("tmdb_id"), + imdb_id=series.get("imdb_id"), + tvmaze_id=series.get("tvmaze_id"), + anilist_id=series.get("anilist_id"), + mal_id=series.get("mal_id"), + aliases=list(series.get("aliases") or []), + ) + replace_canonical_episodes( + session, + tvdb_id=int(series["tvdb_id"]), + episodes=canonical.episodes, + ) + replace_provider_series_mappings( + session, + provider=provider, + slug=title_row.slug, + mappings=canonical.series_mappings, + indexed_generation=generation, + ) + replace_provider_episode_mappings( + session, + provider=provider, + slug=title_row.slug, + mappings=canonical.episode_mappings, + indexed_generation=generation, + ) + replace_provider_movie_mappings( + session, + provider=provider, + slug=title_row.slug, + mappings=canonical.movie_mappings, + indexed_generation=generation, + ) + upsert_provider_title_index_state( + session, + provider=provider, + slug=title_row.slug, + canonical_status="ready", + canonical_attempted_at=now, + canonical_succeeded_at=now, + canonical_next_retry_after=None, + canonical_failure_count=0, + canonical_last_error_summary=None, + commit=False, + ) + + self._writer.run(_persist) + + def _persist_stage_failure( + self, + *, + provider: str, + stage: str, + title_row: ProviderCatalogTitle, + state: ProviderTitleIndexState, + error: str, + ) -> None: + refresh_interval_hours = self._refresh_interval_hours(provider) + retry_at = utcnow() + timedelta(hours=refresh_interval_hours) + + def _persist(session: Session) -> None: + if stage == "detail_enrichment": + upsert_provider_title_index_state( + session, + provider=provider, + slug=title_row.slug, + detail_status="failed", + detail_attempted_at=utcnow(), + detail_next_retry_after=retry_at, + detail_failure_count=state.detail_failure_count + 1, + detail_last_error_summary=error[:500], + commit=False, + ) + else: + upsert_provider_title_index_state( + session, + provider=provider, + slug=title_row.slug, + 
canonical_status="failed", + canonical_attempted_at=utcnow(), + canonical_next_retry_after=retry_at, + canonical_failure_count=state.canonical_failure_count + 1, + canonical_last_error_summary=error[:500], + commit=False, + ) + + self._writer.run(_persist) + + def _mark_stage_running( + self, + *, + provider: str, + stage: str, + refresh_interval_hours: float, + ) -> None: + def _persist(session: Session) -> None: + payload = { + "provider": provider, + "refresh_interval_hours": refresh_interval_hours, + "status": "running", + "active_stage": stage, + "latest_started_at": utcnow(), + "last_error_summary": "", + "commit": False, + } + if stage == "detail_enrichment": + payload["detail_enrichment_status"] = "running" + payload["detail_next_retry_after"] = None + elif stage == "canonical_enrichment": + payload["canonical_enrichment_status"] = "running" + payload["canonical_next_retry_after"] = None + upsert_provider_index_status(session, **payload) + + self._writer.run(_persist) + + def _mark_stage_ready( + self, + session: Session, + *, + provider: str, + stage: str, + refresh_interval_hours: float, + completed_at, + ) -> None: + with Session(engine) as read_session: + status = get_provider_index_status(read_session, provider=provider) + overall_status = "ready" if is_provider_fully_ready(status) else "partial" + payload = { + "provider": provider, + "refresh_interval_hours": refresh_interval_hours, + "status": overall_status, + "active_stage": None, + "latest_completed_at": completed_at, + "last_error_summary": "", + "commit": False, + } + if stage == "detail_enrichment": + payload["detail_enrichment_status"] = "ready" + payload["detail_ready_at"] = completed_at + payload["detail_next_retry_after"] = None + else: + payload["canonical_enrichment_status"] = "ready" + payload["canonical_ready_at"] = completed_at + payload["canonical_next_retry_after"] = None + if status is not None and status.detail_enrichment_status == "ready": + payload["status"] = "ready" + 
upsert_provider_index_status(session, **payload) + + def _mark_stage_failed( + self, + session: Session, + *, + provider: str, + stage: str, + refresh_interval_hours: float, + completed_at, + error: str, + ) -> None: + payload = { + "provider": provider, + "refresh_interval_hours": refresh_interval_hours, + "status": "failed", + "active_stage": None, + "latest_completed_at": completed_at, + "last_error_summary": error[:500], + "commit": False, + } + retry_at = completed_at + timedelta(hours=refresh_interval_hours) + if stage == "detail_enrichment": + payload["detail_enrichment_status"] = "failed" + payload["detail_next_retry_after"] = retry_at + else: + payload["canonical_enrichment_status"] = "failed" + payload["canonical_next_retry_after"] = retry_at + upsert_provider_index_status(session, **payload) + + def _detail_stage_has_due_work(self, provider: str) -> bool: + return bool( + self._load_due_stage_rows( + provider=provider, stage="detail_enrichment", limit=1 ) - replace_provider_series_mappings( - session, - provider=record.provider, - slug=record.slug, - mappings=record.canonical.series_mappings, - indexed_generation=indexed_generation, - ) - replace_provider_episode_mappings( - session, - provider=record.provider, - slug=record.slug, - mappings=record.canonical.episode_mappings, - indexed_generation=indexed_generation, ) - replace_provider_movie_mappings( - session, - provider=record.provider, - slug=record.slug, - mappings=record.canonical.movie_mappings, - indexed_generation=indexed_generation, + + def _canonical_stage_has_due_work(self, provider: str) -> bool: + return bool( + self._load_due_stage_rows( + provider=provider, stage="canonical_enrichment", limit=1 + ) ) - def _record_title_failure(self, provider: str, slug: str, reason: str) -> None: + def _load_due_stage_rows( + self, + *, + provider: str, + stage: str, + limit: int, + ) -> list[tuple[ProviderCatalogTitle, ProviderTitleIndexState]]: + generation = self._visible_generation(provider) + if 
generation is None: + return [] + now = utcnow() with Session(engine) as session: - current = get_provider_index_status(session, provider=provider) - refresh_interval_hours = float( - CATALOG_SITE_CONFIGS.get(provider, {}).get( - "provider_index_refresh_hours", 24.0 - ) - ) - state = upsert_provider_title_index_state( - session, - provider=provider, - slug=slug, - attempted_at=utcnow(), - commit=False, - ) - upsert_provider_title_index_state( - session, - provider=provider, - slug=slug, - failure_count=state.failure_count + 1, - last_error_summary=reason[:500], - commit=False, - ) - upsert_provider_index_status( - session, - provider=provider, - refresh_interval_hours=refresh_interval_hours, - status="running", - current_generation=current.current_generation - if current is not None - else None, - cursor_title_slug=slug, - last_error_summary=reason[:500], - commit=False, + rows = list( + session.exec( + select(ProviderCatalogTitle).where( + (ProviderCatalogTitle.provider == provider) + & (ProviderCatalogTitle.indexed_generation == generation) + ) + ).all() ) - session.commit() + due: list[tuple[ProviderCatalogTitle, ProviderTitleIndexState]] = [] + for row in rows: + state = session.get(ProviderTitleIndexState, (provider, row.slug)) + if state is None: + state = ProviderTitleIndexState(provider=provider, slug=row.slug) + if stage == "detail_enrichment": + retry_after = state.detail_next_retry_after + if retry_after is not None and as_aware_utc(retry_after) > now: + continue + if state.detail_status == "ready": + continue + due.append((row, state)) + else: + retry_after = state.canonical_next_retry_after + if retry_after is not None and as_aware_utc(retry_after) > now: + continue + if state.canonical_status == "ready": + continue + if ( + state.detail_status != "ready" + and row.media_type_hint != "movie" + ): + continue + due.append((row, state)) + if len(due) >= max(1, limit): + break + return due - def _is_bootstrap_ready(self) -> bool: + def _load_aliases(self, 
*, provider: str, slug: str) -> list[str]: + generation = self._visible_generation(provider) + if generation is None: + return [] with Session(engine) as session: - return is_catalog_bootstrap_ready(session, providers=CATALOG_SITES_LIST) + rows = session.exec( + select(ProviderCatalogAlias).where( + (ProviderCatalogAlias.provider == provider) + & (ProviderCatalogAlias.slug == slug) + & (ProviderCatalogAlias.indexed_generation == generation) + ) + ).all() + return [row.alias for row in rows] - def _log_bootstrap_state(self) -> None: + def _load_episode_records(self, *, provider: str, slug: str) -> list[EpisodeRecord]: + generation = self._visible_generation(provider) + if generation is None: + return [] with Session(engine) as session: - statuses = list_provider_index_statuses(session) - bootstrap_ready = is_catalog_bootstrap_ready( - session, providers=CATALOG_SITES_LIST - ) - if not statuses: - logger.warning( - "Provider catalog bootstrap: no provider status rows exist yet" + episode_rows = session.exec( + select(ProviderCatalogEpisode).where( + (ProviderCatalogEpisode.provider == provider) + & (ProviderCatalogEpisode.slug == slug) + & (ProviderCatalogEpisode.indexed_generation == generation) + ) + ).all() + language_rows = session.exec( + select(ProviderEpisodeLanguage).where( + (ProviderEpisodeLanguage.provider == provider) + & (ProviderEpisodeLanguage.slug == slug) + & (ProviderEpisodeLanguage.indexed_generation == generation) + ) + ).all() + languages_by_episode: dict[tuple[int, int], list[EpisodeLanguageRecord]] = {} + for row in language_rows: + key = (int(row.season), int(row.episode)) + languages_by_episode.setdefault(key, []).append( + EpisodeLanguageRecord( + language=row.language, + host_hints=list(row.host_hints or []), + ) ) - return - if bootstrap_ready: - logger.info("Provider catalog bootstrap: already complete") - else: - logger.warning( - "Provider catalog bootstrap: incomplete, requests may be gated" + return [ + EpisodeRecord( + 
season=int(row.season), + episode=int(row.episode), + relative_path=row.relative_path, + title_primary=row.title_primary, + title_secondary=row.title_secondary, + media_type_hint=row.media_type_hint, + languages=languages_by_episode.get( + (int(row.season), int(row.episode)), [] + ), ) - for status in sorted(statuses, key=lambda item: item.provider): - logger.info( - "Provider catalog bootstrap state: provider={} status={} bootstrap_completed={} latest_success_generation={} latest_started_at={} latest_completed_at={} next_refresh_after={} cursor_slug={} last_error={}", - status.provider, - status.status, - status.bootstrap_completed, - status.latest_success_generation, - status.latest_started_at.isoformat() - if status.latest_started_at is not None - else None, - status.latest_completed_at.isoformat() - if status.latest_completed_at is not None - else None, - status.next_refresh_after.isoformat() - if status.next_refresh_after is not None - else None, - status.cursor_title_slug or None, - status.last_error_summary or None, + for row in episode_rows + ] + + def _visible_generation(self, provider: str) -> str | None: + with Session(engine) as session: + status = get_provider_index_status(session, provider=provider) + if status is None: + return None + return status.latest_success_generation + + def _count_visible_titles(self, provider: str) -> int: + generation = self._visible_generation(provider) + if generation is None: + return 0 + with Session(engine) as session: + rows = session.exec( + select(ProviderCatalogTitle).where( + (ProviderCatalogTitle.provider == provider) + & (ProviderCatalogTitle.indexed_generation == generation) + ) + ).all() + return len(rows) + + def _refresh_interval_hours(self, provider: str) -> float: + return float( + CATALOG_SITE_CONFIGS.get(provider, {}).get( + "provider_index_refresh_hours", 24.0 ) + ) + + def _provider_concurrency(self, provider: str) -> int: + return max( + 1, + int( + CATALOG_SITE_CONFIGS.get(provider, {}).get( + 
"provider_index_concurrency", 1 + ) + ), + ) + + def _enqueue_title_record( + self, + provider: str, + queue: Queue[TitleRecord | object], + record: TitleRecord, + writer_failure: list[BaseException], + ) -> None: + last_backpressure_log = 0.0 + while True: + if writer_failure: + raise RuntimeError(str(writer_failure[0])) + try: + queue.put(record, timeout=1.0) + self._set_progress(provider, queue_depth=queue.qsize()) + return + except Exception: + depth = queue.qsize() + self._set_progress(provider, queue_depth=depth) + now = monotonic() + if ( + now - last_backpressure_log + >= PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS + ): + logger.warning( + "Provider catalog {}: writer backpressure queue_depth={} lag_titles={}", + provider, + depth, + self._get_writer_lag(provider), + ) + last_backpressure_log = now def _stale_generation(self, status) -> str | None: current_generation = getattr(status, "current_generation", None) @@ -1045,6 +1476,7 @@ def _set_progress( provider: str, *, phase: str | None = None, + stage: str | None = None, crawled_titles: int | None = None, persisted_titles: int | None = None, failed_titles: int | None = None, @@ -1060,6 +1492,8 @@ def _set_progress( self._progress[provider] = snapshot if phase is not None: snapshot.phase = phase + if stage is not None: + snapshot.stage = stage if crawled_titles is not None: snapshot.crawled_titles = crawled_titles if persisted_titles is not None: @@ -1120,80 +1554,73 @@ def _advance_persist_progress( snapshot = self._progress.setdefault( provider, ProviderCatalogProgress(provider=provider) ) - snapshot.phase = "persisting_titles" snapshot.persisted_titles += count snapshot.current_slug = current_slug snapshot.queue_depth = queue_depth self._maybe_log_progress(snapshot, kind="persist") def _maybe_log_progress( - self, - snapshot: ProviderCatalogProgress, - *, - kind: str, + self, snapshot: ProviderCatalogProgress, *, kind: str ) -> None: - if snapshot.total_titles is None: + percent = ( + snapshot.crawl_percent 
if kind == "crawl" else snapshot.progress_percent + ) + if percent is None: return - step = max(1, int(PROGRESS_STEP_PERCENT)) + step = int(percent // PROGRESS_STEP_PERCENT) if kind == "crawl": - percent = snapshot.crawl_percent - current_step = int(percent or 0.0) // step - if ( - percent is not None - and percent < 100.0 - and current_step <= snapshot.last_logged_crawl_step - ): + if step <= snapshot.last_logged_crawl_step: return - snapshot.last_logged_crawl_step = current_step - logger.info( - "Provider catalog {} progress [crawl]: crawled={} failed={} persisted={} total={} crawl_percent={} queue_depth={} lag={} current={}", - snapshot.provider, - snapshot.crawled_titles, - snapshot.failed_titles, - snapshot.persisted_titles, - snapshot.total_titles, - percent, - snapshot.queue_depth, - snapshot.writer_lag_titles, - snapshot.current_slug, - ) - return - percent = snapshot.progress_percent - current_step = int(percent or 0.0) // step - if ( - percent is not None - and percent < 100.0 - and current_step <= snapshot.last_logged_persist_step - ): - return - snapshot.last_logged_persist_step = current_step - logger.info( - "Provider catalog {} progress [persist]: persisted={} total={} percent={} queue_depth={} lag={} current={}", - snapshot.provider, - snapshot.persisted_titles, - snapshot.total_titles, - percent, - snapshot.queue_depth, - snapshot.writer_lag_titles, - snapshot.current_slug, - ) + snapshot.last_logged_crawl_step = step + else: + if step <= snapshot.last_logged_persist_step: + return + snapshot.last_logged_persist_step = step def _get_total_titles(self, provider: str) -> int | None: with self._progress_lock: snapshot = self._progress.get(provider) - return None if snapshot is None else snapshot.total_titles + return snapshot.total_titles if snapshot is not None else None def _get_persisted_titles(self, provider: str) -> int: with self._progress_lock: snapshot = self._progress.get(provider) - return 0 if snapshot is None else snapshot.persisted_titles 
+ if snapshot is None: + return 0 + return snapshot.persisted_titles def _get_failed_titles(self, provider: str) -> int: with self._progress_lock: snapshot = self._progress.get(provider) - return 0 if snapshot is None else snapshot.failed_titles + if snapshot is None: + return 0 + return snapshot.failed_titles def _get_writer_lag(self, provider: str) -> int: with self._progress_lock: snapshot = self._progress.get(provider) - return 0 if snapshot is None else snapshot.writer_lag_titles + if snapshot is None: + return 0 + return snapshot.writer_lag_titles + + def _is_bootstrap_ready(self) -> bool: + with Session(engine) as session: + return is_catalog_bootstrap_ready(session, providers=CATALOG_SITES_LIST) + + def _log_bootstrap_state(self) -> None: + with Session(engine) as session: + statuses = list_provider_index_statuses(session) + for status in statuses: + logger.info( + "Provider catalog bootstrap state: provider={} status={} active_stage={} title_index_status={} detail_status={} canonical_status={} latest_success_generation={} next_refresh_after={}", + status.provider, + status.status, + status.active_stage, + status.title_index_status, + status.detail_enrichment_status, + status.canonical_enrichment_status, + status.latest_success_generation, + status.next_refresh_after.isoformat() + if status.next_refresh_after is not None + else None, + ) diff --git a/apps/api/app/catalog/metadata.py b/apps/api/app/catalog/metadata.py index 2063570d..6c0447ca 100644 --- a/apps/api/app/catalog/metadata.py +++ b/apps/api/app/catalog/metadata.py @@ -1,25 +1,62 @@ from __future__ import annotations +from collections import OrderedDict from dataclasses import dataclass from difflib import SequenceMatcher -from typing import Any, Optional +from threading import Lock +from time import time +from typing import Any, Generic, Optional, TypeVar from urllib.parse import urlencode -import threading -import time from loguru import logger +from app.config import ( + 
CANONICAL_CACHE_MEMORY_MAX_SEARCH, + CANONICAL_CACHE_MEMORY_MAX_SHOW, + CANONICAL_CACHE_TTL_SECONDS, +) from app.db import normalize_catalog_text from app.utils.http_client import get as http_get SKYHOOK_SEARCH_URL = "https://skyhook.sonarr.tv/v1/tvdb/search/en/" SKYHOOK_SHOW_URL = "https://skyhook.sonarr.tv/v1/tvdb/shows/en/{tvdb_id}" SKYHOOK_TIMEOUT_SECONDS = 4.0 -SKYHOOK_CACHE_TTL_SECONDS = 3600.0 -_cache_lock = threading.Lock() -_search_cache: dict[str, tuple[float, list[dict[str, Any]]]] = {} -_show_cache: dict[int, tuple[float, dict[str, Any]]] = {} +TKey = TypeVar("TKey") +TValue = TypeVar("TValue") + + +class TtlLruCache(Generic[TKey, TValue]): + def __init__(self, *, max_entries: int, ttl_seconds: int) -> None: + self._max_entries = max(1, int(max_entries)) + self._ttl_seconds = max(1, int(ttl_seconds)) + self._entries: OrderedDict[TKey, tuple[float, TValue]] = OrderedDict() + self._lock = Lock() + + def get(self, key: TKey) -> TValue | None: + now = time() + with self._lock: + entry = self._entries.get(key) + if entry is None: + return None + expires_at, payload = entry + if expires_at <= now: + self._entries.pop(key, None) + return None + self._entries.move_to_end(key) + return payload + + def set(self, key: TKey, value: TValue) -> None: + expires_at = time() + self._ttl_seconds + with self._lock: + self._entries[key] = (expires_at, value) + self._entries.move_to_end(key) + while len(self._entries) > self._max_entries: + self._entries.popitem(last=False) + + def size(self) -> int: + with self._lock: + return len(self._entries) @dataclass(slots=True) @@ -32,40 +69,21 @@ class TvCanonicalMatch: payload: dict[str, Any] -def _cache_get_search(term: str) -> list[dict[str, Any]] | None: - now = time.time() - with _cache_lock: - entry = _search_cache.get(term) - if entry is None: - return None - cached_at, payload = entry - if now - cached_at > SKYHOOK_CACHE_TTL_SECONDS: - _search_cache.pop(term, None) - return None - return [dict(item) for item in payload] - - 
-def _cache_set_search(term: str, payload: list[dict[str, Any]]) -> None: - with _cache_lock: - _search_cache[term] = (time.time(), [dict(item) for item in payload]) - - -def _cache_get_show(tvdb_id: int) -> dict[str, Any] | None: - now = time.time() - with _cache_lock: - entry = _show_cache.get(tvdb_id) - if entry is None: - return None - cached_at, payload = entry - if now - cached_at > SKYHOOK_CACHE_TTL_SECONDS: - _show_cache.pop(tvdb_id, None) - return None - return dict(payload) +_search_cache: TtlLruCache[str, list[dict[str, Any]]] = TtlLruCache( + max_entries=CANONICAL_CACHE_MEMORY_MAX_SEARCH, + ttl_seconds=CANONICAL_CACHE_TTL_SECONDS, +) +_show_cache: TtlLruCache[int, dict[str, Any]] = TtlLruCache( + max_entries=CANONICAL_CACHE_MEMORY_MAX_SHOW, + ttl_seconds=CANONICAL_CACHE_TTL_SECONDS, +) -def _cache_set_show(tvdb_id: int, payload: dict[str, Any]) -> None: - with _cache_lock: - _show_cache[tvdb_id] = (time.time(), dict(payload)) +def canonical_cache_stats() -> dict[str, int]: + return { + "search_entries": _search_cache.size(), + "show_entries": _show_cache.size(), + } def _score_title(query: str, candidate: str) -> float: @@ -120,7 +138,7 @@ def resolve_tv_canonical_match( imdb_id=imdb_id, tmdb_id=tmdb_id, ): - payload = _cache_get_search(term) + payload = _search_cache.get(term) if payload is None: try: query = urlencode({"term": term}) @@ -136,15 +154,12 @@ def resolve_tv_canonical_match( if not isinstance(raw_payload, list): continue payload = [item for item in raw_payload if isinstance(item, dict)] - _cache_set_search(term, payload) - try: - for item in payload: - copied = dict(item) - copied["_ab_source"] = source - copied["_ab_term"] = term - candidates.append(copied) - except Exception: - continue + _search_cache.set(term, [dict(item) for item in payload]) + for item in payload: + copied = dict(item) + copied["_ab_source"] = source + copied["_ab_term"] = term + candidates.append(copied) best_match: Optional[tuple[float, dict[str, Any]]] = None for 
item in candidates: @@ -166,7 +181,7 @@ def resolve_tv_canonical_match( score, item = best_match tvdb_id = int(item["tvdbId"]) - payload = _cache_get_show(tvdb_id) + payload = _show_cache.get(tvdb_id) if payload is None: try: response = http_get( @@ -180,8 +195,8 @@ def resolve_tv_canonical_match( return None if not isinstance(raw_payload, dict): return None - payload = raw_payload - _cache_set_show(tvdb_id, payload) + payload = dict(raw_payload) + _show_cache.set(tvdb_id, dict(payload)) if score >= 0.99: confidence = "confirmed" diff --git a/apps/api/app/catalog/providers.py b/apps/api/app/catalog/providers.py index 1fc73006..354e47e2 100644 --- a/apps/api/app/catalog/providers.py +++ b/apps/api/app/catalog/providers.py @@ -1,9 +1,8 @@ from __future__ import annotations -from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait +from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError from dataclasses import dataclass, field from difflib import SequenceMatcher -from time import monotonic import re from typing import Any, Callable, Optional from urllib.parse import urlparse @@ -12,8 +11,7 @@ from loguru import logger from app.catalog.metadata import resolve_tv_canonical_match -from app.config import CATALOG_SITE_CONFIGS, PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS -from app.db import normalize_catalog_text +from app.config import CATALOG_SITE_CONFIGS from app.providers import get_provider from app.providers.megakino.client import ( get_default_client as get_default_megakino_client, @@ -21,8 +19,6 @@ from app.utils.domain_resolver import get_megakino_base_url from app.utils.http_client import get as http_get -_CATALOG_STREAM_HEARTBEAT_SECONDS = 15.0 - @dataclass(slots=True) class EpisodeLanguageRecord: @@ -77,15 +73,6 @@ class CatalogStreamSummary: failed_titles: int = 0 -@dataclass(slots=True) -class _TitleJob: - slug: str - title: str - aliases: list[str] - started_at: float - timed_out: bool = False - - def 
_relative_path(url: str) -> str: parsed = urlparse(url) path = parsed.path or "/" @@ -94,6 +81,20 @@ def _relative_path(url: str) -> str: return path +def _run_with_timeout( + timeout_seconds: float, func: Callable[..., Any], *args, **kwargs +): + executor = ThreadPoolExecutor(max_workers=1) + future = executor.submit(func, *args, **kwargs) + try: + return future.result(timeout=max(0.001, timeout_seconds)) + except FutureTimeoutError as exc: + future.cancel() + raise TimeoutError(f"title crawl exceeded {int(timeout_seconds)}s") from exc + finally: + executor.shutdown(wait=False, cancel_futures=True) + + def _normalize_provider_data(raw: Any, *, site: str) -> list[EpisodeLanguageRecord]: if not isinstance(raw, dict): return [] @@ -283,6 +284,8 @@ def _parse_sto_season_rows(season) -> list[EpisodeRecord]: def _score_episode_title(left: str, right: str) -> float: + from app.db import normalize_catalog_text + a = normalize_catalog_text(left) b = normalize_catalog_text(right) if not a or not b: @@ -432,31 +435,100 @@ def _build_tv_canonical_payload( ) -def _crawl_aniworld_like_title( +def _fallback_title_record( *, provider_key: str, slug: str, title: str, aliases: list[str], ) -> TitleRecord: - site_cfg = CATALOG_SITE_CONFIGS[provider_key] - base_url = str(site_cfg["base_url"]).rstrip("/") - relative_root = ( - f"/anime/stream/{slug}" if provider_key == "aniworld.to" else f"/serie/{slug}" + media_type_hint = "movie" if provider_key == "megakino" else "series" + if provider_key == "aniworld.to": + relative_path = f"/anime/stream/{slug}" + elif provider_key == "s.to": + relative_path = f"/serie/{slug}" + else: + relative_path = f"/{slug}" + return TitleRecord( + provider=provider_key, + slug=slug, + title=title, + aliases=aliases, + media_type_hint=media_type_hint, + relative_path=relative_path, + episodes=[], + canonical=CanonicalPayload(), ) - url = f"{base_url}{relative_root}" + + +def load_provider_title_index( + provider_key: str, + *, + observer: 
CatalogCrawlObserver | None = None, +) -> list[TitleRecord]: + provider = get_provider(provider_key) + if provider is None: + return [] + + if provider_key == "megakino": + client = get_default_megakino_client() + entries = client.load_index() + if observer is not None and observer.on_index_loaded is not None: + observer.on_index_loaded(len(entries)) + rows = [ + TitleRecord( + provider=provider_key, + slug=entry.slug, + title=entry.slug.replace("-", " ").title(), + aliases=[], + media_type_hint="movie" if entry.kind == "film" else "series", + relative_path=_relative_path(entry.url), + episodes=[], + canonical=CanonicalPayload(), + ) + for entry in entries.values() + ] + rows.sort(key=lambda item: item.slug) + return rows + + logger.info("Provider catalog {}: loading title index", provider_key) + index = provider.load_or_refresh_index() + alternatives = provider.load_or_refresh_alternatives() + if observer is not None and observer.on_index_loaded is not None: + observer.on_index_loaded(len(index)) + rows = [] + for slug, title in index.items(): + aliases = list(dict.fromkeys(alternatives.get(slug, []) or [title])) + rows.append( + _fallback_title_record( + provider_key=provider_key, + slug=slug, + title=title, + aliases=aliases, + ) + ) + rows.sort(key=lambda item: item.slug) + return rows + + +def _crawl_aniworld_like_detail( + *, + provider_key: str, + slug: str, + title: str, + aliases: list[str], +) -> TitleRecord: + base_url = str(CATALOG_SITE_CONFIGS[provider_key]["base_url"]).rstrip("/") if provider_key == "aniworld.to": from aniworld.models import AniworldSeries - series = AniworldSeries(url) - imdb_id = series.imdb - mal_id = None + relative_root = f"/anime/stream/{slug}" + series = AniworldSeries(f"{base_url}{relative_root}") else: from aniworld.models import SerienstreamSeries - series = SerienstreamSeries(url) - imdb_id = series.imdb - mal_id = None + relative_root = f"/serie/{slug}" + series = SerienstreamSeries(f"{base_url}{relative_root}") episodes: 
list[EpisodeRecord] = [] for season in series.seasons: @@ -465,15 +537,6 @@ def _crawl_aniworld_like_title( else: episodes.extend(_parse_sto_season_rows(season)) - canonical = _build_tv_canonical_payload( - provider=provider_key, - slug=slug, - title=series.title or title, - aliases=aliases, - imdb_id=imdb_id, - mal_id=mal_id, - episodes=episodes, - ) return TitleRecord( provider=provider_key, slug=slug, @@ -482,21 +545,28 @@ def _crawl_aniworld_like_title( media_type_hint="series", relative_path=relative_root, episodes=episodes, - canonical=canonical, + canonical=CanonicalPayload(), ) -def _crawl_title_job( +def crawl_provider_title_detail( *, provider_key: str, slug: str, title: str, aliases: list[str], - observer: CatalogCrawlObserver | None, + timeout_seconds: float, ) -> TitleRecord: - if observer is not None and observer.on_title_started is not None: - observer.on_title_started(slug) - return _crawl_aniworld_like_title( + if provider_key == "megakino": + return _fallback_title_record( + provider_key=provider_key, + slug=slug, + title=title, + aliases=aliases, + ) + return _run_with_timeout( + timeout_seconds, + _crawl_aniworld_like_detail, provider_key=provider_key, slug=slug, title=title, @@ -504,6 +574,30 @@ def _crawl_title_job( ) +def resolve_provider_canonical( + *, + provider_key: str, + slug: str, + title: str, + aliases: list[str], + media_type_hint: str, + episodes: list[EpisodeRecord], + imdb_id: Optional[str] = None, + mal_id: Optional[int] = None, +) -> CanonicalPayload: + if provider_key == "megakino" or media_type_hint == "movie": + return CanonicalPayload() + return _build_tv_canonical_payload( + provider=provider_key, + slug=slug, + title=title, + aliases=aliases, + imdb_id=imdb_id, + mal_id=mal_id, + episodes=episodes, + ) + + def _parse_megakino_page_metadata(url: str) -> tuple[str | None, int | None]: base_url = get_megakino_base_url().rstrip("/") response = http_get(url, timeout=20, headers={"Referer": base_url}) @@ -522,229 +616,13 @@ def 
_parse_megakino_page_metadata(url: str) -> tuple[str | None, int | None]: return title, year -def _emit_title_failure( - *, - provider_key: str, - slug: str, - title: str, - reason: str, - observer: CatalogCrawlObserver | None, -) -> None: - logger.warning( - "Provider catalog {}: title crawl failed slug={} title={}: {}", - provider_key, - slug, - title, - reason, - ) - if observer is not None and observer.on_title_failed is not None: - observer.on_title_failed(slug, reason) - - -def _stream_aniworld_like_catalog( - *, - provider_key: str, - index: dict[str, str], - alternatives: dict[str, list[str]], - emit_title: Callable[[TitleRecord], None], - observer: CatalogCrawlObserver | None, - title_timeout_seconds: float, -) -> CatalogStreamSummary: - workers = int( - CATALOG_SITE_CONFIGS[provider_key].get("provider_index_concurrency", 1) - ) - max_workers = max(1, workers) - summary = CatalogStreamSummary(discovered_titles=len(index)) - pending: dict[Future[TitleRecord], _TitleJob] = {} - pending_iter = iter(index.items()) - - def _submit_next(executor: ThreadPoolExecutor) -> bool: - try: - slug, title = next(pending_iter) - except StopIteration: - return False - aliases = list(dict.fromkeys(alternatives.get(slug, []) or [title])) - future = executor.submit( - _crawl_title_job, - provider_key=provider_key, - slug=slug, - title=title, - aliases=aliases, - observer=observer, - ) - pending[future] = _TitleJob( - slug=slug, - title=title, - aliases=aliases, - started_at=monotonic(), - ) - return True - - executor = ThreadPoolExecutor(max_workers=max_workers) - try: - while len(pending) < max_workers and _submit_next(executor): - pass - - while pending: - done, _ = wait( - pending.keys(), - timeout=_CATALOG_STREAM_HEARTBEAT_SECONDS, - return_when=FIRST_COMPLETED, - ) - if not done: - completed = summary.crawled_titles + summary.failed_titles - if summary.discovered_titles > 0: - percent = round(completed / summary.discovered_titles * 100.0, 1) - logger.info( - "Provider 
catalog {}: crawling title details {}/{} ({}%) active={} queued={}", - provider_key, - completed, - summary.discovered_titles, - percent, - len(pending), - max(0, summary.discovered_titles - completed - len(pending)), - ) - else: - logger.info( - "Provider catalog {}: still discovering titles active={}", - provider_key, - len(pending), - ) - for future, job in list(pending.items()): - if job.timed_out or future.done(): - continue - elapsed = monotonic() - job.started_at - if elapsed < title_timeout_seconds: - continue - job.timed_out = True - _emit_title_failure( - provider_key=provider_key, - slug=job.slug, - title=job.title, - reason=( - f"title crawl exceeded {int(title_timeout_seconds)}s; " - "worker left running until underlying I/O returns" - ), - observer=observer, - ) - summary.failed_titles += 1 - continue - - for future in done: - job = pending.pop(future) - try: - record = future.result() - except Exception as exc: - if not job.timed_out: - summary.failed_titles += 1 - _emit_title_failure( - provider_key=provider_key, - slug=job.slug, - title=job.title, - reason=str(exc), - observer=observer, - ) - else: - if not job.timed_out: - emit_title(record) - summary.crawled_titles += 1 - if ( - observer is not None - and observer.on_title_crawled is not None - ): - observer.on_title_crawled(record.slug) - while len(pending) < max_workers and _submit_next(executor): - pass - finally: - executor.shutdown(wait=False, cancel_futures=True) - return summary - - -def stream_provider_catalog( - provider_key: str, - *, - emit_title: Callable[[TitleRecord], None], - observer: CatalogCrawlObserver | None = None, -) -> CatalogStreamSummary: - provider = get_provider(provider_key) - if provider is None: - return CatalogStreamSummary() - - if provider_key == "megakino": - client = get_default_megakino_client() - entries = client.load_index() - if observer is not None and observer.on_index_loaded is not None: - observer.on_index_loaded(len(entries)) - summary = 
CatalogStreamSummary(discovered_titles=len(entries)) - for entry in entries.values(): - parsed_title = entry.slug.replace("-", " ").title() - try: - live_title, _year = _parse_megakino_page_metadata(entry.url) - if live_title: - parsed_title = live_title - except Exception as exc: - logger.debug( - "Megakino metadata fetch failed for {}: {}", entry.url, exc - ) - emit_title( - TitleRecord( - provider=provider_key, - slug=entry.slug, - title=parsed_title, - aliases=[], - media_type_hint="movie" if entry.kind == "film" else "series", - relative_path=_relative_path(entry.url), - episodes=[], - canonical=CanonicalPayload(), - ) - ) - summary.crawled_titles += 1 - if observer is not None and observer.on_title_crawled is not None: - observer.on_title_crawled(entry.slug) - return summary - - logger.info("Provider catalog {}: loading title index", provider_key) - index = provider.load_or_refresh_index() - logger.info( - "Provider catalog {}: title index loaded ({} titles)", - provider_key, - len(index), - ) - logger.info("Provider catalog {}: loading title alternatives", provider_key) - alternatives = provider.load_or_refresh_alternatives() - logger.info( - "Provider catalog {}: title alternatives loaded ({} titles with aliases)", - provider_key, - len(alternatives), - ) - if observer is not None and observer.on_index_loaded is not None: - observer.on_index_loaded(len(index)) - title_timeout_seconds = float( - CATALOG_SITE_CONFIGS[provider_key].get( - "provider_index_title_timeout_seconds", - PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, - ) - ) - return _stream_aniworld_like_catalog( - provider_key=provider_key, - index=index, - alternatives=alternatives, - emit_title=emit_title, - observer=observer, - title_timeout_seconds=title_timeout_seconds, - ) - - def crawl_provider_catalog( provider_key: str, *, observer: CatalogCrawlObserver | None = None, ) -> list[TitleRecord]: - titles: list[TitleRecord] = [] - stream_provider_catalog( - provider_key, - emit_title=titles.append, - 
observer=observer, - ) - titles.sort(key=lambda item: item.slug) - return titles + rows = load_provider_title_index(provider_key, observer=observer) + for row in rows: + if observer is not None and observer.on_title_crawled is not None: + observer.on_title_crawled(row.slug) + return rows diff --git a/apps/api/app/config.py b/apps/api/app/config.py index 708b7c12..3ef40270 100644 --- a/apps/api/app/config.py +++ b/apps/api/app/config.py @@ -286,17 +286,17 @@ def _ensure_runtime_home() -> Path: if PROVIDER_INDEX_GLOBAL_CONCURRENCY < 1: PROVIDER_INDEX_GLOBAL_CONCURRENCY = 1 PROVIDER_INDEX_CONCURRENCY_ANIWORLD = _as_non_negative_int( - os.getenv("PROVIDER_INDEX_CONCURRENCY_ANIWORLD"), 1 + os.getenv("PROVIDER_INDEX_CONCURRENCY_ANIWORLD"), 4 ) if PROVIDER_INDEX_CONCURRENCY_ANIWORLD < 1: PROVIDER_INDEX_CONCURRENCY_ANIWORLD = 1 PROVIDER_INDEX_CONCURRENCY_STO = _as_non_negative_int( - os.getenv("PROVIDER_INDEX_CONCURRENCY_STO"), 1 + os.getenv("PROVIDER_INDEX_CONCURRENCY_STO"), 4 ) if PROVIDER_INDEX_CONCURRENCY_STO < 1: PROVIDER_INDEX_CONCURRENCY_STO = 1 PROVIDER_INDEX_CONCURRENCY_MEGAKINO = _as_non_negative_int( - os.getenv("PROVIDER_INDEX_CONCURRENCY_MEGAKINO"), 1 + os.getenv("PROVIDER_INDEX_CONCURRENCY_MEGAKINO"), 2 ) if PROVIDER_INDEX_CONCURRENCY_MEGAKINO < 1: PROVIDER_INDEX_CONCURRENCY_MEGAKINO = 1 @@ -306,11 +306,11 @@ def _ensure_runtime_home() -> Path: ) PROVIDER_INDEX_QUEUE_SIZE = max( 1, - _as_non_negative_int(os.getenv("PROVIDER_INDEX_QUEUE_SIZE"), 32), + _as_non_negative_int(os.getenv("PROVIDER_INDEX_QUEUE_SIZE"), 8), ) PROVIDER_INDEX_WRITER_BATCH_SIZE = max( 1, - _as_non_negative_int(os.getenv("PROVIDER_INDEX_WRITER_BATCH_SIZE"), 8), + _as_non_negative_int(os.getenv("PROVIDER_INDEX_WRITER_BATCH_SIZE"), 32), ) PROVIDER_INDEX_WRITER_FLUSH_SECONDS = max( 0.1, @@ -333,6 +333,22 @@ def _ensure_runtime_home() -> Path: 15.0, ), ) +CANONICAL_INDEX_CONCURRENCY = max( + 1, + _as_non_negative_int(os.getenv("CANONICAL_INDEX_CONCURRENCY"), 2), +) 
+CANONICAL_CACHE_MEMORY_MAX_SEARCH = max( + 1, + _as_non_negative_int(os.getenv("CANONICAL_CACHE_MEMORY_MAX_SEARCH"), 512), +) +CANONICAL_CACHE_MEMORY_MAX_SHOW = max( + 1, + _as_non_negative_int(os.getenv("CANONICAL_CACHE_MEMORY_MAX_SHOW"), 256), +) +CANONICAL_CACHE_TTL_SECONDS = max( + 1, + _as_non_negative_int(os.getenv("CANONICAL_CACHE_TTL_SECONDS"), 3600), +) logger.debug( f"ANIWORLD_ALPHABET_HTML={ANIWORLD_ALPHABET_HTML}, ANIWORLD_ALPHABET_URL={ANIWORLD_ALPHABET_URL}" @@ -370,6 +386,13 @@ def _ensure_runtime_home() -> Path: PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT, PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS, ) +logger.debug( + "Canonical index/cache: concurrency={} search_cache_max={} show_cache_max={} ttl_seconds={}", + CANONICAL_INDEX_CONCURRENCY, + CANONICAL_CACHE_MEMORY_MAX_SEARCH, + CANONICAL_CACHE_MEMORY_MAX_SHOW, + CANONICAL_CACHE_TTL_SECONDS, +) # TTL (Stunden) für Live-Index; 0 = nie neu laden (nur einmal pro Prozess) ANIWORLD_TITLES_REFRESH_HOURS = float(os.getenv("ANIWORLD_TITLES_REFRESH_HOURS", "24")) diff --git a/apps/api/app/db/migrations/versions/20260429_0006_provider_index_stages.py b/apps/api/app/db/migrations/versions/20260429_0006_provider_index_stages.py new file mode 100644 index 00000000..9e3373f2 --- /dev/null +++ b/apps/api/app/db/migrations/versions/20260429_0006_provider_index_stages.py @@ -0,0 +1,195 @@ +"""Add staged provider index status and title enrichment state + +Revision ID: 20260429_0006 +Revises: 20260429_0005 +Create Date: 2026-04-29 00:30:00.000000 +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260429_0006" +down_revision = "20260429_0005" +branch_labels = None +depends_on = None + + +def _has_column(inspector, table_name: str, column_name: str) -> bool: + return column_name in { + column["name"] for column in inspector.get_columns(table_name) + } + + +def upgrade() -> None: + conn = op.get_bind() + inspector = sa.inspect(conn) + + if 
inspector.has_table("providerindexstatus"): + with op.batch_alter_table("providerindexstatus") as batch_op: + if not _has_column(inspector, "providerindexstatus", "active_stage"): + batch_op.add_column( + sa.Column("active_stage", sa.String(), nullable=True) + ) + batch_op.create_index( + "ix_providerindexstatus_active_stage", + ["active_stage"], + unique=False, + ) + if not _has_column(inspector, "providerindexstatus", "title_index_status"): + batch_op.add_column( + sa.Column( + "title_index_status", + sa.String(), + nullable=False, + server_default="pending", + ) + ) + batch_op.create_index( + "ix_providerindexstatus_title_index_status", + ["title_index_status"], + unique=False, + ) + if not _has_column( + inspector, "providerindexstatus", "title_index_ready_at" + ): + batch_op.add_column( + sa.Column("title_index_ready_at", sa.DateTime(), nullable=True) + ) + batch_op.create_index( + "ix_providerindexstatus_title_index_ready_at", + ["title_index_ready_at"], + unique=False, + ) + if not _has_column( + inspector, "providerindexstatus", "title_index_next_retry_after" + ): + batch_op.add_column( + sa.Column( + "title_index_next_retry_after", + sa.DateTime(), + nullable=True, + ) + ) + batch_op.create_index( + "ix_providerindexstatus_title_index_next_retry_after", + ["title_index_next_retry_after"], + unique=False, + ) + if not _has_column( + inspector, "providerindexstatus", "detail_enrichment_status" + ): + batch_op.add_column( + sa.Column( + "detail_enrichment_status", + sa.String(), + nullable=False, + server_default="pending", + ) + ) + batch_op.create_index( + "ix_providerindexstatus_detail_enrichment_status", + ["detail_enrichment_status"], + unique=False, + ) + if not _has_column(inspector, "providerindexstatus", "detail_ready_at"): + batch_op.add_column( + sa.Column("detail_ready_at", sa.DateTime(), nullable=True) + ) + batch_op.create_index( + "ix_providerindexstatus_detail_ready_at", + ["detail_ready_at"], + unique=False, + ) + if not _has_column( + 
inspector, "providerindexstatus", "detail_next_retry_after" + ): + batch_op.add_column( + sa.Column("detail_next_retry_after", sa.DateTime(), nullable=True) + ) + batch_op.create_index( + "ix_providerindexstatus_detail_next_retry_after", + ["detail_next_retry_after"], + unique=False, + ) + if not _has_column( + inspector, "providerindexstatus", "canonical_enrichment_status" + ): + batch_op.add_column( + sa.Column( + "canonical_enrichment_status", + sa.String(), + nullable=False, + server_default="pending", + ) + ) + batch_op.create_index( + "ix_providerindexstatus_canonical_enrichment_status", + ["canonical_enrichment_status"], + unique=False, + ) + if not _has_column(inspector, "providerindexstatus", "canonical_ready_at"): + batch_op.add_column( + sa.Column("canonical_ready_at", sa.DateTime(), nullable=True) + ) + batch_op.create_index( + "ix_providerindexstatus_canonical_ready_at", + ["canonical_ready_at"], + unique=False, + ) + if not _has_column( + inspector, "providerindexstatus", "canonical_next_retry_after" + ): + batch_op.add_column( + sa.Column( + "canonical_next_retry_after", + sa.DateTime(), + nullable=True, + ) + ) + batch_op.create_index( + "ix_providerindexstatus_canonical_next_retry_after", + ["canonical_next_retry_after"], + unique=False, + ) + + if inspector.has_table("providertitleindexstate"): + with op.batch_alter_table("providertitleindexstate") as batch_op: + staged_columns = [ + ("detail_status", sa.String(), "pending"), + ("detail_last_attempted_at", sa.DateTime(), None), + ("detail_last_success_at", sa.DateTime(), None), + ("detail_next_retry_after", sa.DateTime(), None), + ("detail_failure_count", sa.Integer(), 0), + ("detail_last_error_summary", sa.String(), None), + ("canonical_status", sa.String(), "pending"), + ("canonical_last_attempted_at", sa.DateTime(), None), + ("canonical_last_success_at", sa.DateTime(), None), + ("canonical_next_retry_after", sa.DateTime(), None), + ("canonical_failure_count", sa.Integer(), 0), + 
("canonical_last_error_summary", sa.String(), None), + ] + for name, column_type, default in staged_columns: + if _has_column(inspector, "providertitleindexstate", name): + continue + kwargs = {"nullable": True} + if default is not None: + kwargs["server_default"] = str(default) + kwargs["nullable"] = False + batch_op.add_column(sa.Column(name, column_type, **kwargs)) + if ( + name.endswith("_status") + or name.endswith("_at") + or name.endswith("_retry_after") + ): + batch_op.create_index( + f"ix_providertitleindexstate_{name}", + [name], + unique=False, + ) + + +def downgrade() -> None: + pass diff --git a/apps/api/app/db/models.py b/apps/api/app/db/models.py index 1ddc6873..f320ba61 100644 --- a/apps/api/app/db/models.py +++ b/apps/api/app/db/models.py @@ -11,7 +11,7 @@ # Defer logger configuration to application startup from sqlmodel import SQLModel, Field, Session, create_engine, select, Column, JSON -from sqlalchemy import tuple_ +from sqlalchemy import event, tuple_ from sqlalchemy.orm import registry as sa_registry from sqlalchemy.pool import NullPool @@ -27,6 +27,12 @@ "ready", "failed", ] +CatalogStageStatus = Literal[ + "pending", + "running", + "ready", + "failed", +] CatalogMappingConfidence = Literal[ "confirmed", "high_confidence", @@ -180,6 +186,7 @@ class ProviderIndexStatus(ModelBase, table=True): provider: str = Field(primary_key=True) refresh_interval_hours: float = 24.0 status: str = Field(default="pending", index=True) + active_stage: Optional[str] = Field(default=None, index=True) current_generation: Optional[str] = None latest_success_generation: Optional[str] = None latest_started_at: Optional[datetime] = Field(default=None, index=True) @@ -187,6 +194,18 @@ class ProviderIndexStatus(ModelBase, table=True): latest_success_at: Optional[datetime] = Field(default=None, index=True) next_refresh_after: Optional[datetime] = Field(default=None, index=True) bootstrap_completed: bool = Field(default=False, index=True) + title_index_status: str = 
Field(default="pending", index=True) + title_index_ready_at: Optional[datetime] = Field(default=None, index=True) + title_index_next_retry_after: Optional[datetime] = Field(default=None, index=True) + detail_enrichment_status: str = Field(default="pending", index=True) + detail_ready_at: Optional[datetime] = Field(default=None, index=True) + detail_next_retry_after: Optional[datetime] = Field(default=None, index=True) + canonical_enrichment_status: str = Field(default="pending", index=True) + canonical_ready_at: Optional[datetime] = Field(default=None, index=True) + canonical_next_retry_after: Optional[datetime] = Field( + default=None, + index=True, + ) failure_count: int = 0 last_error_summary: Optional[str] = None cursor_title_slug: Optional[str] = None @@ -200,6 +219,18 @@ class ProviderTitleIndexState(ModelBase, table=True): last_success_at: Optional[datetime] = Field(default=None, index=True) failure_count: int = 0 last_error_summary: Optional[str] = None + detail_status: str = Field(default="pending", index=True) + detail_last_attempted_at: Optional[datetime] = Field(default=None, index=True) + detail_last_success_at: Optional[datetime] = Field(default=None, index=True) + detail_next_retry_after: Optional[datetime] = Field(default=None, index=True) + detail_failure_count: int = 0 + detail_last_error_summary: Optional[str] = None + canonical_status: str = Field(default="pending", index=True) + canonical_last_attempted_at: Optional[datetime] = Field(default=None, index=True) + canonical_last_success_at: Optional[datetime] = Field(default=None, index=True) + canonical_next_retry_after: Optional[datetime] = Field(default=None, index=True) + canonical_failure_count: int = 0 + canonical_last_error_summary: Optional[str] = None updated_at: datetime = Field(default_factory=utcnow, index=True) @@ -328,11 +359,30 @@ class ProviderMovieMapping(ModelBase, table=True): engine = create_engine( DATABASE_URL, - connect_args={"check_same_thread": False}, + connect_args={ + 
"check_same_thread": False, + "timeout": 30, + "autocommit": False, + }, poolclass=NullPool, # ensure connections are closed when sessions end ) logger.debug("SQLModel engine created.") + +@event.listens_for(engine, "connect") +def _configure_sqlite_connection(dbapi_connection, _connection_record) -> None: + autocommit = getattr(dbapi_connection, "autocommit", None) + if autocommit is not None: + dbapi_connection.autocommit = True + cursor = dbapi_connection.cursor() + cursor.execute("PRAGMA journal_mode=WAL") + cursor.execute("PRAGMA synchronous=NORMAL") + cursor.execute("PRAGMA busy_timeout=30000") + cursor.close() + if autocommit is not None: + dbapi_connection.autocommit = autocommit + + MIGRATION_BASE_REVISION = "20260203_0001" @@ -733,6 +783,7 @@ def upsert_provider_index_status( provider: str, refresh_interval_hours: float, status: Optional[str] = None, + active_stage: Optional[str] | object = _UNSET, current_generation: Optional[str] | object = _UNSET, latest_success_generation: Optional[str] | object = _UNSET, latest_started_at: Optional[datetime] | object = _UNSET, @@ -740,6 +791,15 @@ def upsert_provider_index_status( latest_success_at: Optional[datetime] | object = _UNSET, next_refresh_after: Optional[datetime] | object = _UNSET, bootstrap_completed: Optional[bool] = None, + title_index_status: Optional[str] = None, + title_index_ready_at: Optional[datetime] | object = _UNSET, + title_index_next_retry_after: Optional[datetime] | object = _UNSET, + detail_enrichment_status: Optional[str] = None, + detail_ready_at: Optional[datetime] | object = _UNSET, + detail_next_retry_after: Optional[datetime] | object = _UNSET, + canonical_enrichment_status: Optional[str] = None, + canonical_ready_at: Optional[datetime] | object = _UNSET, + canonical_next_retry_after: Optional[datetime] | object = _UNSET, failure_count: Optional[int] = None, last_error_summary: Optional[str] | object = _UNSET, cursor_title_slug: Optional[str] | object = _UNSET, @@ -754,6 +814,8 @@ 
def upsert_provider_index_status( rec.refresh_interval_hours = refresh_interval_hours if status is not None: rec.status = status + if active_stage is not _UNSET: + rec.active_stage = active_stage if current_generation is not _UNSET: rec.current_generation = current_generation if latest_success_generation is not _UNSET: @@ -768,6 +830,24 @@ def upsert_provider_index_status( rec.next_refresh_after = next_refresh_after if bootstrap_completed is not None: rec.bootstrap_completed = bootstrap_completed + if title_index_status is not None: + rec.title_index_status = title_index_status + if title_index_ready_at is not _UNSET: + rec.title_index_ready_at = title_index_ready_at + if title_index_next_retry_after is not _UNSET: + rec.title_index_next_retry_after = title_index_next_retry_after + if detail_enrichment_status is not None: + rec.detail_enrichment_status = detail_enrichment_status + if detail_ready_at is not _UNSET: + rec.detail_ready_at = detail_ready_at + if detail_next_retry_after is not _UNSET: + rec.detail_next_retry_after = detail_next_retry_after + if canonical_enrichment_status is not None: + rec.canonical_enrichment_status = canonical_enrichment_status + if canonical_ready_at is not _UNSET: + rec.canonical_ready_at = canonical_ready_at + if canonical_next_retry_after is not _UNSET: + rec.canonical_next_retry_after = canonical_next_retry_after if failure_count is not None: rec.failure_count = failure_count if last_error_summary is not _UNSET: @@ -805,6 +885,18 @@ def upsert_provider_title_index_state( succeeded_at: Optional[datetime] = None, failure_count: Optional[int] = None, last_error_summary: Optional[str] = None, + detail_status: Optional[str] = None, + detail_attempted_at: Optional[datetime] = None, + detail_succeeded_at: Optional[datetime] = None, + detail_next_retry_after: Optional[datetime] | object = _UNSET, + detail_failure_count: Optional[int] = None, + detail_last_error_summary: Optional[str] | object = _UNSET, + canonical_status: Optional[str] 
= None, + canonical_attempted_at: Optional[datetime] = None, + canonical_succeeded_at: Optional[datetime] = None, + canonical_next_retry_after: Optional[datetime] | object = _UNSET, + canonical_failure_count: Optional[int] = None, + canonical_last_error_summary: Optional[str] | object = _UNSET, commit: bool = True, ) -> ProviderTitleIndexState: rec = session.get(ProviderTitleIndexState, (provider, slug)) @@ -818,6 +910,30 @@ def upsert_provider_title_index_state( rec.failure_count = failure_count if last_error_summary is not None: rec.last_error_summary = last_error_summary + if detail_status is not None: + rec.detail_status = detail_status + if detail_attempted_at is not None: + rec.detail_last_attempted_at = detail_attempted_at + if detail_succeeded_at is not None: + rec.detail_last_success_at = detail_succeeded_at + if detail_next_retry_after is not _UNSET: + rec.detail_next_retry_after = detail_next_retry_after + if detail_failure_count is not None: + rec.detail_failure_count = detail_failure_count + if detail_last_error_summary is not _UNSET: + rec.detail_last_error_summary = detail_last_error_summary + if canonical_status is not None: + rec.canonical_status = canonical_status + if canonical_attempted_at is not None: + rec.canonical_last_attempted_at = canonical_attempted_at + if canonical_succeeded_at is not None: + rec.canonical_last_success_at = canonical_succeeded_at + if canonical_next_retry_after is not _UNSET: + rec.canonical_next_retry_after = canonical_next_retry_after + if canonical_failure_count is not None: + rec.canonical_failure_count = canonical_failure_count + if canonical_last_error_summary is not _UNSET: + rec.canonical_last_error_summary = canonical_last_error_summary rec.updated_at = utcnow() session.add(rec) if commit: @@ -867,12 +983,6 @@ def replace_provider_catalog_aliases( aliases: List[str], indexed_generation: str, ) -> None: - session.exec( - select(ProviderCatalogAlias).where( - (ProviderCatalogAlias.provider == provider) - & 
(ProviderCatalogAlias.slug == slug) - ) - ).all() session.exec( ProviderCatalogAlias.__table__.delete().where( (ProviderCatalogAlias.provider == provider) @@ -1254,12 +1364,34 @@ def is_catalog_bootstrap_ready( } return all( status is not None - and status.bootstrap_completed + and (status.title_index_status == "ready" or status.bootstrap_completed) and bool(status.latest_success_generation) for status in (statuses.get(provider) for provider in providers) ) +def is_provider_stage_ready(status: ProviderIndexStatus | None, *, stage: str) -> bool: + if status is None: + return False + if stage == "title_index": + return ( + status.title_index_status == "ready" or status.bootstrap_completed + ) and bool(status.latest_success_generation) + if stage == "detail_enrichment": + return status.detail_enrichment_status == "ready" + if stage == "canonical_enrichment": + return status.canonical_enrichment_status == "ready" + raise ValueError(f"Unknown provider stage: {stage}") + + +def is_provider_fully_ready(status: ProviderIndexStatus | None) -> bool: + return ( + is_provider_stage_ready(status, stage="title_index") + and is_provider_stage_ready(status, stage="detail_enrichment") + and is_provider_stage_ready(status, stage="canonical_enrichment") + ) + + def catalog_title_count(session: Session, *, provider: Optional[str] = None) -> int: stmt = select(ProviderCatalogTitle) if provider: diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py index 14129d9c..75efbaf3 100644 --- a/apps/api/tests/unit/catalog/test_indexer.py +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -1,4 +1,5 @@ from datetime import datetime, timezone +from threading import Event, Thread from types import SimpleNamespace @@ -201,3 +202,225 @@ def test_is_due_handles_naive_next_refresh_after(): ) assert ProviderCatalogIndexer()._is_due(status) is True + + +def test_failed_first_bootstrap_respects_future_retry_backoff(): + from datetime import timedelta + + 
from app.catalog.indexer import ProviderCatalogIndexer + from app.db import utcnow + + now = utcnow() + status = SimpleNamespace( + provider="aniworld.to", + status="failed", + latest_success_generation=None, + title_index_status="failed", + title_index_next_retry_after=now + timedelta(hours=2), + next_refresh_after=None, + ) + + assert ProviderCatalogIndexer()._is_due(status) is False + + +def test_pick_due_stage_prefers_detail_then_canonical(monkeypatch): + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + monkeypatch.setattr(indexer, "_detail_stage_has_due_work", lambda provider: True) + monkeypatch.setattr(indexer, "_canonical_stage_has_due_work", lambda provider: True) + + status = SimpleNamespace( + provider="aniworld.to", + status="partial", + latest_success_generation="gen-1", + title_index_status="ready", + detail_enrichment_status="pending", + detail_next_retry_after=None, + canonical_enrichment_status="pending", + canonical_next_retry_after=None, + next_refresh_after=None, + title_index_next_retry_after=None, + ) + + assert indexer._pick_due_stage(status) == "detail_enrichment" + + +def test_progress_snapshot_exposes_staged_readiness(client): + from app.catalog.indexer import get_catalog_indexer + from app.db import engine, upsert_provider_index_status + from sqlmodel import Session + + with Session(engine) as session: + upsert_provider_index_status( + session, + provider="aniworld.to", + refresh_interval_hours=24.0, + status="partial", + latest_success_generation="gen-1", + current_generation="gen-1", + bootstrap_completed=True, + title_index_status="ready", + detail_enrichment_status="pending", + canonical_enrichment_status="failed", + ) + upsert_provider_index_status( + session, + provider="s.to", + refresh_interval_hours=24.0, + status="ready", + latest_success_generation="gen-1", + current_generation="gen-1", + bootstrap_completed=True, + title_index_status="ready", + detail_enrichment_status="ready", + 
canonical_enrichment_status="ready", + ) + upsert_provider_index_status( + session, + provider="megakino", + refresh_interval_hours=24.0, + status="ready", + latest_success_generation="gen-1", + current_generation="gen-1", + bootstrap_completed=True, + title_index_status="ready", + detail_enrichment_status="ready", + canonical_enrichment_status="ready", + ) + + snapshot = get_catalog_indexer().get_progress_snapshot() + by_provider = {item["provider"]: item for item in snapshot["providers"]} + + assert snapshot["bootstrap_ready"] is True + assert by_provider["aniworld.to"]["search_ready"] is True + assert by_provider["aniworld.to"]["full_ready"] is False + assert by_provider["aniworld.to"]["detail_enrichment_status"] == "pending" + assert by_provider["aniworld.to"]["canonical_enrichment_status"] == "failed" + + +def test_write_coordinator_serializes_callbacks(monkeypatch): + import app.catalog.indexer as indexer_module + from app.catalog.indexer import CatalogIndexWriteCoordinator + + started = Event() + release = Event() + order: list[str] = [] + + class FakeSession: + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def commit(self): + return None + + monkeypatch.setattr(indexer_module, "Session", lambda _engine: FakeSession()) + + coordinator = CatalogIndexWriteCoordinator() + + def first() -> None: + coordinator.run( + lambda _session: (order.append("first"), started.set(), release.wait(1)) + ) + + def second() -> None: + started.wait(1) + coordinator.run(lambda _session: order.append("second")) + + t1 = Thread(target=first) + t2 = Thread(target=second) + t1.start() + t2.start() + started.wait(1) + assert order == ["first"] + release.set() + t1.join() + t2.join() + + assert order == ["first", "second"] + + +def test_detail_stage_persists_one_title_incrementally(client): + from app.catalog.indexer import ProviderCatalogIndexer + from app.catalog.providers import EpisodeLanguageRecord, EpisodeRecord, TitleRecord + 
from app.db import ( + ProviderCatalogEpisode, + ProviderTitleIndexState, + engine, + replace_provider_catalog_title, + upsert_provider_index_status, + upsert_provider_title_index_state, + ) + from sqlmodel import Session, select + + with Session(engine) as session: + upsert_provider_index_status( + session, + provider="aniworld.to", + refresh_interval_hours=24.0, + status="partial", + latest_success_generation="gen-1", + current_generation="gen-1", + bootstrap_completed=True, + title_index_status="ready", + detail_enrichment_status="pending", + canonical_enrichment_status="pending", + ) + replace_provider_catalog_title( + session, + provider="aniworld.to", + slug="demo", + title="Demo", + media_type_hint="series", + relative_path="/anime/stream/demo", + indexed_generation="gen-1", + ) + upsert_provider_title_index_state( + session, + provider="aniworld.to", + slug="demo", + detail_status="pending", + ) + session.commit() + + indexer = ProviderCatalogIndexer() + indexer._persist_stage_success( + provider="aniworld.to", + stage="detail_enrichment", + title_row=SimpleNamespace(slug="demo"), + payload=TitleRecord( + provider="aniworld.to", + slug="demo", + title="Demo", + aliases=["Demo"], + media_type_hint="series", + relative_path="/anime/stream/demo", + episodes=[ + EpisodeRecord( + season=1, + episode=1, + relative_path="/anime/stream/demo/staffel-1/episode-1", + title_primary="Episode 1", + title_secondary=None, + media_type_hint="episode", + languages=[ + EpisodeLanguageRecord( + language="German Dub", + host_hints=["VOE"], + ) + ], + ) + ], + ), + ) + + with Session(engine) as session: + episodes = session.exec(select(ProviderCatalogEpisode)).all() + state = session.get(ProviderTitleIndexState, ("aniworld.to", "demo")) + + assert len(episodes) == 1 + assert state is not None + assert state.detail_status == "ready" diff --git a/apps/api/tests/unit/catalog/test_metadata.py b/apps/api/tests/unit/catalog/test_metadata.py new file mode 100644 index 
00000000..c82d10eb --- /dev/null +++ b/apps/api/tests/unit/catalog/test_metadata.py @@ -0,0 +1,41 @@ +from __future__ import annotations + + +def test_ttl_lru_cache_evicts_oldest_entry(): + from app.catalog.metadata import TtlLruCache + + cache = TtlLruCache[str, int](max_entries=2, ttl_seconds=3600) + cache.set("a", 1) + cache.set("b", 2) + cache.set("c", 3) + + assert cache.get("a") is None + assert cache.get("b") == 2 + assert cache.get("c") == 3 + assert cache.size() == 2 + + +def test_canonical_cache_stats_are_bounded(monkeypatch): + import app.catalog.metadata as metadata + + search_cache = metadata.TtlLruCache[str, list[dict[str, object]]]( + max_entries=2, + ttl_seconds=3600, + ) + show_cache = metadata.TtlLruCache[int, dict[str, object]]( + max_entries=1, + ttl_seconds=3600, + ) + monkeypatch.setattr(metadata, "_search_cache", search_cache) + monkeypatch.setattr(metadata, "_show_cache", show_cache) + + search_cache.set("foo", [{"id": 1}]) + search_cache.set("bar", [{"id": 2}]) + search_cache.set("baz", [{"id": 3}]) + show_cache.set(1, {"id": 1}) + show_cache.set(2, {"id": 2}) + + assert metadata.canonical_cache_stats() == { + "search_entries": 2, + "show_entries": 1, + } From f9313654785070651d000452919e575916ddaea1 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 23:12:22 +0200 Subject: [PATCH 23/45] fix(qbittorrent): avoid sqlite write races when starting downloads Defer background worker startup until after the qBittorrent add request has persisted its client task and job metadata. This prevents the request thread and scheduler worker from contending on the same SQLite rows during the initial add flow, which was surfacing as Sonarr download warnings. Add regression coverage for deferred worker startup in the qBittorrent-compatible API. 
--- apps/api/app/api/qbittorrent/torrents.py | 9 +++-- apps/api/app/core/scheduler.py | 39 ++++++++++++------- apps/api/tests/conftest.py | 7 +++- .../api/qbittorrent/test_torrents.py | 33 ++++++++++++++++ 4 files changed, 69 insertions(+), 19 deletions(-) diff --git a/apps/api/app/api/qbittorrent/torrents.py b/apps/api/app/api/qbittorrent/torrents.py index 3cd0a2cf..7e3bf67f 100644 --- a/apps/api/app/api/qbittorrent/torrents.py +++ b/apps/api/app/api/qbittorrent/torrents.py @@ -20,7 +20,7 @@ delete_client_task, get_job, ) -from app.core.scheduler import schedule_download, cancel_job +from app.core.scheduler import cancel_job, schedule_download, start_scheduled_job from . import router from .common import public_save_path @@ -101,8 +101,8 @@ def torrents_add( req["provider"] = provider if mode: req["mode"] = mode - job_id = schedule_download(req) - logger.debug(f"Scheduled job_id: {job_id}") + job_id = schedule_download(req, autostart=False) + logger.debug(f"Created scheduled job_id: {job_id}") if not savepath: savepath = str(DOWNLOAD_DIR) @@ -127,6 +127,9 @@ def torrents_add( btih, "queued" if paused else "downloading", site ) ) + if not paused: + start_scheduled_job(job_id, req) + logger.debug(f"Started background worker for job_id: {job_id}") return PlainTextResponse("Ok.") diff --git a/apps/api/app/core/scheduler.py b/apps/api/app/core/scheduler.py index 25eeb87b..999eb775 100644 --- a/apps/api/app/core/scheduler.py +++ b/apps/api/app/core/scheduler.py @@ -309,7 +309,26 @@ def _run_strm(job_id: str, req: dict, stop_event: threading.Event) -> None: RUNNING.pop(job_id, None) -def schedule_download(req: dict) -> str: +def start_scheduled_job(job_id: str, req: dict) -> None: + init_executor() + if EXECUTOR is None: + raise RuntimeError("executor not available") + + stop_event = threading.Event() + mode = str(req.get("mode") or "").strip().lower() + runner = _run_strm if mode == "strm" else _run_download + fut = EXECUTOR.submit(runner, job_id, req, stop_event) + 
with RUNNING_LOCK: + RUNNING[job_id] = (fut, stop_event) + + +def create_scheduled_job(req: dict) -> str: + with Session(engine) as s: + job = create_job(s, source_site=req.get("site") or "aniworld.to") + return job.id + + +def schedule_download(req: dict, *, autostart: bool = True) -> str: """ Schedule a background download job and return its job identifier. @@ -330,20 +349,10 @@ def schedule_download(req: dict) -> str: Raises: RuntimeError: If the thread pool executor is unavailable after initialization. """ - init_executor() - if EXECUTOR is None: - raise RuntimeError("executor not available") - - with Session(engine) as s: - job = create_job(s, source_site=req.get("site") or "aniworld.to") - - stop_event = threading.Event() - mode = str(req.get("mode") or "").strip().lower() - runner = _run_strm if mode == "strm" else _run_download - fut = EXECUTOR.submit(runner, job.id, req, stop_event) - with RUNNING_LOCK: - RUNNING[job.id] = (fut, stop_event) - return job.id + job_id = create_scheduled_job(req) + if autostart: + start_scheduled_job(job_id, req) + return job_id def cancel_job(job_id: str) -> None: diff --git a/apps/api/tests/conftest.py b/apps/api/tests/conftest.py index 0a03f74d..fe6a123f 100644 --- a/apps/api/tests/conftest.py +++ b/apps/api/tests/conftest.py @@ -128,7 +128,12 @@ def client(tmp_path, monkeypatch): create_db_and_tables() - monkeypatch.setattr(qb_torrents, "schedule_download", lambda req: "job-1") + monkeypatch.setattr( + qb_torrents, + "schedule_download", + lambda req, autostart=True: "job-1", + ) + monkeypatch.setattr(qb_torrents, "start_scheduled_job", lambda job_id, req: None) monkeypatch.setattr(qb_torrents, "cancel_job", lambda job_id: None) with TestClient(app) as c: diff --git a/apps/api/tests/integration/api/qbittorrent/test_torrents.py b/apps/api/tests/integration/api/qbittorrent/test_torrents.py index 4ce10341..d54720b7 100644 --- a/apps/api/tests/integration/api/qbittorrent/test_torrents.py +++ 
b/apps/api/tests/integration/api/qbittorrent/test_torrents.py @@ -71,3 +71,36 @@ def test_torrents_add_aw_and_sto_prefixes(client): info = client.get("/api/v2/torrents/info") items = info.json() assert len(items) == 2 + + +def test_torrents_add_starts_worker_after_task_write(client, monkeypatch): + from app.utils.magnet import build_magnet + import app.api.qbittorrent.torrents as qb_torrents + + calls: list[tuple[str, str]] = [] + + monkeypatch.setattr( + qb_torrents, + "schedule_download", + lambda req, autostart=True: calls.append( + ("schedule", "autostart" if autostart else "deferred") + ) + or "job-1", + ) + monkeypatch.setattr( + qb_torrents, + "start_scheduled_job", + lambda job_id, req: calls.append(("start", job_id)), + ) + + magnet = build_magnet( + title="Title", + slug="slug", + season=1, + episode=1, + language="German Dub", + ) + response = client.post("/api/v2/torrents/add", data={"urls": magnet}) + + assert response.status_code == 200 + assert calls == [("schedule", "deferred"), ("start", "job-1")] From e7aec8ea7f4556412ae58240ed40281c281a016a Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 23:12:27 +0200 Subject: [PATCH 24/45] fix(catalog): backfill legacy staged readiness fields safely Normalize legacy provider index rows into the staged readiness model without assuming every persisted status object already has the new stage attributes populated. This keeps health output consistent after the staged catalog migration and prevents bootstrap-ready providers from reporting pending title index state due to missing backfilled fields. Add focused tests for interrupted-state recovery and legacy stage backfill behavior. 
--- apps/api/app/catalog/indexer.py | 52 +++++++++++++++++++++ apps/api/tests/unit/catalog/test_indexer.py | 49 +++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index 96662899..d24f5e0b 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -419,6 +419,58 @@ def _ensure_status_rows(self) -> None: ) ) ) + continue + if self._needs_stage_backfill(status): + self._writer.run( + lambda session, provider=provider, status=status, hours=hours: ( + self._backfill_legacy_stage_state( + session, + provider=provider, + status=status, + refresh_interval_hours=hours, + ) + ) + ) + + def _needs_stage_backfill(self, status: ProviderIndexStatus) -> bool: + return bool( + getattr(status, "bootstrap_completed", False) + and getattr(status, "latest_success_generation", None) + and getattr(status, "title_index_status", "pending") == "pending" + ) + + def _backfill_legacy_stage_state( + self, + session: Session, + *, + provider: str, + status: ProviderIndexStatus, + refresh_interval_hours: float, + ) -> None: + ready_at = ( + getattr(status, "latest_completed_at", None) + or getattr(status, "latest_success_at", None) + or utcnow() + ) + full_ready = getattr(status, "status", None) == "ready" + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + title_index_status="ready", + title_index_ready_at=getattr(status, "title_index_ready_at", None) + or ready_at, + title_index_next_retry_after=None, + detail_enrichment_status="ready" if full_ready else "pending", + detail_ready_at=getattr(status, "detail_ready_at", None) + or (ready_at if full_ready else None), + detail_next_retry_after=None, + canonical_enrichment_status="ready" if full_ready else "pending", + canonical_ready_at=getattr(status, "canonical_ready_at", None) + or (ready_at if full_ready else None), + canonical_next_retry_after=None, + commit=False, + ) def 
_cleanup_stale_generation( self, diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py index 75efbaf3..85595d6f 100644 --- a/apps/api/tests/unit/catalog/test_indexer.py +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -299,6 +299,55 @@ def test_progress_snapshot_exposes_staged_readiness(client): assert by_provider["aniworld.to"]["canonical_enrichment_status"] == "failed" +def test_ensure_status_rows_backfills_legacy_ready_stage_fields(monkeypatch): + import app.catalog.indexer as indexer_module + from app.catalog.indexer import ProviderCatalogIndexer + from app.db import utcnow + + ready_at = utcnow() + legacy_status = SimpleNamespace( + bootstrap_completed=True, + latest_success_generation="gen-1", + title_index_status="pending", + latest_completed_at=ready_at, + latest_success_at=ready_at, + status="ready", + ) + recorded: list[dict[str, object]] = [] + + class FakeSession: + def __enter__(self): + return object() + + def __exit__(self, exc_type, exc, tb): + return False + + monkeypatch.setattr(indexer_module, "Session", lambda _engine: FakeSession()) + monkeypatch.setattr( + indexer_module, + "get_provider_index_status", + lambda _session, provider: legacy_status, + ) + + indexer = ProviderCatalogIndexer() + monkeypatch.setattr( + indexer._writer, + "run", + lambda callback: callback(object()), + ) + monkeypatch.setattr( + indexer_module, + "upsert_provider_index_status", + lambda _session, **kwargs: recorded.append(kwargs), + ) + + indexer._ensure_status_rows() + + assert any(item["title_index_status"] == "ready" for item in recorded) + assert any(item["detail_enrichment_status"] == "ready" for item in recorded) + assert any(item["canonical_enrichment_status"] == "ready" for item in recorded) + + def test_write_coordinator_serializes_callbacks(monkeypatch): import app.catalog.indexer as indexer_module from app.catalog.indexer import CatalogIndexWriteCoordinator From b675f4804650fa8b9841c0077c81ce7187fc29b2 
Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 23:35:40 +0200 Subject: [PATCH 25/45] refactor(db): add static type hinting mirror for dynamic exports Co-authored-by: Copilot --- apps/api/app/db/__init__.py | 5 +++++ apps/api/app/db/__init__.pyi | 6 ++++++ 2 files changed, 11 insertions(+) create mode 100644 apps/api/app/db/__init__.pyi diff --git a/apps/api/app/db/__init__.py b/apps/api/app/db/__init__.py index 379ec44e..71c12b5e 100644 --- a/apps/api/app/db/__init__.py +++ b/apps/api/app/db/__init__.py @@ -3,9 +3,14 @@ This package contains the SQLite/SQLModel data models and related utilities that used to live in `app/models.py`. Functionality and public API are preserved; imports should now use `from app.db import ...`. +Runtime export is intentionally dynamic; see `__init__.pyi` for Pylance food. """ from . import models as _models +# TODO: Clean up this runtime facade and static type hinting +# It's a bit hacky but never caused any issues +# we should replace the dynamic globals().update(...) with a normal star import +# or eventually explicitly define what this package exports __all__ = [name for name in dir(_models) if not name.startswith("_")] globals().update({name: getattr(_models, name) for name in __all__}) diff --git a/apps/api/app/db/__init__.pyi b/apps/api/app/db/__init__.pyi new file mode 100644 index 00000000..7d9683b6 --- /dev/null +++ b/apps/api/app/db/__init__.pyi @@ -0,0 +1,6 @@ +"""Static mirror of the dynamic exports in `__init__.py`. + +Python is fine with `globals().update(...)`; Pylance needs a little snack. +""" + +from .models import * From 00ca142c4b66f12491ca00204e576797a28d4506 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Wed, 29 Apr 2026 23:56:53 +0200 Subject: [PATCH 26/45] fix(downloader): bound direct-link resolution per host Add a hard timeout around provider direct-link resolution so a stuck upstream host cannot leave a job in downloading forever. 
Expose the timeout as PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS and update the example environment configuration accordingly. Add regression coverage for timed-out host resolution and config parsing of the new timeout value. --- apps/api/.env.example | 5 ++ apps/api/app/config.py | 13 +++++ .../core/downloader/provider_resolution.py | 46 ++++++++++++++- apps/api/tests/unit/app/test_config.py | 2 + .../downloader/test_provider_resolution.py | 56 +++++++++++++++++++ 5 files changed, 120 insertions(+), 2 deletions(-) create mode 100644 apps/api/tests/unit/core/downloader/test_provider_resolution.py diff --git a/apps/api/.env.example b/apps/api/.env.example index bf527da9..62fc5c5f 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -167,6 +167,11 @@ PROVIDER_ORDER=VOE,Filemoon,Streamtape,Vidmoly,Doodstream,LoadX,Luluvdo,Vidoza # Default: 12 PROVIDER_REDIRECT_TIMEOUT_SECONDS=12 +# What: Hard timeout in seconds for one video host direct-link resolution +# attempt before AniBridge abandons that host and tries the next fallback. 
+# Default: 15 +PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS=15 + # What: Extra retry attempts when provider redirect resolution times out or # hits transient network errors # Default: 2 diff --git a/apps/api/app/config.py b/apps/api/app/config.py index 3ef40270..693410b3 100644 --- a/apps/api/app/config.py +++ b/apps/api/app/config.py @@ -517,6 +517,19 @@ def _normalize_video_host_name(name: str) -> str | None: PROVIDER_REDIRECT_TIMEOUT_SECONDS = 1 logger.debug("PROVIDER_REDIRECT_TIMEOUT_SECONDS={}", PROVIDER_REDIRECT_TIMEOUT_SECONDS) +try: + PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS = float( + os.getenv("PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", "15") + ) +except ValueError: + PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS = 15.0 +if PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS <= 0: + PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS = 15.0 +logger.debug( + "PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS={}", + PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS, +) + PROVIDER_REDIRECT_RETRIES = _as_non_negative_int( os.getenv("PROVIDER_REDIRECT_RETRIES"), 2 ) diff --git a/apps/api/app/core/downloader/provider_resolution.py b/apps/api/app/core/downloader/provider_resolution.py index 74185941..169cfd35 100644 --- a/apps/api/app/core/downloader/provider_resolution.py +++ b/apps/api/app/core/downloader/provider_resolution.py @@ -1,11 +1,13 @@ from __future__ import annotations import re +from queue import Queue +from threading import Thread from typing import List, Optional, Tuple, TYPE_CHECKING from loguru import logger -from app.config import PROVIDER_ORDER +from app.config import PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS, PROVIDER_ORDER from .errors import DownloadError, LanguageUnavailableError from .language import normalize_language @@ -15,6 +17,40 @@ _AVAIL_RE = re.compile(r"Available languages:\s*\[([^\]]*)\]", re.IGNORECASE) +def _run_with_timeout( + callback, + *args, + timeout_seconds: float, + operation: str, +): + result_queue: Queue[tuple[str, object]] = Queue(maxsize=1) + + def _target() -> None: + try: + 
result_queue.put(("result", callback(*args))) + except Exception as exc: + result_queue.put(("error", exc)) + + thread = Thread( + target=_target, + name=f"provider-resolution-{operation}", + daemon=True, + ) + thread.start() + thread.join(timeout_seconds) + if thread.is_alive(): + raise TimeoutError( + f"{operation} timed out after {timeout_seconds:.1f}s" + ) + + outcome, payload = result_queue.get() + if outcome == "error": + if isinstance(payload, BaseException): + raise payload + raise RuntimeError(f"{operation} failed with non-exception payload: {payload!r}") + return payload + + def _parse_available_languages_from_error(msg: str) -> List[str]: """ Extracts a list of available language names from a provider error message. @@ -60,7 +96,13 @@ def _try_get_direct(ep: Episode, provider_name: str, language: str) -> Optional[ language = normalize_language(language) logger.info("Trying provider '{}' for language '{}'", provider_name, language) try: - url = ep.get_direct_link(provider_name, language) # Lib-API + url = _run_with_timeout( + ep.get_direct_link, + provider_name, + language, + timeout_seconds=PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS, + operation=f"{provider_name.lower()}-direct-link", + ) if url: logger.success( "Found direct URL from provider '{}': {}", provider_name, url diff --git a/apps/api/tests/unit/app/test_config.py b/apps/api/tests/unit/app/test_config.py index 07916cdc..40313e3d 100644 --- a/apps/api/tests/unit/app/test_config.py +++ b/apps/api/tests/unit/app/test_config.py @@ -99,6 +99,7 @@ def test_provider_redirect_settings(monkeypatch): import sys monkeypatch.setenv("PROVIDER_REDIRECT_TIMEOUT_SECONDS", "15") + monkeypatch.setenv("PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", "9.5") monkeypatch.setenv("PROVIDER_REDIRECT_RETRIES", "4") monkeypatch.setenv("PROVIDER_CHALLENGE_BACKOFF_SECONDS", "120") @@ -110,6 +111,7 @@ def test_provider_redirect_settings(monkeypatch): cfg = importlib.reload(cfg) assert cfg.PROVIDER_REDIRECT_TIMEOUT_SECONDS == 15 + assert 
cfg.PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS == 9.5 assert cfg.PROVIDER_REDIRECT_RETRIES == 4 assert cfg.PROVIDER_CHALLENGE_BACKOFF_SECONDS == 120 diff --git a/apps/api/tests/unit/core/downloader/test_provider_resolution.py b/apps/api/tests/unit/core/downloader/test_provider_resolution.py new file mode 100644 index 00000000..6b9ece65 --- /dev/null +++ b/apps/api/tests/unit/core/downloader/test_provider_resolution.py @@ -0,0 +1,56 @@ +import time + +import pytest + + +def test_try_get_direct_times_out_and_returns_none(monkeypatch): + import app.core.downloader.provider_resolution as provider_resolution + + class SlowEpisode: + def get_direct_link(self, provider_name: str, language: str) -> str: + time.sleep(0.2) + return f"{provider_name}:{language}" + + monkeypatch.setattr( + provider_resolution, + "PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", + 0.05, + ) + + started_at = time.monotonic() + result = provider_resolution._try_get_direct( + SlowEpisode(), + "VOE", + "German Dub", + ) + + assert result is None + assert time.monotonic() - started_at < 0.15 + + +def test_try_get_direct_raises_for_missing_language(monkeypatch): + import app.core.downloader.provider_resolution as provider_resolution + from app.core.downloader.errors import LanguageUnavailableError + + class MissingLanguageEpisode: + def get_direct_link(self, provider_name: str, language: str) -> str: + del provider_name, language + raise ValueError( + "No provider found for language 'German Dub'. 
" + "Available languages: ['English Sub', 'German Sub']" + ) + + monkeypatch.setattr( + provider_resolution, + "PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", + 0.05, + ) + + with pytest.raises(LanguageUnavailableError) as exc_info: + provider_resolution._try_get_direct( + MissingLanguageEpisode(), + "VOE", + "German Dub", + ) + + assert exc_info.value.available == ["English Sub", "German Sub"] From 06fdbd6a6de81cc0fe89b959098db972bd67a061 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 00:00:59 +0200 Subject: [PATCH 27/45] perf(scheduler): coalesce download progress writes to sqlite Move yt-dlp progress persistence out of the callback hot path and flush only the latest per-job snapshot from a single writer. This removes concurrent progress-hook writes for the same job, reduces SQLite write pressure, and prevents repeated database locked errors during active downloads. --- apps/api/.env.example | 5 + apps/api/app/config.py | 8 + apps/api/app/core/scheduler.py | 197 +++++++++++++----- apps/api/tests/unit/app/test_config.py | 2 + .../core/scheduler/test_strm_scheduler.py | 113 ++++++++++ 5 files changed, 268 insertions(+), 57 deletions(-) diff --git a/apps/api/.env.example b/apps/api/.env.example index 62fc5c5f..915a4e63 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -177,6 +177,11 @@ PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS=15 # Default: 2 PROVIDER_REDIRECT_RETRIES=2 +# What: How often the background scheduler flushes coalesced download +# progress updates to SQLite while yt-dlp is running. +# Default: 0.5 +JOB_PROGRESS_FLUSH_SECONDS=0.5 + # What: Base cool-down in seconds before retrying a Serienstream redirect that # returned a Turnstile/captcha page. Retries back off linearly from this value. 
# Default: 300 diff --git a/apps/api/app/config.py b/apps/api/app/config.py index 693410b3..7848c63d 100644 --- a/apps/api/app/config.py +++ b/apps/api/app/config.py @@ -535,6 +535,14 @@ def _normalize_video_host_name(name: str) -> str | None: ) logger.debug("PROVIDER_REDIRECT_RETRIES={}", PROVIDER_REDIRECT_RETRIES) +JOB_PROGRESS_FLUSH_SECONDS = _as_non_negative_float( + os.getenv("JOB_PROGRESS_FLUSH_SECONDS"), + 0.5, +) +if JOB_PROGRESS_FLUSH_SECONDS <= 0: + JOB_PROGRESS_FLUSH_SECONDS = 0.5 +logger.debug("JOB_PROGRESS_FLUSH_SECONDS={}", JOB_PROGRESS_FLUSH_SECONDS) + PROVIDER_CHALLENGE_BACKOFF_SECONDS = _as_non_negative_int( os.getenv("PROVIDER_CHALLENGE_BACKOFF_SECONDS"), 300 ) diff --git a/apps/api/app/core/scheduler.py b/apps/api/app/core/scheduler.py index 999eb775..1d5a6779 100644 --- a/apps/api/app/core/scheduler.py +++ b/apps/api/app/core/scheduler.py @@ -1,12 +1,18 @@ from __future__ import annotations import threading from concurrent.futures import ThreadPoolExecutor, Future -from typing import Dict, Tuple, Optional +from dataclasses import dataclass +from typing import Any, Dict, Optional, Tuple from loguru import logger from sqlmodel import Session import errno -from app.config import MAX_CONCURRENCY, DOWNLOAD_DIR, STRM_PROXY_MODE +from app.config import ( + DOWNLOAD_DIR, + JOB_PROGRESS_FLUSH_SECONDS, + MAX_CONCURRENCY, + STRM_PROXY_MODE, +) from app.utils.strm import allocate_unique_strm_path, build_strm_content from app.core.strm_proxy import StrmIdentity, resolve_direct_url, build_stream_url from app.utils.terminal import ( @@ -27,6 +33,76 @@ RUNNING_LOCK = threading.Lock() +@dataclass(slots=True) +class JobProgressSnapshot: + downloaded_bytes: int + total_bytes: int | None + speed: float | None + eta: int | None + progress: float + + +class JobProgressWriter: + def __init__(self, job_id: str, flush_interval_seconds: float) -> None: + self._job_id = job_id + self._flush_interval_seconds = flush_interval_seconds + self._lock = threading.Lock() + 
self._wake_event = threading.Event() + self._stop_event = threading.Event() + self._pending: JobProgressSnapshot | None = None + self._thread = threading.Thread( + target=self._run, + name=f"job-progress-writer-{job_id}", + daemon=True, + ) + + def start(self) -> None: + self._thread.start() + + def publish(self, snapshot: JobProgressSnapshot) -> None: + with self._lock: + self._pending = snapshot + self._wake_event.set() + + def close(self, *, flush: bool) -> None: + self._stop_event.set() + if flush: + self._wake_event.set() + self._thread.join(timeout=5) + + def _drain_pending(self) -> JobProgressSnapshot | None: + with self._lock: + snapshot = self._pending + self._pending = None + return snapshot + + def _flush_snapshot(self, snapshot: JobProgressSnapshot) -> None: + with Session(engine) as session: + update_job( + session, + self._job_id, + status="downloading", + downloaded_bytes=snapshot.downloaded_bytes, + total_bytes=snapshot.total_bytes, + speed=snapshot.speed, + eta=snapshot.eta, + progress=snapshot.progress, + ) + + def _run(self) -> None: + while True: + self._wake_event.wait(self._flush_interval_seconds) + self._wake_event.clear() + snapshot = self._drain_pending() + if snapshot is not None: + self._flush_snapshot(snapshot) + if self._stop_event.is_set(): + final_snapshot = self._drain_pending() + if final_snapshot is not None: + self._flush_snapshot(final_snapshot) + return + + def init_executor() -> None: global EXECUTOR if EXECUTOR is None: @@ -49,70 +125,73 @@ def shutdown_executor() -> None: def _progress_updater(job_id: str, stop_event: threading.Event): - from sqlmodel import Session - from app.db import engine, update_job - reporter: ProgressReporter | None = None last_db_n = -1 + callback_lock = threading.Lock() + writer = JobProgressWriter( + job_id=job_id, + flush_interval_seconds=JOB_PROGRESS_FLUSH_SECONDS, + ) + writer.start() def _cb(d: dict): nonlocal reporter, last_db_n - if stop_event.is_set(): - if reporter: - reporter.close() - 
raise Exception("Cancelled") - - status = d.get("status") - downloaded = int(d.get("downloaded_bytes") or 0) - total = d.get("total_bytes") or d.get("total_bytes_estimate") - speed = d.get("speed") - eta = d.get("eta") - - # Initialize reporter lazily (label contains job id) - if reporter is None: - reporter = ProgressReporter(label=f"Job {job_id}") - - # Render progress to terminal (TTY bar or stepped logs) - reporter.update( - ProgressSnapshot( - downloaded=downloaded, - total=int(total) if total else None, - speed=float(speed) if speed else None, - eta=int(eta) if eta else None, - status=str(status) if status else None, + with callback_lock: + if stop_event.is_set(): + if reporter: + reporter.close() + raise Exception("Cancelled") + + status = d.get("status") + downloaded = int(d.get("downloaded_bytes") or 0) + total = d.get("total_bytes") or d.get("total_bytes_estimate") + speed = d.get("speed") + eta = d.get("eta") + total_i = int(total) if total else None + speed_f = float(speed) if speed else None + eta_i = int(eta) if eta else None + + if reporter is None: + reporter = ProgressReporter(label=f"Job {job_id}") + + reporter.update( + ProgressSnapshot( + downloaded=downloaded, + total=total_i, + speed=speed_f, + eta=eta_i, + status=str(status) if status else None, + ) ) - ) - # Throttle DB writes to ~1% steps (or on finish) - progress = 0.0 - should_write = True - if total: - try: - total_i = int(total) - step = max(1, total_i // 100) - should_write = downloaded == total_i or downloaded // step != last_db_n - last_db_n = downloaded // step - progress = max(0.0, min(100.0, downloaded / total_i * 100.0)) - except Exception: - should_write = True - - if should_write: - with Session(engine) as s: - update_job( - s, - job_id, - status="downloading" if status != "finished" else "downloading", - downloaded_bytes=downloaded, - total_bytes=int(total) if total else None, - speed=float(speed) if speed else None, - eta=int(eta) if eta else None, - progress=progress, + 
progress = 0.0 + should_write = True + if total_i: + try: + step = max(1, total_i // 100) + should_write = ( + downloaded == total_i or downloaded // step != last_db_n + ) + last_db_n = downloaded // step + progress = max(0.0, min(100.0, downloaded / total_i * 100.0)) + except Exception: + should_write = True + + if should_write: + writer.publish( + JobProgressSnapshot( + downloaded_bytes=downloaded, + total_bytes=total_i, + speed=speed_f, + eta=eta_i, + progress=progress, + ) ) - if status == "finished" and reporter is not None: - reporter.close() + if status == "finished" and reporter is not None: + reporter.close() - return _cb + return _cb, writer def _run_download(job_id: str, req: dict, stop_event: threading.Event): @@ -136,6 +215,7 @@ def _run_download(job_id: str, req: dict, stop_event: threading.Event): - 'site' (optional, defaults to "aniworld.to") stop_event (threading.Event): Event that, when set, requests cancellation of the download. """ + progress_cb, progress_writer = _progress_updater(job_id, stop_event) try: with Session(engine) as s: site = req.get("site", "aniworld.to") @@ -152,16 +232,18 @@ def _run_download(job_id: str, req: dict, stop_event: threading.Event): ), dest_dir=DOWNLOAD_DIR, title_hint=req.get("title_hint"), - progress_cb=_progress_updater(job_id, stop_event), + progress_cb=progress_cb, stop_event=stop_event, site=req.get("site", "aniworld.to"), ) + progress_writer.close(flush=True) with Session(engine) as s: update_job( s, job_id, status="completed", progress=100.0, result_path=str(dest) ) except OSError as e: + progress_writer.close(flush=False) with Session(engine) as s: if e.errno in (errno.EACCES, errno.EROFS): update_job( @@ -173,6 +255,7 @@ def _run_download(job_id: str, req: dict, stop_event: threading.Event): else: update_job(s, job_id, status="failed", message=str(e)) except Exception as e: + progress_writer.close(flush=False) msg = str(e) status = "failed" if "Cancel" in msg or "cancel" in msg: diff --git 
a/apps/api/tests/unit/app/test_config.py b/apps/api/tests/unit/app/test_config.py index 40313e3d..651c5c0a 100644 --- a/apps/api/tests/unit/app/test_config.py +++ b/apps/api/tests/unit/app/test_config.py @@ -101,6 +101,7 @@ def test_provider_redirect_settings(monkeypatch): monkeypatch.setenv("PROVIDER_REDIRECT_TIMEOUT_SECONDS", "15") monkeypatch.setenv("PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", "9.5") monkeypatch.setenv("PROVIDER_REDIRECT_RETRIES", "4") + monkeypatch.setenv("JOB_PROGRESS_FLUSH_SECONDS", "0.25") monkeypatch.setenv("PROVIDER_CHALLENGE_BACKOFF_SECONDS", "120") if "app.config" in sys.modules: @@ -113,6 +114,7 @@ def test_provider_redirect_settings(monkeypatch): assert cfg.PROVIDER_REDIRECT_TIMEOUT_SECONDS == 15 assert cfg.PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS == 9.5 assert cfg.PROVIDER_REDIRECT_RETRIES == 4 + assert cfg.JOB_PROGRESS_FLUSH_SECONDS == 0.25 assert cfg.PROVIDER_CHALLENGE_BACKOFF_SECONDS == 120 monkeypatch.setenv("DOWNLOAD_RATE_LIMIT_BYTES_PER_SEC", "not-a-number") diff --git a/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py b/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py index 263e9312..6c4890ed 100644 --- a/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py +++ b/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py @@ -1,5 +1,6 @@ import errno import threading +import time from pathlib import Path from sqlmodel import Session @@ -212,3 +213,115 @@ def test_run_strm_creates_proxy_url(tmp_path, monkeypatch): assert mapping is not None assert mapping.resolved_url == "https://example.com/video.mp4" assert mapping.provider_used == "VOE" + + +def test_progress_updater_coalesces_bursty_db_writes(tmp_path, monkeypatch): + scheduler = _setup_scheduler(tmp_path, monkeypatch, strm_proxy_mode="direct") + monkeypatch.setattr(scheduler, "JOB_PROGRESS_FLUSH_SECONDS", 60.0) + + writes: list[dict[str, object]] = [] + + class FakeSession: + def __enter__(self): + return object() + + def __exit__(self, exc_type, exc, tb): + 
return False + + class FakeReporter: + def __init__(self, label: str): + self.label = label + + def update(self, snapshot): + del snapshot + + def close(self): + return None + + monkeypatch.setattr(scheduler, "Session", lambda _engine: FakeSession()) + monkeypatch.setattr(scheduler, "ProgressReporter", FakeReporter) + monkeypatch.setattr( + scheduler, + "update_job", + lambda _session, job_id, **fields: writes.append( + {"job_id": job_id, **fields} + ), + ) + + callback, writer = scheduler._progress_updater("job-1", threading.Event()) + callback( + { + "status": "downloading", + "downloaded_bytes": 1024, + "total_bytes": 10_000, + "speed": 1000, + } + ) + callback( + { + "status": "downloading", + "downloaded_bytes": 2048, + "total_bytes": 10_000, + "speed": 2000, + "eta": 5, + } + ) + writer.close(flush=True) + + assert len(writes) == 1 + assert writes[0]["job_id"] == "job-1" + assert writes[0]["downloaded_bytes"] == 2048 + assert writes[0]["total_bytes"] == 10_000 + assert writes[0]["speed"] == 2000.0 + assert writes[0]["eta"] == 5 + + +def test_progress_updater_flushes_without_final_close(tmp_path, monkeypatch): + scheduler = _setup_scheduler(tmp_path, monkeypatch, strm_proxy_mode="direct") + monkeypatch.setattr(scheduler, "JOB_PROGRESS_FLUSH_SECONDS", 0.01) + + writes: list[dict[str, object]] = [] + + class FakeSession: + def __enter__(self): + return object() + + def __exit__(self, exc_type, exc, tb): + return False + + class FakeReporter: + def __init__(self, label: str): + self.label = label + + def update(self, snapshot): + del snapshot + + def close(self): + return None + + monkeypatch.setattr(scheduler, "Session", lambda _engine: FakeSession()) + monkeypatch.setattr(scheduler, "ProgressReporter", FakeReporter) + monkeypatch.setattr( + scheduler, + "update_job", + lambda _session, job_id, **fields: writes.append( + {"job_id": job_id, **fields} + ), + ) + + callback, writer = scheduler._progress_updater("job-2", threading.Event()) + callback( + { + 
"status": "downloading", + "downloaded_bytes": 5000, + "total_bytes": 10_000, + "speed": 4000, + "eta": 1, + } + ) + time.sleep(0.05) + writer.close(flush=False) + + assert writes + assert writes[-1]["job_id"] == "job-2" + assert writes[-1]["downloaded_bytes"] == 5000 From 7dc4021b17c04038141bb674b8ebc607279918f6 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 00:14:44 +0200 Subject: [PATCH 28/45] fix(downloader): improve logging for download host resolution failures Enhance logging messages to provide clearer context during download host resolution and retry attempts. This includes details about preferred and resolved providers, as well as error messages for failed downloads. --- apps/api/app/core/downloader/download.py | 24 ++++++++++++++++++++---- apps/api/app/utils/logger.py | 3 +++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/apps/api/app/core/downloader/download.py b/apps/api/app/core/downloader/download.py index f7b11970..38a1d566 100644 --- a/apps/api/app/core/downloader/download.py +++ b/apps/api/app/core/downloader/download.py @@ -172,7 +172,12 @@ def _attempt_download( direct, chosen = get_direct_url_with_fallback( ep, preferred=provider, language=language ) - logger.info("Chosen provider: {}, direct URL: {}", chosen, direct) + logger.info( + "Resolved initial download host: preferred={} resolved={} direct_url={}", + provider, + chosen, + direct, + ) base_hint = title_hint if not base_hint and slug and season is not None and episode is not None: @@ -193,7 +198,11 @@ def _attempt_download( ) except Exception as exc: msg = str(exc) - logger.warning("Primary download failed: {}", msg) + logger.warning( + "Download failed after resolving host {}: {}", + chosen, + msg, + ) tried_alt = False providers_left = [ @@ -206,7 +215,13 @@ def _attempt_download( direct3, chosen3 = get_direct_url_with_fallback( ep, preferred=provider_name, language=language ) - logger.info("Retrying download via alternate provider {}", chosen3) + 
logger.info( + "Retrying download after {} failed: next_preferred={} resolved={} direct_url={}", + chosen, + provider_name, + chosen3, + direct3, + ) temp_path, info = _ydl_download( direct3, dest_dir, @@ -219,7 +234,8 @@ def _attempt_download( break except Exception as exc3: logger.warning( - "Alternate provider {} failed to download: {}", + "Retry attempt failed: next_preferred={} resolved_or_attempted={} error={}", + provider_name, provider_name, exc3, ) diff --git a/apps/api/app/utils/logger.py b/apps/api/app/utils/logger.py index ceeee3ec..14884d58 100644 --- a/apps/api/app/utils/logger.py +++ b/apps/api/app/utils/logger.py @@ -64,6 +64,9 @@ def emit(self, record: logging.LogRecord) -> None: handlers=[intercept_handler], level=stdlib_level, force=True ) logging.captureWarnings(True) + # TODO: intercept this: + # INFO [alembic.runtime.migration] Will assume non-transactional DDL. + # INFO [alembic.runtime.migration] Context impl SQLiteImpl. logging.lastResort = None for name in ( "uvicorn", From 9b4dffe922bd2f02cf76486446561582d6362c0c Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 00:42:47 +0200 Subject: [PATCH 29/45] fix(indexer): improve error handling for title indexing failures Enhance error reporting by capturing exception messages during title indexing failures. This change ensures that the error details are passed correctly to the failure handling functions, improving debugging and logging capabilities. 
--- apps/api/app/catalog/indexer.py | 10 ++++++---- apps/api/app/db/__init__.pyi | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index d24f5e0b..543e2e36 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -684,14 +684,15 @@ def on_index_loaded(total_titles: int) -> None: queue.put(_QUEUE_SENTINEL) writer.join(timeout=5) completed_at = utcnow() + error_text = str(exc) self._writer.run( - lambda session: self._finish_title_index_failure( + lambda session, error=error_text: self._finish_title_index_failure( session, provider=provider, generation=generation, refresh_interval_hours=refresh_interval_hours, completed_at=completed_at, - error=str(exc), + error=error, ) ) self._set_progress( @@ -1014,12 +1015,13 @@ def _run_row_stage(self, *, provider: str, stage: str, concurrency: int) -> None ) except Exception as exc: failure_count += 1 + error_text = str(exc) self._persist_stage_failure( provider=provider, stage=stage, title_row=title_row, state=state, - error=str(exc), + error=error_text, ) self._advance_failed_progress( provider, @@ -1031,7 +1033,7 @@ def _run_row_stage(self, *, provider: str, stage: str, concurrency: int) -> None remaining.cancel() completed_at = utcnow() self._writer.run( - lambda session, error=str(exc): ( + lambda session, error=error_text: ( self._mark_stage_failed( session, provider=provider, diff --git a/apps/api/app/db/__init__.pyi b/apps/api/app/db/__init__.pyi index 7d9683b6..61cf7267 100644 --- a/apps/api/app/db/__init__.pyi +++ b/apps/api/app/db/__init__.pyi @@ -3,4 +3,4 @@ Python is fine with `globals().update(...)`; Pylance needs a little snack. 
""" -from .models import * +from .models import * # noqa: F403 From bdbcff4b762fd169ea4924ad4db5fd2e301c4448 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 00:43:31 +0200 Subject: [PATCH 30/45] style: run ruff format --- apps/api/app/catalog/indexer.py | 8 +++++--- apps/api/app/core/downloader/provider_resolution.py | 8 ++++---- apps/api/app/core/scheduler.py | 2 +- apps/api/app/providers/megakino/sitemap.py | 3 +-- apps/api/app/utils/domain_resolver.py | 4 +++- .../tests/integration/api/qbittorrent/test_torrents.py | 8 ++++---- apps/api/tests/unit/core/scheduler/test_strm_scheduler.py | 8 ++------ .../tests/unit/providers/megakino/test_domain_resolver.py | 8 ++++++-- 8 files changed, 26 insertions(+), 23 deletions(-) diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index 543e2e36..cc9fe42e 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -1674,7 +1674,9 @@ def _log_bootstrap_state(self) -> None: status.detail_enrichment_status, status.canonical_enrichment_status, status.latest_success_generation, - status.next_refresh_after.isoformat() - if status.next_refresh_after is not None - else None, + ( + status.next_refresh_after.isoformat() + if status.next_refresh_after is not None + else None + ), ) diff --git a/apps/api/app/core/downloader/provider_resolution.py b/apps/api/app/core/downloader/provider_resolution.py index 169cfd35..cf4a8c90 100644 --- a/apps/api/app/core/downloader/provider_resolution.py +++ b/apps/api/app/core/downloader/provider_resolution.py @@ -39,15 +39,15 @@ def _target() -> None: thread.start() thread.join(timeout_seconds) if thread.is_alive(): - raise TimeoutError( - f"{operation} timed out after {timeout_seconds:.1f}s" - ) + raise TimeoutError(f"{operation} timed out after {timeout_seconds:.1f}s") outcome, payload = result_queue.get() if outcome == "error": if isinstance(payload, BaseException): raise payload - raise RuntimeError(f"{operation} failed with 
non-exception payload: {payload!r}") + raise RuntimeError( + f"{operation} failed with non-exception payload: {payload!r}" + ) return payload diff --git a/apps/api/app/core/scheduler.py b/apps/api/app/core/scheduler.py index 1d5a6779..ad2e5c23 100644 --- a/apps/api/app/core/scheduler.py +++ b/apps/api/app/core/scheduler.py @@ -2,7 +2,7 @@ import threading from concurrent.futures import ThreadPoolExecutor, Future from dataclasses import dataclass -from typing import Any, Dict, Optional, Tuple +from typing import Dict, Optional, Tuple from loguru import logger from sqlmodel import Session import errno diff --git a/apps/api/app/providers/megakino/sitemap.py b/apps/api/app/providers/megakino/sitemap.py index 75d3faea..0db6b15d 100644 --- a/apps/api/app/providers/megakino/sitemap.py +++ b/apps/api/app/providers/megakino/sitemap.py @@ -11,8 +11,7 @@ from loguru import logger MEGAKINO_SITEMAP_USER_AGENT = ( - "Mozilla/5.0 (AniBridge Megakino Indexer; " - "+https://github.com/Zzackllack/AniBridge)" + "Mozilla/5.0 (AniBridge Megakino Indexer; +https://github.com/Zzackllack/AniBridge)" ) diff --git a/apps/api/app/utils/domain_resolver.py b/apps/api/app/utils/domain_resolver.py index 6337de45..4084c3da 100644 --- a/apps/api/app/utils/domain_resolver.py +++ b/apps/api/app/utils/domain_resolver.py @@ -472,7 +472,9 @@ def fetch_megakino_domain( normalized_candidates.append(domain) if not normalized_candidates: - logger.warning("Megakino domain resolution failed; no candidate seeds available.") + logger.warning( + "Megakino domain resolution failed; no candidate seeds available." 
+ ) return None logger.info( diff --git a/apps/api/tests/integration/api/qbittorrent/test_torrents.py b/apps/api/tests/integration/api/qbittorrent/test_torrents.py index d54720b7..dfe6ea58 100644 --- a/apps/api/tests/integration/api/qbittorrent/test_torrents.py +++ b/apps/api/tests/integration/api/qbittorrent/test_torrents.py @@ -82,10 +82,10 @@ def test_torrents_add_starts_worker_after_task_write(client, monkeypatch): monkeypatch.setattr( qb_torrents, "schedule_download", - lambda req, autostart=True: calls.append( - ("schedule", "autostart" if autostart else "deferred") - ) - or "job-1", + lambda req, autostart=True: ( + calls.append(("schedule", "autostart" if autostart else "deferred")) + or "job-1" + ), ) monkeypatch.setattr( qb_torrents, diff --git a/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py b/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py index 6c4890ed..3db86426 100644 --- a/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py +++ b/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py @@ -243,9 +243,7 @@ def close(self): monkeypatch.setattr( scheduler, "update_job", - lambda _session, job_id, **fields: writes.append( - {"job_id": job_id, **fields} - ), + lambda _session, job_id, **fields: writes.append({"job_id": job_id, **fields}), ) callback, writer = scheduler._progress_updater("job-1", threading.Event()) @@ -304,9 +302,7 @@ def close(self): monkeypatch.setattr( scheduler, "update_job", - lambda _session, job_id, **fields: writes.append( - {"job_id": job_id, **fields} - ), + lambda _session, job_id, **fields: writes.append({"job_id": job_id, **fields}), ) callback, writer = scheduler._progress_updater("job-2", threading.Event()) diff --git a/apps/api/tests/unit/providers/megakino/test_domain_resolver.py b/apps/api/tests/unit/providers/megakino/test_domain_resolver.py index 7e1933b5..4026f5d2 100644 --- a/apps/api/tests/unit/providers/megakino/test_domain_resolver.py +++ 
b/apps/api/tests/unit/providers/megakino/test_domain_resolver.py @@ -96,7 +96,9 @@ def fake_get(url, *, timeout=0, allow_redirects=True, headers=None): def test_fetch_megakino_domain_prefers_seed_order(monkeypatch): probed: list[str] = [] - monkeypatch.setattr(domain_resolver, "MEGAKINO_REDIRECT_SEEDS", ["first.example", "second.example"]) + monkeypatch.setattr( + domain_resolver, "MEGAKINO_REDIRECT_SEEDS", ["first.example", "second.example"] + ) monkeypatch.setattr( domain_resolver, "_fetch_github_domain_hint", @@ -119,7 +121,9 @@ def fake_probe(base_url: str, timeout=0): def test_fetch_megakino_domain_returns_none_when_all_candidates_fail(monkeypatch): - monkeypatch.setattr(domain_resolver, "MEGAKINO_REDIRECT_SEEDS", ["first.example", "second.example"]) + monkeypatch.setattr( + domain_resolver, "MEGAKINO_REDIRECT_SEEDS", ["first.example", "second.example"] + ) monkeypatch.setattr( domain_resolver, "_fetch_github_domain_hint", From a888fa11b1705ba9f43f99c024d251d093984f8d Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 01:03:01 +0200 Subject: [PATCH 31/45] feat(title-resolver): enhance title resolution with in-memory index lookup Add functionality to resolve titles from an in-memory index before querying the database. This improves performance by prioritizing faster lookups and reduces reliance on database access when the catalog is ready. 
--- apps/api/app/utils/title_resolver.py | 90 ++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 25 deletions(-) diff --git a/apps/api/app/utils/title_resolver.py b/apps/api/app/utils/title_resolver.py index b69f5b12..1867d166 100644 --- a/apps/api/app/utils/title_resolver.py +++ b/apps/api/app/utils/title_resolver.py @@ -503,6 +503,15 @@ def resolve_series_title( if title: logger.info(f"Resolved title for slug '{slug}' on {site}: {title}") return title + # Attempt to resolve from the in-memory/indexed alphabet sources first + index = load_or_refresh_index(site) + title = index.get(slug) + if title: + logger.info(f"Resolved title for slug '{slug}' on {site}: {title}") + return title + + # No title found in DB or in-memory index. If the provider catalog is + # already considered ready/bootstrapped, treat this as a definitive miss. if get_catalog_readiness_error() is None: logger.warning( "No indexed title found for slug '{}' on {} after catalog bootstrap.", @@ -510,13 +519,10 @@ def resolve_series_title( site, ) return None - index = load_or_refresh_index(site) - title = index.get(slug) - if title: - logger.info(f"Resolved title for slug '{slug}' on {site}: {title}") - else: - logger.warning(f"No title found for slug: {slug} on {site}") - return title + + # Catalog is still bootstrapping and no in-memory title found. 
+ logger.warning(f"No title found for slug: {slug} on {site}") + return None def load_or_refresh_alternatives(site: str = "aniworld.to") -> Dict[str, List[str]]: @@ -720,23 +726,10 @@ def slug_from_query(q: str, site: Optional[str] = None) -> Optional[Tuple[str, s """ if not q: return None - readiness_error = get_catalog_readiness_error() - if readiness_error is None: - providers = [site] if site else list(CATALOG_SITES_LIST) - preferred = [provider for provider in providers if provider != "megakino"] - fallback = [provider for provider in providers if provider == "megakino"] - with Session(engine) as session: - for batch in (preferred, fallback): - if not batch: - continue - rows = search_indexed_provider_titles( - session, - query=q, - providers=batch, - limit=1, - ) - if rows: - return (rows[0].provider, rows[0].slug) + # Prefer searching the in-memory alphabet indexes first (fast, deterministic + # for unit tests and for recent file-based index loads). If no indexed + # match is found, fall back to the DB-backed provider index search when + # the catalog reports readiness. def _search_sites(sites: List[str]) -> Optional[Tuple[str, str]]: """ @@ -800,14 +793,61 @@ def _search_sites(sites: List[str]) -> Optional[Tuple[str, str]]: return ("s.to", api_slug) return None + # 1) If a specific site was requested, only consult that site's in-memory + # index (and then its DB-backed index) — do NOT fall back to other sites. 
if site: - return _search_sites([site]) + result = _search_sites([site]) + if result: + return result + readiness_error = get_catalog_readiness_error() + if readiness_error is None: + with Session(engine) as session: + rows = search_indexed_provider_titles( + session, + query=q, + providers=[site], + limit=1, + ) + if rows: + candidate = rows[0] + cand_score = _score_title_candidate( + _match_tokens(q), _normalize_alnum(q), candidate.title + ) + if cand_score >= _MIN_TITLE_MATCH_SCORE: + return (candidate.provider, candidate.slug) + return None + # 2) No specific site requested: try index-based lookup across primary sites primary_sites = [s for s in CATALOG_SITES_LIST if s != "megakino"] result = _search_sites(primary_sites) if result: return result + # 3) If no index match, and the catalog is ready, try the DB-backed search + readiness_error = get_catalog_readiness_error() + if readiness_error is None: + providers = list(CATALOG_SITES_LIST) + preferred = [provider for provider in providers if provider != "megakino"] + fallback = [provider for provider in providers if provider == "megakino"] + with Session(engine) as session: + for batch in (preferred, fallback): + if not batch: + continue + rows = search_indexed_provider_titles( + session, + query=q, + providers=batch, + limit=1, + ) + if rows: + candidate = rows[0] + cand_score = _score_title_candidate( + _match_tokens(q), _normalize_alnum(q), candidate.title + ) + if cand_score >= _MIN_TITLE_MATCH_SCORE: + return (candidate.provider, candidate.slug) + + # 3) Megakino-specific direct slug/fallback handling if "megakino" in CATALOG_SITES_LIST or "megakino" in _PROVIDER_CACHE: raw = (q or "").strip() direct_slug = _extract_slug(raw, "megakino") From a7bb3e4e5669e52ea77dc9f6ab9e7df103f3c080 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 22:30:02 +0200 Subject: [PATCH 32/45] fix: harden indexed catalog, torznab, and downloader edge cases bypass catalog readiness for empty torznab search test 
responses honor STRM_FILES_MODE=only in indexed torznab item emission fall back to default provider languages when episode language rows are absent bound provider timeout workers and add cooperative crawl cancellation redact resolved direct URLs from download logs only mark qBittorrent tasks downloading after worker startup succeeds prevent non-flush scheduler shutdown from writing stale progress move health snapshot handlers off the event loop --- apps/api/app/api/health.py | 4 +- apps/api/app/api/qbittorrent/torrents.py | 24 +++- apps/api/app/api/torznab/api.py | 113 +++++++++--------- apps/api/app/catalog/indexer.py | 20 +++- apps/api/app/catalog/providers.py | 28 ++++- apps/api/app/config.py | 19 +-- apps/api/app/core/downloader/download.py | 4 +- .../core/downloader/provider_resolution.py | 40 ++----- apps/api/app/core/scheduler.py | 6 +- apps/api/app/utils/domain_resolver.py | 85 ++++++++----- apps/api/app/utils/terminal.py | 5 +- apps/api/app/utils/title_resolver.py | 29 ++++- 12 files changed, 233 insertions(+), 144 deletions(-) diff --git a/apps/api/app/api/health.py b/apps/api/app/api/health.py index 49f996bb..835f70f5 100644 --- a/apps/api/app/api/health.py +++ b/apps/api/app/api/health.py @@ -8,7 +8,7 @@ @router.get("/health") -async def healthcheck(): +def healthcheck(): return { "status": "ok", "catalog": get_catalog_indexer().get_progress_snapshot(), @@ -16,5 +16,5 @@ async def healthcheck(): @router.get("/health/catalog") -async def catalog_healthcheck(): +def catalog_healthcheck(): return get_catalog_indexer().get_progress_snapshot() diff --git a/apps/api/app/api/qbittorrent/torrents.py b/apps/api/app/api/qbittorrent/torrents.py index 7e3bf67f..306954a4 100644 --- a/apps/api/app/api/qbittorrent/torrents.py +++ b/apps/api/app/api/qbittorrent/torrents.py @@ -120,15 +120,33 @@ def torrents_add( save_path=published_savepath, category=category, job_id=job_id, - state="queued" if paused else "downloading", + state="queued", ) logger.success( "Torrent 
task upserted for hash={}, state={}, site={}".format( - btih, "queued" if paused else "downloading", site + btih, "queued", site ) ) if not paused: - start_scheduled_job(job_id, req) + try: + start_scheduled_job(job_id, req) + except Exception as exc: + logger.error("Failed to start scheduled job {}: {}", job_id, exc) + return PlainTextResponse("Failed to start download.", status_code=500) + upsert_client_task( + session, + hash=btih, + name=name, + slug=slug, + season=season, + episode=episode, + language=language, + site=site, + save_path=published_savepath, + category=category, + job_id=job_id, + state="downloading", + ) logger.debug(f"Started background worker for job_id: {job_id}") return PlainTextResponse("Ok.") diff --git a/apps/api/app/api/torznab/api.py b/apps/api/app/api/torznab/api.py index 06c9a19f..1805c8c6 100644 --- a/apps/api/app/api/torznab/api.py +++ b/apps/api/app/api/torznab/api.py @@ -437,27 +437,28 @@ def _indexed_preview_results( language=language, site=provider, ) - magnet = tn_module.build_magnet( - title=release_title, - slug=row.slug, - season=provider_season_i, - episode=provider_episode_i, - language=language, - provider=None, - site=provider, - ) - _build_item( - channel=channel, - title=release_title, - magnet=magnet, - pubdate=now, - cat_id=cat_id, - guid_str=f"{provider}:{row.slug}:{season_i}:{episode_i}:{language}", - language=language, - ) - count += 1 - if count >= max(1, limit): - return count + if STRM_FILES_MODE != "only": + magnet = tn_module.build_magnet( + title=release_title, + slug=row.slug, + season=provider_season_i, + episode=provider_episode_i, + language=language, + provider=None, + site=provider, + ) + _build_item( + channel=channel, + title=release_title, + magnet=magnet, + pubdate=now, + cat_id=cat_id, + guid_str=f"{provider}:{row.slug}:{season_i}:{episode_i}:{language}", + language=language, + ) + count += 1 + if count >= max(1, limit): + return count if STRM_FILES_MODE in ("only", "both"): magnet_strm = 
tn_module.build_magnet( title=release_title + strm_suffix, @@ -508,45 +509,49 @@ def _emit_indexed_mapped_episode( season=provider_season, episode=provider_episode, ) + language_values = [ + language_row.language for language_row in languages + ] or _default_languages_for_site(provider) emitted = 0 - for language_row in languages or []: + for language in language_values: release_title = tn_module.build_release_name( series_title=title, season=canonical_season, episode=canonical_episode, height=None, vcodec=None, - language=language_row.language, - site=provider, - ) - magnet = tn_module.build_magnet( - title=release_title, - slug=slug, - season=provider_season, - episode=provider_episode, - language=language_row.language, - provider=None, + language=language, site=provider, ) - _build_item( - channel=channel, - title=release_title, - magnet=magnet, - pubdate=now, - cat_id=cat_id, - guid_str=f"{provider}:{slug}:S{canonical_season}E{canonical_episode}:{language_row.language}", - language=language_row.language, - ) - emitted += 1 - if emitted >= max_items: - return emitted + if STRM_FILES_MODE != "only": + magnet = tn_module.build_magnet( + title=release_title, + slug=slug, + season=provider_season, + episode=provider_episode, + language=language, + provider=None, + site=provider, + ) + _build_item( + channel=channel, + title=release_title, + magnet=magnet, + pubdate=now, + cat_id=cat_id, + guid_str=f"{provider}:{slug}:S{canonical_season}E{canonical_episode}:{language}", + language=language, + ) + emitted += 1 + if emitted >= max_items: + return emitted if STRM_FILES_MODE in ("only", "both"): magnet_strm = tn_module.build_magnet( title=release_title + strm_suffix, slug=slug, season=provider_season, episode=provider_episode, - language=language_row.language, + language=language, provider=None, site=provider, mode="strm", @@ -557,8 +562,8 @@ def _emit_indexed_mapped_episode( magnet=magnet_strm, pubdate=now, cat_id=cat_id, - 
guid_str=f"{provider}:{slug}:S{canonical_season}E{canonical_episode}:{language_row.language}:strm", - language=language_row.language, + guid_str=f"{provider}:{slug}:S{canonical_season}E{canonical_episode}:{language}:strm", + language=language, ) emitted += 1 if emitted >= max_items: @@ -630,13 +635,6 @@ def torznab_api( if t == "search": import app.api.torznab as tn - try: - require_catalog_ready() - except CatalogNotReadyError as exc: - from fastapi import HTTPException - - raise HTTPException(status_code=503, detail=str(exc)) from exc - rss, channel = _rss_root() q_str = (q or "").strip() strm_suffix = " [STRM]" @@ -660,6 +658,13 @@ def torznab_api( if not q_str: return _rss_response(rss) + try: + require_catalog_ready() + except CatalogNotReadyError as exc: + from fastapi import HTTPException + + raise HTTPException(status_code=503, detail=str(exc)) from exc + if movie_preferred: count = _indexed_preview_results( tn_module=tn, diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index cc9fe42e..2366415c 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -658,6 +658,9 @@ def on_index_loaded(total_titles: int) -> None: ) queue.put(_QUEUE_SENTINEL) writer.join(timeout=30) + if writer.is_alive(): + detail = f": {writer_failure[0]}" if writer_failure else "" + raise RuntimeError(f"writer thread did not finish within 30s{detail}") if writer_failure: raise RuntimeError(str(writer_failure[0])) completed_at = utcnow() @@ -1277,11 +1280,12 @@ def _mark_stage_ready( ) -> None: with Session(engine) as read_session: status = get_provider_index_status(read_session, provider=provider) - overall_status = "ready" if is_provider_fully_ready(status) else "partial" + if status is not None: + status = status.model_copy(deep=True) payload = { "provider": provider, "refresh_interval_hours": refresh_interval_hours, - "status": overall_status, + "status": "partial", "active_stage": None, "latest_completed_at": completed_at, 
"last_error_summary": "", @@ -1291,12 +1295,20 @@ def _mark_stage_ready( payload["detail_enrichment_status"] = "ready" payload["detail_ready_at"] = completed_at payload["detail_next_retry_after"] = None + if status is not None: + status.detail_enrichment_status = "ready" + status.detail_ready_at = completed_at + status.detail_next_retry_after = None else: payload["canonical_enrichment_status"] = "ready" payload["canonical_ready_at"] = completed_at payload["canonical_next_retry_after"] = None - if status is not None and status.detail_enrichment_status == "ready": - payload["status"] = "ready" + if status is not None: + status.canonical_enrichment_status = "ready" + status.canonical_ready_at = completed_at + status.canonical_next_retry_after = None + if status is not None and is_provider_fully_ready(status): + payload["status"] = "ready" upsert_provider_index_status(session, **payload) def _mark_stage_failed( diff --git a/apps/api/app/catalog/providers.py b/apps/api/app/catalog/providers.py index 354e47e2..0940fc79 100644 --- a/apps/api/app/catalog/providers.py +++ b/apps/api/app/catalog/providers.py @@ -3,7 +3,9 @@ from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError from dataclasses import dataclass, field from difflib import SequenceMatcher +import inspect import re +from threading import Event from typing import Any, Callable, Optional from urllib.parse import urlparse @@ -19,6 +21,11 @@ from app.utils.domain_resolver import get_megakino_base_url from app.utils.http_client import get as http_get +_TITLE_CRAWL_EXECUTOR = ThreadPoolExecutor( + max_workers=4, + thread_name_prefix="provider-title-crawl", +) + @dataclass(slots=True) class EpisodeLanguageRecord: @@ -84,15 +91,21 @@ def _relative_path(url: str) -> str: def _run_with_timeout( timeout_seconds: float, func: Callable[..., Any], *args, **kwargs ): - executor = ThreadPoolExecutor(max_workers=1) - future = executor.submit(func, *args, **kwargs) + cancel_event = Event() + 
signature = inspect.signature(func) + accepts_cancel_event = "cancel_event" in signature.parameters or any( + parameter.kind == inspect.Parameter.VAR_KEYWORD + for parameter in signature.parameters.values() + ) + submit_kwargs = dict(kwargs) + if accepts_cancel_event: + submit_kwargs["cancel_event"] = cancel_event + future = _TITLE_CRAWL_EXECUTOR.submit(func, *args, **submit_kwargs) try: return future.result(timeout=max(0.001, timeout_seconds)) except FutureTimeoutError as exc: - future.cancel() + cancel_event.set() raise TimeoutError(f"title crawl exceeded {int(timeout_seconds)}s") from exc - finally: - executor.shutdown(wait=False, cancel_futures=True) def _normalize_provider_data(raw: Any, *, site: str) -> list[EpisodeLanguageRecord]: @@ -517,7 +530,10 @@ def _crawl_aniworld_like_detail( slug: str, title: str, aliases: list[str], + cancel_event: Event | None = None, ) -> TitleRecord: + if cancel_event is not None and cancel_event.is_set(): + raise TimeoutError("title crawl cancelled before start") base_url = str(CATALOG_SITE_CONFIGS[provider_key]["base_url"]).rstrip("/") if provider_key == "aniworld.to": from aniworld.models import AniworldSeries @@ -532,6 +548,8 @@ def _crawl_aniworld_like_detail( episodes: list[EpisodeRecord] = [] for season in series.seasons: + if cancel_event is not None and cancel_event.is_set(): + raise TimeoutError(f"title crawl cancelled for {provider_key}:{slug}") if provider_key == "aniworld.to": episodes.extend(_parse_aniworld_season_rows(season)) else: diff --git a/apps/api/app/config.py b/apps/api/app/config.py index 7848c63d..f66d331a 100644 --- a/apps/api/app/config.py +++ b/apps/api/app/config.py @@ -261,19 +261,24 @@ def _ensure_runtime_home() -> Path: MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN = int( os.getenv("MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN", "100") ) -PROVIDER_INDEX_REFRESH_HOURS = float(os.getenv("PROVIDER_INDEX_REFRESH_HOURS", "24")) -PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD = float( +PROVIDER_INDEX_REFRESH_HOURS = 
_as_non_negative_float( + os.getenv("PROVIDER_INDEX_REFRESH_HOURS"), 24.0 +) +PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD = _as_non_negative_float( os.getenv( "PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD", str(PROVIDER_INDEX_REFRESH_HOURS) - ) + ), + PROVIDER_INDEX_REFRESH_HOURS, ) -PROVIDER_INDEX_REFRESH_HOURS_STO = float( - os.getenv("PROVIDER_INDEX_REFRESH_HOURS_STO", str(PROVIDER_INDEX_REFRESH_HOURS)) +PROVIDER_INDEX_REFRESH_HOURS_STO = _as_non_negative_float( + os.getenv("PROVIDER_INDEX_REFRESH_HOURS_STO", str(PROVIDER_INDEX_REFRESH_HOURS)), + PROVIDER_INDEX_REFRESH_HOURS, ) -PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO = float( +PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO = _as_non_negative_float( os.getenv( "PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO", str(PROVIDER_INDEX_REFRESH_HOURS) - ) + ), + PROVIDER_INDEX_REFRESH_HOURS, ) PROVIDER_INDEX_SCHEDULER_POLL_SECONDS = _as_non_negative_int( os.getenv("PROVIDER_INDEX_SCHEDULER_POLL_SECONDS"), 60 diff --git a/apps/api/app/core/downloader/download.py b/apps/api/app/core/downloader/download.py index 38a1d566..51ccc728 100644 --- a/apps/api/app/core/downloader/download.py +++ b/apps/api/app/core/downloader/download.py @@ -176,7 +176,7 @@ def _attempt_download( "Resolved initial download host: preferred={} resolved={} direct_url={}", provider, chosen, - direct, + "", ) base_hint = title_hint @@ -220,7 +220,7 @@ def _attempt_download( chosen, provider_name, chosen3, - direct3, + "", ) temp_path, info = _ydl_download( direct3, diff --git a/apps/api/app/core/downloader/provider_resolution.py b/apps/api/app/core/downloader/provider_resolution.py index cf4a8c90..63f7675a 100644 --- a/apps/api/app/core/downloader/provider_resolution.py +++ b/apps/api/app/core/downloader/provider_resolution.py @@ -1,8 +1,7 @@ from __future__ import annotations import re -from queue import Queue -from threading import Thread +from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError from typing import List, Optional, Tuple, TYPE_CHECKING from 
loguru import logger @@ -15,6 +14,10 @@ from aniworld.models import Episode _AVAIL_RE = re.compile(r"Available languages:\s*\[([^\]]*)\]", re.IGNORECASE) +_DIRECT_LINK_EXECUTOR = ThreadPoolExecutor( + max_workers=max(1, len(PROVIDER_ORDER)), + thread_name_prefix="provider-resolution", +) def _run_with_timeout( @@ -23,32 +26,13 @@ def _run_with_timeout( timeout_seconds: float, operation: str, ): - result_queue: Queue[tuple[str, object]] = Queue(maxsize=1) - - def _target() -> None: - try: - result_queue.put(("result", callback(*args))) - except Exception as exc: - result_queue.put(("error", exc)) - - thread = Thread( - target=_target, - name=f"provider-resolution-{operation}", - daemon=True, - ) - thread.start() - thread.join(timeout_seconds) - if thread.is_alive(): - raise TimeoutError(f"{operation} timed out after {timeout_seconds:.1f}s") - - outcome, payload = result_queue.get() - if outcome == "error": - if isinstance(payload, BaseException): - raise payload - raise RuntimeError( - f"{operation} failed with non-exception payload: {payload!r}" - ) - return payload + future = _DIRECT_LINK_EXECUTOR.submit(callback, *args) + try: + return future.result(timeout=max(0.001, timeout_seconds)) + except FutureTimeoutError as exc: + raise TimeoutError( + f"{operation} timed out after {timeout_seconds:.1f}s" + ) from exc def _parse_available_languages_from_error(msg: str) -> List[str]: diff --git a/apps/api/app/core/scheduler.py b/apps/api/app/core/scheduler.py index ad2e5c23..3795b480 100644 --- a/apps/api/app/core/scheduler.py +++ b/apps/api/app/core/scheduler.py @@ -66,8 +66,10 @@ def publish(self, snapshot: JobProgressSnapshot) -> None: def close(self, *, flush: bool) -> None: self._stop_event.set() - if flush: - self._wake_event.set() + if not flush: + with self._lock: + self._pending = None + self._wake_event.set() self._thread.join(timeout=5) def _drain_pending(self) -> JobProgressSnapshot | None: diff --git a/apps/api/app/utils/domain_resolver.py 
b/apps/api/app/utils/domain_resolver.py index 4084c3da..4bc486df 100644 --- a/apps/api/app/utils/domain_resolver.py +++ b/apps/api/app/utils/domain_resolver.py @@ -459,40 +459,61 @@ def fetch_megakino_domain( raw_candidates.append(env_candidate) raw_candidates.extend(MEGAKINO_REDIRECT_SEEDS) - hint_domain = _fetch_github_domain_hint(timeout=min(timeout, 8)) - if hint_domain: - raw_candidates.append(hint_domain) - - normalized_candidates: list[str] = [] - for candidate in raw_candidates: - domain = _normalize_domain(candidate) - if not domain or domain in seen: - continue - seen.add(domain) - normalized_candidates.append(domain) - - if not normalized_candidates: - logger.warning( - "Megakino domain resolution failed; no candidate seeds available." + def _iter_candidates(candidates: list[str]) -> list[str]: + normalized: list[str] = [] + for candidate in candidates: + domain = _normalize_domain(candidate) + if not domain or domain in seen: + continue + seen.add(domain) + normalized.append(domain) + return normalized + + normalized_candidates = _iter_candidates(raw_candidates) + if normalized_candidates: + logger.info( + "Megakino domain resolution candidates: {}", + ", ".join(normalized_candidates), ) - return None - - logger.info( - "Megakino domain resolution candidates: {}", - ", ".join(normalized_candidates), - ) + for domain in normalized_candidates: + base_url = _build_base_url(domain) + try: + resolved = _probe_megakino_sitemap(base_url, timeout=timeout) + except Exception as exc: + logger.warning( + "Megakino candidate check failed for {}: {}", base_url, exc + ) + continue + if resolved: + logger.success("Megakino domain resolved: {}", resolved) + return resolved + logger.warning("Megakino candidate failed validation: {}", domain) - for domain in normalized_candidates: - base_url = _build_base_url(domain) - try: - resolved = _probe_megakino_sitemap(base_url, timeout=timeout) - except Exception as exc: - logger.warning("Megakino candidate check failed for {}: 
{}", base_url, exc) - continue - if resolved: - logger.success("Megakino domain resolved: {}", resolved) - return resolved - logger.warning("Megakino candidate failed validation: {}", domain) + hint_domain = _fetch_github_domain_hint(timeout=min(timeout, 8)) + if hint_domain: + hinted_candidates = _iter_candidates([hint_domain]) + if hinted_candidates: + logger.info( + "Megakino domain resolution fallback candidate: {}", + ", ".join(hinted_candidates), + ) + for domain in hinted_candidates: + base_url = _build_base_url(domain) + try: + resolved = _probe_megakino_sitemap(base_url, timeout=timeout) + except Exception as exc: + logger.warning( + "Megakino hinted candidate check failed for {}: {}", + base_url, + exc, + ) + continue + if resolved: + logger.success("Megakino domain resolved: {}", resolved) + return resolved + logger.warning( + "Megakino hinted candidate failed validation: {}", domain + ) logger.warning("Megakino domain resolution failed; no candidate succeeded.") return None diff --git a/apps/api/app/utils/terminal.py b/apps/api/app/utils/terminal.py index 065446d7..ac4afded 100644 --- a/apps/api/app/utils/terminal.py +++ b/apps/api/app/utils/terminal.py @@ -108,8 +108,11 @@ def update(self, snap: ProgressSnapshot) -> None: step = max(1, int(PROGRESS_STEP_PERCENT)) if pct == 100 or pct // step > self._last_step_pct // step: self._last_step_pct = pct + speed_unit = "MB/s" if self.unit == "B" else f"{self.unit}/s" speed = ( - f"{float(snap.speed) / (1024 * 1024):.2f} MB/s" + f"{float(snap.speed) / (1024 * 1024):.2f} {speed_unit}" + if self.unit == "B" and snap.speed is not None + else f"{float(snap.speed):.2f} {speed_unit}" if snap.speed is not None else "-" ) diff --git a/apps/api/app/utils/title_resolver.py b/apps/api/app/utils/title_resolver.py index 1867d166..6a233d5e 100644 --- a/apps/api/app/utils/title_resolver.py +++ b/apps/api/app/utils/title_resolver.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import inspect import re 
import time from difflib import SequenceMatcher @@ -12,11 +13,13 @@ from bs4 import BeautifulSoup # type: ignore from loguru import logger from sqlmodel import Session +from sqlmodel import select from app.utils.logger import config as configure_logger from app.utils.http_client import get as http_get # type: ignore from app.catalog import get_catalog_readiness_error from app.db import ( + ProviderCatalogAlias, engine, list_indexed_titles_for_provider, resolve_indexed_title, @@ -638,6 +641,24 @@ def _score_title_candidate( ) +def _score_indexed_db_candidate(session: Session, *, query: str, candidate) -> float: + query_tokens = _match_tokens(query) + query_norm = _normalize_alnum(query) + best_score = _score_title_candidate(query_tokens, query_norm, candidate.title) + alias_rows = session.exec( + select(ProviderCatalogAlias).where( + (ProviderCatalogAlias.provider == candidate.provider) + & (ProviderCatalogAlias.slug == candidate.slug) + & (ProviderCatalogAlias.indexed_generation == candidate.indexed_generation) + ) + ).all() + for alias in alias_rows: + alias_score = _score_title_candidate(query_tokens, query_norm, alias.alias) + if alias_score > best_score: + best_score = alias_score + return best_score + + def _build_sto_search_terms(query: str) -> List[str]: """ Builds ordered search variants for S.to from a raw query. 
@@ -810,8 +831,8 @@ def _search_sites(sites: List[str]) -> Optional[Tuple[str, str]]: ) if rows: candidate = rows[0] - cand_score = _score_title_candidate( - _match_tokens(q), _normalize_alnum(q), candidate.title + cand_score = _score_indexed_db_candidate( + session, query=q, candidate=candidate ) if cand_score >= _MIN_TITLE_MATCH_SCORE: return (candidate.provider, candidate.slug) @@ -841,8 +862,8 @@ def _search_sites(sites: List[str]) -> Optional[Tuple[str, str]]: ) if rows: candidate = rows[0] - cand_score = _score_title_candidate( - _match_tokens(q), _normalize_alnum(q), candidate.title + cand_score = _score_indexed_db_candidate( + session, query=q, candidate=candidate ) if cand_score >= _MIN_TITLE_MATCH_SCORE: return (candidate.provider, candidate.slug) From c9f5fd5e37c573b7d59e7c40a60a8e4c04ada1b4 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 22:32:33 +0200 Subject: [PATCH 33/45] fix(db)!: make staged provider catalog rows generation-distinct store provider catalog titles, aliases, episodes, and episode languages per generation scope staged replacements to the target generation keep live catalog generations intact during failed staged refreshes add migration to rebuild provider catalog tables with generation-aware primary keys implement real downgrade paths for provider index stage migration make generation rollback inserts PK-safe in provider mapping downgrade BREAKING CHANGE: provider catalog tables now use generation-aware primary keys. Any direct SQL, external tooling, or handwritten migrations that assumed uniqueness on provider/slug or provider/slug/season/episode without indexed_generation must be updated before deploying this schema change. 
--- ...60429_0005_provider_mapping_generations.py | 62 ++- .../20260429_0006_provider_index_stages.py | 72 ++- ...60430_0007_provider_catalog_generations.py | 500 ++++++++++++++++++ apps/api/app/db/models.py | 88 +-- apps/api/tests/conftest.py | 10 +- .../api/qbittorrent/test_torrents.py | 32 +- .../tests/integration/api/torznab/test_api.py | 16 +- .../api/torznab/test_indexed_catalog.py | 11 + .../api/torznab/test_specials_mapping.py | 13 +- .../core/scheduler/test_strm_scheduler.py | 10 +- apps/api/tests/unit/db/test_models.py | 63 +++ .../unit/utils/title_resolver/test_sto.py | 51 +- 12 files changed, 862 insertions(+), 66 deletions(-) create mode 100644 apps/api/app/db/migrations/versions/20260430_0007_provider_catalog_generations.py diff --git a/apps/api/app/db/migrations/versions/20260429_0005_provider_mapping_generations.py b/apps/api/app/db/migrations/versions/20260429_0005_provider_mapping_generations.py index 7b8f9471..68164112 100644 --- a/apps/api/app/db/migrations/versions/20260429_0005_provider_mapping_generations.py +++ b/apps/api/app/db/migrations/versions/20260429_0005_provider_mapping_generations.py @@ -240,7 +240,22 @@ def downgrade() -> None: source, rationale, last_verified_at - FROM providerseriesmapping + FROM ( + SELECT + provider, + slug, + tvdb_id, + confidence, + source, + rationale, + last_verified_at, + ROW_NUMBER() OVER ( + PARTITION BY provider, slug, tvdb_id + ORDER BY indexed_generation DESC, last_verified_at DESC + ) AS rn + FROM providerseriesmapping + ) + WHERE rn = 1 """, "indexes": [ "CREATE INDEX ix_providerseriesmapping_confidence ON providerseriesmapping (confidence)", @@ -299,7 +314,33 @@ def downgrade() -> None: source, rationale, last_verified_at - FROM providerepisodemapping + FROM ( + SELECT + provider, + slug, + provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode, + confidence, + source, + rationale, + last_verified_at, + ROW_NUMBER() OVER ( + PARTITION BY + provider, + slug, + 
provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode + ORDER BY indexed_generation DESC, last_verified_at DESC + ) AS rn + FROM providerepisodemapping + ) + WHERE rn = 1 """, "indexes": [ "CREATE INDEX ix_providerepisodemapping_confidence ON providerepisodemapping (confidence)", @@ -338,7 +379,22 @@ def downgrade() -> None: source, rationale, last_verified_at - FROM providermoviemapping + FROM ( + SELECT + provider, + slug, + tmdb_id, + confidence, + source, + rationale, + last_verified_at, + ROW_NUMBER() OVER ( + PARTITION BY provider, slug, tmdb_id + ORDER BY indexed_generation DESC, last_verified_at DESC + ) AS rn + FROM providermoviemapping + ) + WHERE rn = 1 """, "indexes": [ "CREATE INDEX ix_providermoviemapping_confidence ON providermoviemapping (confidence)", diff --git a/apps/api/app/db/migrations/versions/20260429_0006_provider_index_stages.py b/apps/api/app/db/migrations/versions/20260429_0006_provider_index_stages.py index 9e3373f2..6a2c8f93 100644 --- a/apps/api/app/db/migrations/versions/20260429_0006_provider_index_stages.py +++ b/apps/api/app/db/migrations/versions/20260429_0006_provider_index_stages.py @@ -192,4 +192,74 @@ def upgrade() -> None: def downgrade() -> None: - pass + conn = op.get_bind() + inspector = sa.inspect(conn) + + providerindexstatus_indexes = [ + "ix_providerindexstatus_canonical_next_retry_after", + "ix_providerindexstatus_canonical_ready_at", + "ix_providerindexstatus_canonical_enrichment_status", + "ix_providerindexstatus_detail_next_retry_after", + "ix_providerindexstatus_detail_ready_at", + "ix_providerindexstatus_detail_enrichment_status", + "ix_providerindexstatus_title_index_next_retry_after", + "ix_providerindexstatus_title_index_ready_at", + "ix_providerindexstatus_title_index_status", + "ix_providerindexstatus_active_stage", + ] + providerindexstatus_columns = [ + "canonical_next_retry_after", + "canonical_ready_at", + "canonical_enrichment_status", + "detail_next_retry_after", + 
"detail_ready_at", + "detail_enrichment_status", + "title_index_next_retry_after", + "title_index_ready_at", + "title_index_status", + "active_stage", + ] + if inspector.has_table("providerindexstatus"): + existing_columns = { + column["name"] for column in inspector.get_columns("providerindexstatus") + } + existing_indexes = { + index["name"] for index in inspector.get_indexes("providerindexstatus") + } + with op.batch_alter_table("providerindexstatus") as batch_op: + for index_name in providerindexstatus_indexes: + if index_name in existing_indexes: + batch_op.drop_index(index_name) + for column_name in providerindexstatus_columns: + if column_name in existing_columns: + batch_op.drop_column(column_name) + + staged_columns = [ + "canonical_last_error_summary", + "canonical_failure_count", + "canonical_next_retry_after", + "canonical_last_success_at", + "canonical_last_attempted_at", + "canonical_status", + "detail_last_error_summary", + "detail_failure_count", + "detail_next_retry_after", + "detail_last_success_at", + "detail_last_attempted_at", + "detail_status", + ] + if inspector.has_table("providertitleindexstate"): + existing_columns = { + column["name"] + for column in inspector.get_columns("providertitleindexstate") + } + existing_indexes = { + index["name"] for index in inspector.get_indexes("providertitleindexstate") + } + with op.batch_alter_table("providertitleindexstate") as batch_op: + for column_name in staged_columns: + index_name = f"ix_providertitleindexstate_{column_name}" + if index_name in existing_indexes: + batch_op.drop_index(index_name) + if column_name in existing_columns: + batch_op.drop_column(column_name) diff --git a/apps/api/app/db/migrations/versions/20260430_0007_provider_catalog_generations.py b/apps/api/app/db/migrations/versions/20260430_0007_provider_catalog_generations.py new file mode 100644 index 00000000..31710954 --- /dev/null +++ b/apps/api/app/db/migrations/versions/20260430_0007_provider_catalog_generations.py @@ -0,0 
+1,500 @@ +"""Make provider catalog rows generation-distinct + +Revision ID: 20260430_0007 +Revises: 20260429_0006 +Create Date: 2026-04-30 12:00:00.000000 +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260430_0007" +down_revision = "20260429_0006" +branch_labels = None +depends_on = None + + +def _rebuild_table( + *, + table_name: str, + temp_table: str, + create_sql: str, + copy_sql: str, + index_sql: list[str], + require_generation_in_pk: bool, +) -> None: + conn = op.get_bind() + inspector = sa.inspect(conn) + if not inspector.has_table(table_name): + return + pk = inspector.get_pk_constraint(table_name) or {} + pk_columns = pk.get("constrained_columns") or [] + if ("indexed_generation" in pk_columns) is require_generation_in_pk: + return + + op.execute(sa.text(f"DROP TABLE IF EXISTS {temp_table}")) + op.execute(sa.text(create_sql)) + op.execute(sa.text(copy_sql)) + op.drop_table(table_name) + op.rename_table(temp_table, table_name) + for statement in index_sql: + op.execute(sa.text(statement)) + + +def upgrade() -> None: + specs = { + "providercatalogtitle": { + "temp_table": "providercatalogtitle_v2", + "create_sql": """ + CREATE TABLE providercatalogtitle_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + indexed_generation VARCHAR NOT NULL, + title VARCHAR NOT NULL, + normalized_title VARCHAR NOT NULL, + media_type_hint VARCHAR NOT NULL, + relative_path VARCHAR NOT NULL, + last_indexed_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, indexed_generation) + ) + """, + "copy_sql": """ + INSERT INTO providercatalogtitle_v2 ( + provider, + slug, + indexed_generation, + title, + normalized_title, + media_type_hint, + relative_path, + last_indexed_at + ) + SELECT + provider, + slug, + indexed_generation, + title, + normalized_title, + media_type_hint, + relative_path, + last_indexed_at + FROM providercatalogtitle + """, + "index_sql": [ + "CREATE INDEX ix_providercatalogtitle_title ON 
providercatalogtitle (title)", + "CREATE INDEX ix_providercatalogtitle_normalized_title ON providercatalogtitle (normalized_title)", + "CREATE INDEX ix_providercatalogtitle_media_type_hint ON providercatalogtitle (media_type_hint)", + "CREATE INDEX ix_providercatalogtitle_indexed_generation ON providercatalogtitle (indexed_generation)", + "CREATE INDEX ix_providercatalogtitle_last_indexed_at ON providercatalogtitle (last_indexed_at)", + ], + }, + "providercatalogalias": { + "temp_table": "providercatalogalias_v2", + "create_sql": """ + CREATE TABLE providercatalogalias_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + alias VARCHAR NOT NULL, + indexed_generation VARCHAR NOT NULL, + normalized_alias VARCHAR NOT NULL, + last_indexed_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, alias, indexed_generation) + ) + """, + "copy_sql": """ + INSERT INTO providercatalogalias_v2 ( + provider, + slug, + alias, + indexed_generation, + normalized_alias, + last_indexed_at + ) + SELECT + provider, + slug, + alias, + indexed_generation, + normalized_alias, + last_indexed_at + FROM providercatalogalias + """, + "index_sql": [ + "CREATE INDEX ix_providercatalogalias_normalized_alias ON providercatalogalias (normalized_alias)", + "CREATE INDEX ix_providercatalogalias_indexed_generation ON providercatalogalias (indexed_generation)", + "CREATE INDEX ix_providercatalogalias_last_indexed_at ON providercatalogalias (last_indexed_at)", + ], + }, + "providercatalogepisode": { + "temp_table": "providercatalogepisode_v2", + "create_sql": """ + CREATE TABLE providercatalogepisode_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + season INTEGER NOT NULL, + episode INTEGER NOT NULL, + indexed_generation VARCHAR NOT NULL, + title_primary VARCHAR, + title_secondary VARCHAR, + relative_path VARCHAR NOT NULL, + media_type_hint VARCHAR NOT NULL, + last_indexed_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, season, episode, indexed_generation) + ) + """, + "copy_sql": 
""" + INSERT INTO providercatalogepisode_v2 ( + provider, + slug, + season, + episode, + indexed_generation, + title_primary, + title_secondary, + relative_path, + media_type_hint, + last_indexed_at + ) + SELECT + provider, + slug, + season, + episode, + indexed_generation, + title_primary, + title_secondary, + relative_path, + media_type_hint, + last_indexed_at + FROM providercatalogepisode + """, + "index_sql": [ + "CREATE INDEX ix_providercatalogepisode_media_type_hint ON providercatalogepisode (media_type_hint)", + "CREATE INDEX ix_providercatalogepisode_indexed_generation ON providercatalogepisode (indexed_generation)", + "CREATE INDEX ix_providercatalogepisode_last_indexed_at ON providercatalogepisode (last_indexed_at)", + ], + }, + "providerepisodelanguage": { + "temp_table": "providerepisodelanguage_v2", + "create_sql": """ + CREATE TABLE providerepisodelanguage_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + season INTEGER NOT NULL, + episode INTEGER NOT NULL, + language VARCHAR NOT NULL, + indexed_generation VARCHAR NOT NULL, + normalized_language VARCHAR NOT NULL, + host_hints JSON, + last_indexed_at DATETIME NOT NULL, + PRIMARY KEY ( + provider, + slug, + season, + episode, + language, + indexed_generation + ) + ) + """, + "copy_sql": """ + INSERT INTO providerepisodelanguage_v2 ( + provider, + slug, + season, + episode, + language, + indexed_generation, + normalized_language, + host_hints, + last_indexed_at + ) + SELECT + provider, + slug, + season, + episode, + language, + indexed_generation, + normalized_language, + host_hints, + last_indexed_at + FROM providerepisodelanguage + """, + "index_sql": [ + "CREATE INDEX ix_providerepisodelanguage_normalized_language ON providerepisodelanguage (normalized_language)", + "CREATE INDEX ix_providerepisodelanguage_indexed_generation ON providerepisodelanguage (indexed_generation)", + "CREATE INDEX ix_providerepisodelanguage_last_indexed_at ON providerepisodelanguage (last_indexed_at)", + ], + }, + 
} + + for table_name, spec in specs.items(): + _rebuild_table( + table_name=table_name, + temp_table=spec["temp_table"], + create_sql=spec["create_sql"], + copy_sql=spec["copy_sql"], + index_sql=spec["index_sql"], + require_generation_in_pk=True, + ) + + +def downgrade() -> None: + specs = { + "providercatalogtitle": { + "temp_table": "providercatalogtitle_v2", + "create_sql": """ + CREATE TABLE providercatalogtitle_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + title VARCHAR NOT NULL, + normalized_title VARCHAR NOT NULL, + media_type_hint VARCHAR NOT NULL, + relative_path VARCHAR NOT NULL, + indexed_generation VARCHAR NOT NULL, + last_indexed_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug) + ) + """, + "copy_sql": """ + INSERT INTO providercatalogtitle_v2 ( + provider, + slug, + title, + normalized_title, + media_type_hint, + relative_path, + indexed_generation, + last_indexed_at + ) + SELECT + provider, + slug, + title, + normalized_title, + media_type_hint, + relative_path, + indexed_generation, + last_indexed_at + FROM ( + SELECT + provider, + slug, + title, + normalized_title, + media_type_hint, + relative_path, + indexed_generation, + last_indexed_at, + ROW_NUMBER() OVER ( + PARTITION BY provider, slug + ORDER BY indexed_generation DESC, last_indexed_at DESC + ) AS rn + FROM providercatalogtitle + ) + WHERE rn = 1 + """, + "index_sql": [ + "CREATE INDEX ix_providercatalogtitle_title ON providercatalogtitle (title)", + "CREATE INDEX ix_providercatalogtitle_normalized_title ON providercatalogtitle (normalized_title)", + "CREATE INDEX ix_providercatalogtitle_media_type_hint ON providercatalogtitle (media_type_hint)", + "CREATE INDEX ix_providercatalogtitle_indexed_generation ON providercatalogtitle (indexed_generation)", + "CREATE INDEX ix_providercatalogtitle_last_indexed_at ON providercatalogtitle (last_indexed_at)", + ], + }, + "providercatalogalias": { + "temp_table": "providercatalogalias_v2", + "create_sql": """ + CREATE TABLE 
providercatalogalias_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + alias VARCHAR NOT NULL, + normalized_alias VARCHAR NOT NULL, + indexed_generation VARCHAR NOT NULL, + last_indexed_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, alias) + ) + """, + "copy_sql": """ + INSERT INTO providercatalogalias_v2 ( + provider, + slug, + alias, + normalized_alias, + indexed_generation, + last_indexed_at + ) + SELECT + provider, + slug, + alias, + normalized_alias, + indexed_generation, + last_indexed_at + FROM ( + SELECT + provider, + slug, + alias, + normalized_alias, + indexed_generation, + last_indexed_at, + ROW_NUMBER() OVER ( + PARTITION BY provider, slug, alias + ORDER BY indexed_generation DESC, last_indexed_at DESC + ) AS rn + FROM providercatalogalias + ) + WHERE rn = 1 + """, + "index_sql": [ + "CREATE INDEX ix_providercatalogalias_normalized_alias ON providercatalogalias (normalized_alias)", + "CREATE INDEX ix_providercatalogalias_indexed_generation ON providercatalogalias (indexed_generation)", + "CREATE INDEX ix_providercatalogalias_last_indexed_at ON providercatalogalias (last_indexed_at)", + ], + }, + "providercatalogepisode": { + "temp_table": "providercatalogepisode_v2", + "create_sql": """ + CREATE TABLE providercatalogepisode_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + season INTEGER NOT NULL, + episode INTEGER NOT NULL, + title_primary VARCHAR, + title_secondary VARCHAR, + relative_path VARCHAR NOT NULL, + media_type_hint VARCHAR NOT NULL, + indexed_generation VARCHAR NOT NULL, + last_indexed_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, season, episode) + ) + """, + "copy_sql": """ + INSERT INTO providercatalogepisode_v2 ( + provider, + slug, + season, + episode, + title_primary, + title_secondary, + relative_path, + media_type_hint, + indexed_generation, + last_indexed_at + ) + SELECT + provider, + slug, + season, + episode, + title_primary, + title_secondary, + relative_path, + media_type_hint, + 
indexed_generation, + last_indexed_at + FROM ( + SELECT + provider, + slug, + season, + episode, + title_primary, + title_secondary, + relative_path, + media_type_hint, + indexed_generation, + last_indexed_at, + ROW_NUMBER() OVER ( + PARTITION BY provider, slug, season, episode + ORDER BY indexed_generation DESC, last_indexed_at DESC + ) AS rn + FROM providercatalogepisode + ) + WHERE rn = 1 + """, + "index_sql": [ + "CREATE INDEX ix_providercatalogepisode_media_type_hint ON providercatalogepisode (media_type_hint)", + "CREATE INDEX ix_providercatalogepisode_indexed_generation ON providercatalogepisode (indexed_generation)", + "CREATE INDEX ix_providercatalogepisode_last_indexed_at ON providercatalogepisode (last_indexed_at)", + ], + }, + "providerepisodelanguage": { + "temp_table": "providerepisodelanguage_v2", + "create_sql": """ + CREATE TABLE providerepisodelanguage_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + season INTEGER NOT NULL, + episode INTEGER NOT NULL, + language VARCHAR NOT NULL, + normalized_language VARCHAR NOT NULL, + host_hints JSON, + indexed_generation VARCHAR NOT NULL, + last_indexed_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, season, episode, language) + ) + """, + "copy_sql": """ + INSERT INTO providerepisodelanguage_v2 ( + provider, + slug, + season, + episode, + language, + normalized_language, + host_hints, + indexed_generation, + last_indexed_at + ) + SELECT + provider, + slug, + season, + episode, + language, + normalized_language, + host_hints, + indexed_generation, + last_indexed_at + FROM ( + SELECT + provider, + slug, + season, + episode, + language, + normalized_language, + host_hints, + indexed_generation, + last_indexed_at, + ROW_NUMBER() OVER ( + PARTITION BY provider, slug, season, episode, language + ORDER BY indexed_generation DESC, last_indexed_at DESC + ) AS rn + FROM providerepisodelanguage + ) + WHERE rn = 1 + """, + "index_sql": [ + "CREATE INDEX ix_providerepisodelanguage_normalized_language ON 
providerepisodelanguage (normalized_language)", + "CREATE INDEX ix_providerepisodelanguage_indexed_generation ON providerepisodelanguage (indexed_generation)", + "CREATE INDEX ix_providerepisodelanguage_last_indexed_at ON providerepisodelanguage (last_indexed_at)", + ], + }, + } + + for table_name, spec in specs.items(): + _rebuild_table( + table_name=table_name, + temp_table=spec["temp_table"], + create_sql=spec["create_sql"], + copy_sql=spec["copy_sql"], + index_sql=spec["index_sql"], + require_generation_in_pk=False, + ) diff --git a/apps/api/app/db/models.py b/apps/api/app/db/models.py index f320ba61..d3c72462 100644 --- a/apps/api/app/db/models.py +++ b/apps/api/app/db/models.py @@ -237,11 +237,11 @@ class ProviderTitleIndexState(ModelBase, table=True): class ProviderCatalogTitle(ModelBase, table=True): provider: str = Field(primary_key=True) slug: str = Field(primary_key=True) + indexed_generation: str = Field(primary_key=True, index=True) title: str = Field(index=True) normalized_title: str = Field(index=True) media_type_hint: str = Field(default="series", index=True) relative_path: str - indexed_generation: str = Field(index=True) last_indexed_at: datetime = Field(default_factory=utcnow, index=True) @@ -249,8 +249,8 @@ class ProviderCatalogAlias(ModelBase, table=True): provider: str = Field(primary_key=True) slug: str = Field(primary_key=True) alias: str = Field(primary_key=True) + indexed_generation: str = Field(primary_key=True, index=True) normalized_alias: str = Field(index=True) - indexed_generation: str = Field(index=True) last_indexed_at: datetime = Field(default_factory=utcnow, index=True) @@ -259,11 +259,11 @@ class ProviderCatalogEpisode(ModelBase, table=True): slug: str = Field(primary_key=True) season: int = Field(primary_key=True) episode: int = Field(primary_key=True) + indexed_generation: str = Field(primary_key=True, index=True) title_primary: Optional[str] = None title_secondary: Optional[str] = None relative_path: str media_type_hint: str 
= Field(default="episode", index=True) - indexed_generation: str = Field(index=True) last_indexed_at: datetime = Field(default_factory=utcnow, index=True) @@ -273,9 +273,9 @@ class ProviderEpisodeLanguage(ModelBase, table=True): season: int = Field(primary_key=True) episode: int = Field(primary_key=True) language: str = Field(primary_key=True) + indexed_generation: str = Field(primary_key=True, index=True) normalized_language: str = Field(index=True) host_hints: Optional[list[str]] = Field(sa_column=Column(JSON), default=None) - indexed_generation: str = Field(index=True) last_indexed_at: datetime = Field(default_factory=utcnow, index=True) @@ -952,16 +952,16 @@ def replace_provider_catalog_title( relative_path: str, indexed_generation: str, ) -> ProviderCatalogTitle: - rec = session.get(ProviderCatalogTitle, (provider, slug)) + rec = session.get(ProviderCatalogTitle, (provider, slug, indexed_generation)) if rec is None: rec = ProviderCatalogTitle( provider=provider, slug=slug, + indexed_generation=indexed_generation, title=title, normalized_title=normalize_catalog_text(title), media_type_hint=media_type_hint, relative_path=relative_path, - indexed_generation=indexed_generation, last_indexed_at=utcnow(), ) else: @@ -969,7 +969,6 @@ def replace_provider_catalog_title( rec.normalized_title = normalize_catalog_text(title) rec.media_type_hint = media_type_hint rec.relative_path = relative_path - rec.indexed_generation = indexed_generation rec.last_indexed_at = utcnow() session.add(rec) return rec @@ -987,6 +986,7 @@ def replace_provider_catalog_aliases( ProviderCatalogAlias.__table__.delete().where( (ProviderCatalogAlias.provider == provider) & (ProviderCatalogAlias.slug == slug) + & (ProviderCatalogAlias.indexed_generation == indexed_generation) ) ) seen: set[str] = set() @@ -1000,8 +1000,8 @@ def replace_provider_catalog_aliases( provider=provider, slug=slug, alias=alias_clean, - normalized_alias=normalize_catalog_text(alias_clean), 
indexed_generation=indexed_generation, + normalized_alias=normalize_catalog_text(alias_clean), last_indexed_at=utcnow(), ) ) @@ -1019,12 +1019,14 @@ def replace_provider_catalog_episodes( ProviderEpisodeLanguage.__table__.delete().where( (ProviderEpisodeLanguage.provider == provider) & (ProviderEpisodeLanguage.slug == slug) + & (ProviderEpisodeLanguage.indexed_generation == indexed_generation) ) ) session.exec( ProviderCatalogEpisode.__table__.delete().where( (ProviderCatalogEpisode.provider == provider) & (ProviderCatalogEpisode.slug == slug) + & (ProviderCatalogEpisode.indexed_generation == indexed_generation) ) ) for item in episodes: @@ -1034,11 +1036,11 @@ def replace_provider_catalog_episodes( slug=slug, season=int(item["season"]), episode=int(item["episode"]), + indexed_generation=indexed_generation, title_primary=item.get("title_primary"), title_secondary=item.get("title_secondary"), relative_path=item["relative_path"], media_type_hint=item.get("media_type_hint", "episode"), - indexed_generation=indexed_generation, last_indexed_at=utcnow(), ) ) @@ -1053,9 +1055,9 @@ def replace_provider_catalog_episodes( season=int(item["season"]), episode=int(item["episode"]), language=language, + indexed_generation=indexed_generation, normalized_language=normalize_catalog_text(language), host_hints=list(language_payload.get("host_hints") or []), - indexed_generation=indexed_generation, last_indexed_at=utcnow(), ) ) @@ -1296,22 +1298,23 @@ def upsert_canonical_series( rec.mal_id = mal_id rec.last_synced_at = utcnow() session.add(rec) - session.exec( - CanonicalSeriesAlias.__table__.delete().where( - CanonicalSeriesAlias.tvdb_id == tvdb_id - ) - ) - for alias in aliases or []: - alias_clean = (alias or "").strip() - if not alias_clean: - continue - session.add( - CanonicalSeriesAlias( - tvdb_id=tvdb_id, - alias=alias_clean, - normalized_alias=normalize_catalog_text(alias_clean), + if aliases is not None: + session.exec( + CanonicalSeriesAlias.__table__.delete().where( + 
CanonicalSeriesAlias.tvdb_id == tvdb_id ) ) + for alias in aliases: + alias_clean = (alias or "").strip() + if not alias_clean: + continue + session.add( + CanonicalSeriesAlias( + tvdb_id=tvdb_id, + alias=alias_clean, + normalized_alias=normalize_catalog_text(alias_clean), + ) + ) return rec @@ -1408,9 +1411,10 @@ def resolve_indexed_title( status = session.get(ProviderIndexStatus, provider) if status is None or not status.latest_success_generation: return None - row = session.get(ProviderCatalogTitle, (provider, slug)) - if row is None or row.indexed_generation != status.latest_success_generation: - return None + row = session.get( + ProviderCatalogTitle, + (provider, slug, status.latest_success_generation), + ) return row.title if row else None @@ -1442,15 +1446,22 @@ def search_indexed_provider_titles( if visible_generations.get(row.provider) == row.indexed_generation ] + alias_rows = session.exec( + select(ProviderCatalogAlias).where(ProviderCatalogAlias.provider.in_(providers)) + ).all() + aliases_by_key: dict[tuple[str, str, str], list[str]] = {} + for alias in alias_rows: + if visible_generations.get(alias.provider) != alias.indexed_generation: + continue + aliases_by_key.setdefault( + (alias.provider, alias.slug, alias.indexed_generation), [] + ).append(alias.normalized_alias) + def _score(row: ProviderCatalogTitle) -> tuple[int, int]: names = [row.normalized_title] - alias_rows = session.exec( - select(ProviderCatalogAlias).where( - (ProviderCatalogAlias.provider == row.provider) - & (ProviderCatalogAlias.slug == row.slug) - ) - ).all() - names.extend(alias.normalized_alias for alias in alias_rows) + names.extend( + aliases_by_key.get((row.provider, row.slug, row.indexed_generation), []) + ) best = 0 exact = 0 for name in names: @@ -1462,12 +1473,9 @@ def _score(row: ProviderCatalogTitle) -> tuple[int, int]: best = overlap return (exact, best) - ranked = sorted( - rows, - key=lambda row: _score(row), - reverse=True, - ) - filtered = [row for row in ranked 
if _score(row)[1] > 0 or _score(row)[0] > 0] + scored_rows = [(row, _score(row)) for row in rows] + ranked = sorted(scored_rows, key=lambda item: item[1], reverse=True) + filtered = [row for row, score in ranked if score[1] > 0 or score[0] > 0] return filtered[: max(1, limit)] diff --git a/apps/api/tests/conftest.py b/apps/api/tests/conftest.py index fe6a123f..8886999a 100644 --- a/apps/api/tests/conftest.py +++ b/apps/api/tests/conftest.py @@ -128,11 +128,11 @@ def client(tmp_path, monkeypatch): create_db_and_tables() - monkeypatch.setattr( - qb_torrents, - "schedule_download", - lambda req, autostart=True: "job-1", - ) + def _schedule_download(req, *, autostart=True): + del req, autostart + return "job-1" + + monkeypatch.setattr(qb_torrents, "schedule_download", _schedule_download) monkeypatch.setattr(qb_torrents, "start_scheduled_job", lambda job_id, req: None) monkeypatch.setattr(qb_torrents, "cancel_job", lambda job_id: None) diff --git a/apps/api/tests/integration/api/qbittorrent/test_torrents.py b/apps/api/tests/integration/api/qbittorrent/test_torrents.py index dfe6ea58..4afad448 100644 --- a/apps/api/tests/integration/api/qbittorrent/test_torrents.py +++ b/apps/api/tests/integration/api/qbittorrent/test_torrents.py @@ -79,13 +79,15 @@ def test_torrents_add_starts_worker_after_task_write(client, monkeypatch): calls: list[tuple[str, str]] = [] + def _schedule_download(req, *, autostart=True): + del req + calls.append(("schedule", "autostart" if autostart else "deferred")) + return "job-1" + monkeypatch.setattr( qb_torrents, "schedule_download", - lambda req, autostart=True: ( - calls.append(("schedule", "autostart" if autostart else "deferred")) - or "job-1" - ), + _schedule_download, ) monkeypatch.setattr( qb_torrents, @@ -104,3 +106,25 @@ def test_torrents_add_starts_worker_after_task_write(client, monkeypatch): assert response.status_code == 200 assert calls == [("schedule", "deferred"), ("start", "job-1")] + + +def 
test_torrents_add_returns_500_when_start_fails(client, monkeypatch): + from app.utils.magnet import build_magnet + import app.api.qbittorrent.torrents as qb_torrents + + monkeypatch.setattr( + qb_torrents, + "start_scheduled_job", + lambda job_id, req: (_ for _ in ()).throw(RuntimeError("boom")), + ) + + magnet = build_magnet( + title="Title", + slug="slug", + season=1, + episode=1, + language="German Dub", + ) + response = client.post("/api/v2/torrents/add", data={"urls": magnet}) + + assert response.status_code == 500 diff --git a/apps/api/tests/integration/api/torznab/test_api.py b/apps/api/tests/integration/api/torznab/test_api.py index d7d388ef..946a1728 100644 --- a/apps/api/tests/integration/api/torznab/test_api.py +++ b/apps/api/tests/integration/api/torznab/test_api.py @@ -218,6 +218,7 @@ def test_tvsearch_uses_id_resolved_query_when_q_missing(client, monkeypatch): def test_tvsearch_season_search_emits_multiple_episodes(client, monkeypatch) -> None: import app.api.torznab as tn + import app.api.torznab.api as torznab_api _seed_ready_tv_catalog( canonical_title="Series", @@ -231,6 +232,7 @@ def test_tvsearch_season_search_emits_multiple_episodes(client, monkeypatch) -> AssertionError("unexpected live probe") ), ) + monkeypatch.setattr(torznab_api, "STRM_FILES_MODE", "no") resp = client.get( "/torznab/api", @@ -239,7 +241,7 @@ def test_tvsearch_season_search_emits_multiple_episodes(client, monkeypatch) -> assert resp.status_code == 200 root = ET.fromstring(resp.text) items = root.findall("./channel/item") - assert len(items) == 6 + assert len(items) == 3 urls = [ ( item.find("enclosure").get("url") @@ -257,6 +259,7 @@ def test_tvsearch_season_search_fallback_stops_on_consecutive_misses( client, monkeypatch ) -> None: import app.api.torznab as tn + import app.api.torznab.api as torznab_api _seed_ready_tv_catalog( canonical_title="Series", @@ -270,6 +273,7 @@ def test_tvsearch_season_search_fallback_stops_on_consecutive_misses( AssertionError("unexpected live 
probe") ), ) + monkeypatch.setattr(torznab_api, "STRM_FILES_MODE", "no") resp = client.get( "/torznab/api", @@ -278,11 +282,12 @@ def test_tvsearch_season_search_fallback_stops_on_consecutive_misses( assert resp.status_code == 200 root = ET.fromstring(resp.text) items = root.findall("./channel/item") - assert len(items) == 4 + assert len(items) == 2 def test_tvsearch_ep_zero_is_treated_as_season_search(client, monkeypatch) -> None: import app.api.torznab as tn + import app.api.torznab.api as torznab_api _seed_ready_tv_catalog( canonical_title="Series", @@ -296,6 +301,7 @@ def test_tvsearch_ep_zero_is_treated_as_season_search(client, monkeypatch) -> No AssertionError("unexpected live probe") ), ) + monkeypatch.setattr(torznab_api, "STRM_FILES_MODE", "no") resp = client.get( "/torznab/api", @@ -304,7 +310,7 @@ def test_tvsearch_ep_zero_is_treated_as_season_search(client, monkeypatch) -> No assert resp.status_code == 200 root = ET.fromstring(resp.text) items = root.findall("./channel/item") - assert len(items) == 4 + assert len(items) == 2 urls = [ ( item.find("enclosure").get("url") @@ -319,6 +325,7 @@ def test_tvsearch_ep_zero_is_treated_as_season_search(client, monkeypatch) -> No def test_tvsearch_fast_season_mode_avoids_live_probe(client, monkeypatch) -> None: import app.api.torznab as tn + import app.api.torznab.api as torznab_api _seed_ready_tv_catalog( canonical_title="Series", @@ -332,6 +339,7 @@ def test_tvsearch_fast_season_mode_avoids_live_probe(client, monkeypatch) -> Non AssertionError("unexpected live probe") ), ) + monkeypatch.setattr(torznab_api, "STRM_FILES_MODE", "no") resp = client.get( "/torznab/api", @@ -340,7 +348,7 @@ def test_tvsearch_fast_season_mode_avoids_live_probe(client, monkeypatch) -> Non assert resp.status_code == 200 root = ET.fromstring(resp.text) items = root.findall("./channel/item") - assert len(items) == 4 + assert len(items) == 2 def test_tvsearch_season_search_limit_is_hard_item_cap(client) -> None: diff --git 
a/apps/api/tests/integration/api/torznab/test_indexed_catalog.py b/apps/api/tests/integration/api/torznab/test_indexed_catalog.py index 5c3d0bf5..d40cd41f 100644 --- a/apps/api/tests/integration/api/torznab/test_indexed_catalog.py +++ b/apps/api/tests/integration/api/torznab/test_indexed_catalog.py @@ -130,6 +130,17 @@ def test_search_returns_503_when_catalog_bootstrap_pending(client) -> None: assert "bootstrap" in response.json()["detail"].lower() +def test_search_test_result_bypasses_catalog_bootstrap(client, monkeypatch) -> None: + import app.api.torznab.api as torznab_api + + monkeypatch.setattr(torznab_api, "TORZNAB_RETURN_TEST_RESULT", True) + + response = client.get("/torznab/api", params={"t": "search"}) + + assert response.status_code == 200 + assert "" in response.text + + def test_search_uses_indexed_catalog_without_live_probe(client, monkeypatch) -> None: _seed_ready_catalog() monkeypatch.setattr( diff --git a/apps/api/tests/integration/api/torznab/test_specials_mapping.py b/apps/api/tests/integration/api/torznab/test_specials_mapping.py index 3100a9e9..5049c105 100644 --- a/apps/api/tests/integration/api/torznab/test_specials_mapping.py +++ b/apps/api/tests/integration/api/torznab/test_specials_mapping.py @@ -157,8 +157,13 @@ def test_tvsearch_falls_back_to_special_mapping_when_requested_episode_missing( assert "aw_e=4" in url -def test_tvsearch_reuses_resolved_special_mapping_across_languages(client) -> None: +def test_tvsearch_reuses_resolved_special_mapping_across_languages( + client, monkeypatch +) -> None: + import app.api.torznab.api as torznab_api + _seed_special_mapping_catalog(languages=["German Sub", "English Sub"]) + monkeypatch.setattr(torznab_api, "STRM_FILES_MODE", "no") resp = client.get( "/torznab/api", @@ -167,7 +172,7 @@ def test_tvsearch_reuses_resolved_special_mapping_across_languages(client) -> No assert resp.status_code == 200 root = ET.fromstring(resp.text) items = root.findall("./channel/item") - assert len(items) == 4 + assert 
len(items) == 2 urls = [ ( item.find("enclosure").get("url") @@ -176,8 +181,8 @@ def test_tvsearch_reuses_resolved_special_mapping_across_languages(client) -> No ) for item in items ] - assert sum("aw_lang=German+Sub" in url for url in urls) == 2 - assert sum("aw_lang=English+Sub" in url for url in urls) == 2 + assert sum("aw_lang=German+Sub" in url for url in urls) == 1 + assert sum("aw_lang=English+Sub" in url for url in urls) == 1 def test_tvsearch_guid_alias_suffix_only_when_alias_differs(client) -> None: diff --git a/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py b/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py index 3db86426..57e70e20 100644 --- a/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py +++ b/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py @@ -315,9 +315,11 @@ def close(self): "eta": 1, } ) - time.sleep(0.05) writer.close(flush=False) + deadline = time.monotonic() + 0.5 + while time.monotonic() < deadline: + if writes: + break + time.sleep(0.01) - assert writes - assert writes[-1]["job_id"] == "job-2" - assert writes[-1]["downloaded_bytes"] == 5000 + assert writes == [] diff --git a/apps/api/tests/unit/db/test_models.py b/apps/api/tests/unit/db/test_models.py index 2e0d136a..0f162b4f 100644 --- a/apps/api/tests/unit/db/test_models.py +++ b/apps/api/tests/unit/db/test_models.py @@ -175,3 +175,66 @@ def test_replace_canonical_episodes_dedupes_duplicate_numbers(client): assert len(rows) == 2 assert {(row.season, row.episode) for row in rows} == {(1, 1), (1, 2)} + + +def test_upsert_canonical_series_keeps_aliases_when_omitted(client): + from sqlmodel import Session, select + from app.db import CanonicalSeriesAlias, engine, upsert_canonical_series + + with Session(engine) as s: + upsert_canonical_series( + s, + tvdb_id=999, + title="Demo Show", + aliases=["Demo Alias"], + ) + upsert_canonical_series( + s, + tvdb_id=999, + title="Demo Show Renamed", + aliases=None, + ) + s.commit() + + aliases = s.exec( + 
select(CanonicalSeriesAlias).where(CanonicalSeriesAlias.tvdb_id == 999) + ).all() + + assert [alias.alias for alias in aliases] == ["Demo Alias"] + + +def test_replace_provider_catalog_title_keeps_live_generation_intact(client): + from sqlmodel import Session, select + from app.db import ProviderCatalogTitle, engine, replace_provider_catalog_title + + with Session(engine) as s: + replace_provider_catalog_title( + s, + provider="aniworld.to", + slug="demo-show", + title="Demo Show", + media_type_hint="series", + relative_path="/anime/stream/demo-show", + indexed_generation="gen-live", + ) + replace_provider_catalog_title( + s, + provider="aniworld.to", + slug="demo-show", + title="Demo Show Updated", + media_type_hint="series", + relative_path="/anime/stream/demo-show-updated", + indexed_generation="gen-staged", + ) + s.commit() + + rows = s.exec( + select(ProviderCatalogTitle).where( + ProviderCatalogTitle.provider == "aniworld.to" + ) + ).all() + + assert {(row.slug, row.indexed_generation, row.title) for row in rows} == { + ("demo-show", "gen-live", "Demo Show"), + ("demo-show", "gen-staged", "Demo Show Updated"), + } diff --git a/apps/api/tests/unit/utils/title_resolver/test_sto.py b/apps/api/tests/unit/utils/title_resolver/test_sto.py index 7bcf0f41..5116e228 100644 --- a/apps/api/tests/unit/utils/title_resolver/test_sto.py +++ b/apps/api/tests/unit/utils/title_resolver/test_sto.py @@ -1,6 +1,7 @@ from urllib.parse import parse_qs, urlparse import requests +from sqlmodel import Session import app.utils.title_resolver as tr @@ -93,7 +94,7 @@ def test_slug_from_query_prefers_precise_title_over_shared_token( monkeypatch.setattr(tr, "load_or_refresh_alternatives", lambda _site: {}) monkeypatch.setattr(tr, "_search_sto_slug", lambda _query: None) - assert tr.slug_from_query("Rookie Le flic de Los Angeles") == ( + assert tr.slug_from_query("Rookie Le flic de Los Angeles", site="s.to") == ( "s.to", "the-rookie", ) @@ -116,3 +117,51 @@ def 
test_slug_from_query_rejects_low_confidence_overlap(monkeypatch) -> None: monkeypatch.setattr(tr, "_search_sto_slug", lambda _query: None) assert tr.slug_from_query("Rookie Le flic de Los Angeles") is None + + +def test_slug_from_query_accepts_db_alias_match(monkeypatch) -> None: + from app.db import ( + engine, + replace_provider_catalog_aliases, + replace_provider_catalog_title, + upsert_provider_index_status, + ) + + monkeypatch.setattr(tr, "CATALOG_SITES_LIST", ["s.to"]) + monkeypatch.setattr(tr, "load_or_refresh_index", lambda _site: {}) + monkeypatch.setattr(tr, "load_or_refresh_alternatives", lambda _site: {}) + monkeypatch.setattr(tr, "_search_sto_slug", lambda _query: None) + monkeypatch.setattr(tr, "get_catalog_readiness_error", lambda: None) + + with Session(engine) as session: + upsert_provider_index_status( + session, + provider="s.to", + refresh_interval_hours=24.0, + status="ready", + current_generation="gen-alias", + latest_success_generation="gen-alias", + bootstrap_completed=True, + ) + replace_provider_catalog_title( + session, + provider="s.to", + slug="the-rookie", + title="The Rookie", + media_type_hint="series", + relative_path="/serie/the-rookie", + indexed_generation="gen-alias", + ) + replace_provider_catalog_aliases( + session, + provider="s.to", + slug="the-rookie", + aliases=["Rookie Le flic de Los Angeles"], + indexed_generation="gen-alias", + ) + session.commit() + + assert tr.slug_from_query("Rookie Le flic de Los Angeles") == ( + "s.to", + "the-rookie", + ) From ec11cc806ba4ba15027fa4e7a490fc4396326c6b Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 22:33:32 +0200 Subject: [PATCH 34/45] style: run ruff formatter --- apps/api/app/utils/title_resolver.py | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/app/utils/title_resolver.py b/apps/api/app/utils/title_resolver.py index 6a233d5e..31bc4be6 100644 --- a/apps/api/app/utils/title_resolver.py +++ b/apps/api/app/utils/title_resolver.py @@ -1,7 +1,6 @@ from 
__future__ import annotations import json -import inspect import re import time from difflib import SequenceMatcher From c529047dfdc61168d96ce52c10411bdae85f6573 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 22:47:32 +0200 Subject: [PATCH 35/45] test: isolate title resolver DB fallback tests - stub catalog readiness in low-confidence overlap unit coverage - initialize schema explicitly for DB-backed alias lookup coverage - remove dependency on pre-existing local test database state --- apps/api/tests/unit/utils/title_resolver/test_sto.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/api/tests/unit/utils/title_resolver/test_sto.py b/apps/api/tests/unit/utils/title_resolver/test_sto.py index 5116e228..e059be4d 100644 --- a/apps/api/tests/unit/utils/title_resolver/test_sto.py +++ b/apps/api/tests/unit/utils/title_resolver/test_sto.py @@ -115,12 +115,14 @@ def test_slug_from_query_rejects_low_confidence_overlap(monkeypatch) -> None: ) monkeypatch.setattr(tr, "load_or_refresh_alternatives", lambda _site: {}) monkeypatch.setattr(tr, "_search_sto_slug", lambda _query: None) + monkeypatch.setattr(tr, "get_catalog_readiness_error", lambda: "not-ready") assert tr.slug_from_query("Rookie Le flic de Los Angeles") is None def test_slug_from_query_accepts_db_alias_match(monkeypatch) -> None: from app.db import ( + create_db_and_tables, engine, replace_provider_catalog_aliases, replace_provider_catalog_title, @@ -132,6 +134,7 @@ def test_slug_from_query_accepts_db_alias_match(monkeypatch) -> None: monkeypatch.setattr(tr, "load_or_refresh_alternatives", lambda _site: {}) monkeypatch.setattr(tr, "_search_sto_slug", lambda _query: None) monkeypatch.setattr(tr, "get_catalog_readiness_error", lambda: None) + create_db_and_tables() with Session(engine) as session: upsert_provider_index_status( From 65022b780a4a13dcd40883940b7982ebaa9758ee Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 22:55:46 +0200 Subject: [PATCH 36/45] test: stabilize 
torznab and title-resolver CI expectations - pin STRM_FILES_MODE in the torznab hard-cap integration test - bind title_resolver to the test database engine in alias lookup coverage - remove CI dependence on ambient module import order and env defaults --- apps/api/tests/integration/api/torznab/test_api.py | 5 ++++- apps/api/tests/unit/utils/title_resolver/test_sto.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/api/tests/integration/api/torznab/test_api.py b/apps/api/tests/integration/api/torznab/test_api.py index 946a1728..90edb438 100644 --- a/apps/api/tests/integration/api/torznab/test_api.py +++ b/apps/api/tests/integration/api/torznab/test_api.py @@ -351,7 +351,10 @@ def test_tvsearch_fast_season_mode_avoids_live_probe(client, monkeypatch) -> Non assert len(items) == 2 -def test_tvsearch_season_search_limit_is_hard_item_cap(client) -> None: +def test_tvsearch_season_search_limit_is_hard_item_cap(client, monkeypatch) -> None: + import app.api.torznab.api as torznab_api + + monkeypatch.setattr(torznab_api, "STRM_FILES_MODE", "both") _seed_ready_tv_catalog( canonical_title="Series", query_aliases=["foo"], diff --git a/apps/api/tests/unit/utils/title_resolver/test_sto.py b/apps/api/tests/unit/utils/title_resolver/test_sto.py index e059be4d..4648ab13 100644 --- a/apps/api/tests/unit/utils/title_resolver/test_sto.py +++ b/apps/api/tests/unit/utils/title_resolver/test_sto.py @@ -134,6 +134,7 @@ def test_slug_from_query_accepts_db_alias_match(monkeypatch) -> None: monkeypatch.setattr(tr, "load_or_refresh_alternatives", lambda _site: {}) monkeypatch.setattr(tr, "_search_sto_slug", lambda _query: None) monkeypatch.setattr(tr, "get_catalog_readiness_error", lambda: None) + monkeypatch.setattr(tr, "engine", engine) create_db_and_tables() with Session(engine) as session: From cb6564385df5174b8ee738152a797b7f859043c4 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 23:47:44 +0200 Subject: [PATCH 37/45] fix: harden catalog indexing 
shutdown and retry scheduling - fail closed when the title index writer cannot be stopped cleanly - avoid marking enrichment stages ready while rows are only deferred by retry windows - remove implicit commits from provider generation cleanup helpers - handle missing catalog tables in title resolver DB fallbacks --- apps/api/app/catalog/indexer.py | 125 +++++++++++++++++- .../core/downloader/provider_resolution.py | 33 +++-- apps/api/app/core/scheduler.py | 26 +++- 3 files changed, 164 insertions(+), 20 deletions(-) diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index 2366415c..b03d6138 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -3,7 +3,7 @@ from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait from dataclasses import dataclass from datetime import timedelta -from queue import Empty, Queue +from queue import Empty, Full, Queue from threading import Event, Lock, Semaphore, Thread from time import monotonic from uuid import uuid4 @@ -644,6 +644,7 @@ def on_index_loaded(total_titles: int) -> None: ) observer = CatalogCrawlObserver(on_index_loaded=on_index_loaded) + writer_shutdown_signaled = False try: rows = load_provider_title_index(provider, observer=observer) @@ -656,11 +657,15 @@ def on_index_loaded(total_titles: int) -> None: current_slug=row.slug, queue_depth=queue.qsize(), ) - queue.put(_QUEUE_SENTINEL) + self._signal_title_index_writer_shutdown(provider=provider, queue=queue) + writer_shutdown_signaled = True writer.join(timeout=30) - if writer.is_alive(): - detail = f": {writer_failure[0]}" if writer_failure else "" - raise RuntimeError(f"writer thread did not finish within 30s{detail}") + self._ensure_title_index_writer_stopped( + provider=provider, + writer=writer, + writer_failure=writer_failure, + timeout_seconds=30, + ) if writer_failure: raise RuntimeError(str(writer_failure[0])) completed_at = utcnow() @@ -684,8 +689,15 @@ def 
on_index_loaded(total_titles: int) -> None: logger.exception( "Provider catalog title index failed for {}: {}", provider, exc ) - queue.put(_QUEUE_SENTINEL) + if not writer_shutdown_signaled: + self._signal_title_index_writer_shutdown(provider=provider, queue=queue) writer.join(timeout=5) + self._ensure_title_index_writer_stopped( + provider=provider, + writer=writer, + writer_failure=writer_failure, + timeout_seconds=5, + ) completed_at = utcnow() error_text = str(exc) self._writer.run( @@ -708,6 +720,37 @@ def on_index_loaded(total_titles: int) -> None: finally: reporter.close() + def _signal_title_index_writer_shutdown( + self, + *, + provider: str, + queue: Queue[TitleRecord | object], + ) -> None: + try: + queue.put_nowait(_QUEUE_SENTINEL) + except Full as exc: + logger.error( + "Provider catalog title index writer queue is full during shutdown for {}", + provider, + ) + raise RuntimeError( + f"writer shutdown queue is full for provider {provider}" + ) from exc + + def _ensure_title_index_writer_stopped( + self, + *, + provider: str, + writer: Thread, + writer_failure: list[BaseException], + timeout_seconds: int, + ) -> None: + if writer.is_alive(): + detail = f": {writer_failure[0]}" if writer_failure else "" + raise RuntimeError( + f"writer thread did not finish within {timeout_seconds}s for {provider}{detail}" + ) + def _finish_title_index_success( self, session: Session, @@ -929,6 +972,9 @@ def _run_canonical_enrichment_stage(self, provider: str) -> None: def _run_row_stage(self, *, provider: str, stage: str, concurrency: int) -> None: refresh_interval_hours = self._refresh_interval_hours(provider) + generation = self._visible_generation(provider) + if generation is None: + return self._mark_stage_running( provider=provider, stage=stage, @@ -961,6 +1007,24 @@ def _run_row_stage(self, *, provider: str, stage: str, concurrency: int) -> None limit=max(1, concurrency * 2), ) if not due_rows: + remaining_rows = self._writer.run( + lambda session: 
self._count_remaining_stage_rows( + session, + provider=provider, + stage=stage, + generation=generation, + ) + ) + if remaining_rows: + self._writer.run( + lambda session: self._mark_stage_pending( + session, + provider=provider, + stage=stage, + refresh_interval_hours=refresh_interval_hours, + ) + ) + return completed_at = utcnow() self._writer.run( lambda session: self._mark_stage_ready( @@ -1311,6 +1375,28 @@ def _mark_stage_ready( payload["status"] = "ready" upsert_provider_index_status(session, **payload) + def _mark_stage_pending( + self, + session: Session, + *, + provider: str, + stage: str, + refresh_interval_hours: float, + ) -> None: + payload = { + "provider": provider, + "refresh_interval_hours": refresh_interval_hours, + "status": "partial", + "active_stage": None, + "last_error_summary": "", + "commit": False, + } + if stage == "detail_enrichment": + payload["detail_enrichment_status"] = "pending" + else: + payload["canonical_enrichment_status"] = "pending" + upsert_provider_index_status(session, **payload) + def _mark_stage_failed( self, session: Session, @@ -1401,6 +1487,33 @@ def _load_due_stage_rows( break return due + def _count_remaining_stage_rows( + self, + session: Session, + *, + provider: str, + stage: str, + generation: str, + ) -> int: + rows = session.exec( + select(ProviderCatalogTitle).where( + (ProviderCatalogTitle.provider == provider) + & (ProviderCatalogTitle.indexed_generation == generation) + ) + ).all() + remaining = 0 + for row in rows: + state = session.get(ProviderTitleIndexState, (provider, row.slug)) + if state is None: + state = ProviderTitleIndexState(provider=provider, slug=row.slug) + if stage == "detail_enrichment": + if state.detail_status != "ready": + remaining += 1 + continue + if state.canonical_status != "ready": + remaining += 1 + return remaining + def _load_aliases(self, *, provider: str, slug: str) -> list[str]: generation = self._visible_generation(provider) if generation is None: diff --git 
a/apps/api/app/core/downloader/provider_resolution.py b/apps/api/app/core/downloader/provider_resolution.py index 63f7675a..81956b41 100644 --- a/apps/api/app/core/downloader/provider_resolution.py +++ b/apps/api/app/core/downloader/provider_resolution.py @@ -1,7 +1,7 @@ from __future__ import annotations import re -from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError +import threading from typing import List, Optional, Tuple, TYPE_CHECKING from loguru import logger @@ -14,10 +14,6 @@ from aniworld.models import Episode _AVAIL_RE = re.compile(r"Available languages:\s*\[([^\]]*)\]", re.IGNORECASE) -_DIRECT_LINK_EXECUTOR = ThreadPoolExecutor( - max_workers=max(1, len(PROVIDER_ORDER)), - thread_name_prefix="provider-resolution", -) def _run_with_timeout( @@ -26,13 +22,26 @@ def _run_with_timeout( timeout_seconds: float, operation: str, ): - future = _DIRECT_LINK_EXECUTOR.submit(callback, *args) - try: - return future.result(timeout=max(0.001, timeout_seconds)) - except FutureTimeoutError as exc: - raise TimeoutError( - f"{operation} timed out after {timeout_seconds:.1f}s" - ) from exc + outcome: dict[str, object] = {} + + def _target() -> None: + try: + outcome["result"] = callback(*args) + except BaseException as exc: # pragma: no cover - re-raised on caller thread + outcome["error"] = exc + + worker = threading.Thread( + target=_target, + name=f"{operation}-timeout-guard", + daemon=True, + ) + worker.start() + worker.join(timeout=max(0.001, timeout_seconds)) + if worker.is_alive(): + raise TimeoutError(f"{operation} timed out after {timeout_seconds:.1f}s") + if "error" in outcome: + raise outcome["error"] # type: ignore[misc] + return outcome.get("result") def _parse_available_languages_from_error(msg: str) -> List[str]: diff --git a/apps/api/app/core/scheduler.py b/apps/api/app/core/scheduler.py index 3795b480..fcb2c203 100644 --- a/apps/api/app/core/scheduler.py +++ b/apps/api/app/core/scheduler.py @@ -29,7 +29,7 @@ # global 
executor + registry EXECUTOR: Optional[ThreadPoolExecutor] = None -RUNNING: Dict[str, Tuple[Future, threading.Event]] = {} +RUNNING: Dict[str, Tuple[Future | None, threading.Event]] = {} RUNNING_LOCK = threading.Lock() @@ -402,8 +402,24 @@ def start_scheduled_job(job_id: str, req: dict) -> None: stop_event = threading.Event() mode = str(req.get("mode") or "").strip().lower() runner = _run_strm if mode == "strm" else _run_download - fut = EXECUTOR.submit(runner, job_id, req, stop_event) with RUNNING_LOCK: + if job_id in RUNNING: + raise RuntimeError(f"job already running: {job_id}") + RUNNING[job_id] = (None, stop_event) + try: + fut = EXECUTOR.submit(runner, job_id, req, stop_event) + except Exception: + with RUNNING_LOCK: + current = RUNNING.get(job_id) + if current is not None and current[1] is stop_event: + RUNNING.pop(job_id, None) + raise + with RUNNING_LOCK: + current = RUNNING.get(job_id) + if current is None or current[1] is not stop_event: + stop_event.set() + fut.cancel() + return RUNNING[job_id] = (fut, stop_event) @@ -447,4 +463,10 @@ def cancel_job(job_id: str) -> None: return fut, ev = item ev.set() + if fut is None: + with RUNNING_LOCK: + current = RUNNING.get(job_id) + if current is not None and current[1] is ev and current[0] is None: + RUNNING.pop(job_id, None) + return fut.cancel() From 06d754c5415c84707d630ea13d786e45deed9f74 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 23:47:59 +0200 Subject: [PATCH 38/45] fix: close scheduler and task state races - pre-register running jobs before executor submit to avoid stale RUNNING entries - mark qBittorrent client tasks failed when background job startup fails - guard provider direct-link timeouts with daemon threads instead of the shared executor --- apps/api/app/api/qbittorrent/torrents.py | 14 +++++ apps/api/app/db/models.py | 2 - apps/api/app/utils/terminal.py | 15 ++++- apps/api/app/utils/title_resolver.py | 72 ++++++++++++++---------- 4 files changed, 71 insertions(+), 32 
deletions(-) diff --git a/apps/api/app/api/qbittorrent/torrents.py b/apps/api/app/api/qbittorrent/torrents.py index 306954a4..9059ebc0 100644 --- a/apps/api/app/api/qbittorrent/torrents.py +++ b/apps/api/app/api/qbittorrent/torrents.py @@ -132,6 +132,20 @@ def torrents_add( start_scheduled_job(job_id, req) except Exception as exc: logger.error("Failed to start scheduled job {}: {}", job_id, exc) + upsert_client_task( + session, + hash=btih, + name=name, + slug=slug, + season=season, + episode=episode, + language=language, + site=site, + save_path=published_savepath, + category=category, + job_id=job_id, + state="failed", + ) return PlainTextResponse("Failed to start download.", status_code=500) upsert_client_task( session, diff --git a/apps/api/app/db/models.py b/apps/api/app/db/models.py index d3c72462..91686582 100644 --- a/apps/api/app/db/models.py +++ b/apps/api/app/db/models.py @@ -1111,7 +1111,6 @@ def prune_provider_generation( & (ProviderMovieMapping.indexed_generation != keep_generation) ) ) - session.commit() def delete_provider_generation( @@ -1162,7 +1161,6 @@ def delete_provider_generation( & (ProviderMovieMapping.indexed_generation == generation) ) ) - session.commit() def _visible_generation_map( diff --git a/apps/api/app/utils/terminal.py b/apps/api/app/utils/terminal.py index ac4afded..1e751a78 100644 --- a/apps/api/app/utils/terminal.py +++ b/apps/api/app/utils/terminal.py @@ -92,7 +92,20 @@ def update(self, snap: ProgressSnapshot) -> None: self._bar.n = downloaded postfix = {} if snap.speed is not None: - postfix["Speed"] = f"{float(snap.speed) / (1024 * 1024):.2f} MB/s" + unit = self.unit + unit_power = { + "B": 0, + "KB": 1, + "KiB": 1, + "MB": 2, + "MiB": 2, + "GB": 3, + "GiB": 3, + "TB": 4, + "TiB": 4, + }.get(unit, 0) + scaled_speed = float(snap.speed) / (1024**unit_power) + postfix["Speed"] = f"{scaled_speed:.2f} {unit}/s" if snap.eta is not None: postfix["ETA"] = f"{int(snap.eta)}s" if postfix: diff --git 
a/apps/api/app/utils/title_resolver.py b/apps/api/app/utils/title_resolver.py index 31bc4be6..324846ff 100644 --- a/apps/api/app/utils/title_resolver.py +++ b/apps/api/app/utils/title_resolver.py @@ -11,6 +11,7 @@ import requests.exceptions from bs4 import BeautifulSoup # type: ignore from loguru import logger +from sqlalchemy.exc import OperationalError from sqlmodel import Session from sqlmodel import select @@ -821,20 +822,27 @@ def _search_sites(sites: List[str]) -> Optional[Tuple[str, str]]: return result readiness_error = get_catalog_readiness_error() if readiness_error is None: - with Session(engine) as session: - rows = search_indexed_provider_titles( - session, - query=q, - providers=[site], - limit=1, - ) - if rows: - candidate = rows[0] - cand_score = _score_indexed_db_candidate( - session, query=q, candidate=candidate + try: + with Session(engine) as session: + rows = search_indexed_provider_titles( + session, + query=q, + providers=[site], + limit=1, ) - if cand_score >= _MIN_TITLE_MATCH_SCORE: - return (candidate.provider, candidate.slug) + if rows: + candidate = rows[0] + cand_score = _score_indexed_db_candidate( + session, query=q, candidate=candidate + ) + if cand_score >= _MIN_TITLE_MATCH_SCORE: + return (candidate.provider, candidate.slug) + except OperationalError as exc: + logger.debug( + "Skipping indexed DB lookup for {} because catalog tables are unavailable: {}", + site, + exc, + ) return None # 2) No specific site requested: try index-based lookup across primary sites @@ -849,23 +857,29 @@ def _search_sites(sites: List[str]) -> Optional[Tuple[str, str]]: providers = list(CATALOG_SITES_LIST) preferred = [provider for provider in providers if provider != "megakino"] fallback = [provider for provider in providers if provider == "megakino"] - with Session(engine) as session: - for batch in (preferred, fallback): - if not batch: - continue - rows = search_indexed_provider_titles( - session, - query=q, - providers=batch, - limit=1, - ) - if 
rows: - candidate = rows[0] - cand_score = _score_indexed_db_candidate( - session, query=q, candidate=candidate + try: + with Session(engine) as session: + for batch in (preferred, fallback): + if not batch: + continue + rows = search_indexed_provider_titles( + session, + query=q, + providers=batch, + limit=1, ) - if cand_score >= _MIN_TITLE_MATCH_SCORE: - return (candidate.provider, candidate.slug) + if rows: + candidate = rows[0] + cand_score = _score_indexed_db_candidate( + session, query=q, candidate=candidate + ) + if cand_score >= _MIN_TITLE_MATCH_SCORE: + return (candidate.provider, candidate.slug) + except OperationalError as exc: + logger.debug( + "Skipping indexed DB lookup because catalog tables are unavailable: {}", + exc, + ) # 3) Megakino-specific direct slug/fallback handling if "megakino" in CATALOG_SITES_LIST or "megakino" in _PROVIDER_CACHE: From 45de48770208d382df4a43a954fe1d409676ddbb Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 30 Apr 2026 23:48:52 +0200 Subject: [PATCH 39/45] test: align catalog fixtures with hard-cap expectations - seed enough mapped episodes for torznab hard-cap coverage - bootstrap providerindexstatus via migrations in title resolver tests - add regressions for fast-finishing scheduler jobs and deferred stage retries --- .../tests/integration/api/torznab/test_api.py | 2 +- apps/api/tests/unit/catalog/test_indexer.py | 48 +++++++++++++++++++ .../core/scheduler/test_strm_scheduler.py | 30 ++++++++++++ .../unit/utils/title_resolver/test_sto.py | 4 +- 4 files changed, 81 insertions(+), 3 deletions(-) diff --git a/apps/api/tests/integration/api/torznab/test_api.py b/apps/api/tests/integration/api/torznab/test_api.py index 90edb438..388ea21d 100644 --- a/apps/api/tests/integration/api/torznab/test_api.py +++ b/apps/api/tests/integration/api/torznab/test_api.py @@ -358,7 +358,7 @@ def test_tvsearch_season_search_limit_is_hard_item_cap(client, monkeypatch) -> N _seed_ready_tv_catalog( canonical_title="Series", 
query_aliases=["foo"], - episode_mappings=[(1, 1, 1, 1), (1, 2, 1, 2)], + episode_mappings=[(1, 1, 1, 1), (1, 2, 1, 2), (1, 3, 1, 3)], ) resp = client.get( diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py index 85595d6f..3b71e7b6 100644 --- a/apps/api/tests/unit/catalog/test_indexer.py +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -473,3 +473,51 @@ def test_detail_stage_persists_one_title_incrementally(client): assert len(episodes) == 1 assert state is not None assert state.detail_status == "ready" + + +def test_run_row_stage_does_not_mark_ready_when_only_future_retries_remain( + monkeypatch, +): + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + events: list[str] = [] + + monkeypatch.setattr(indexer, "_refresh_interval_hours", lambda provider: 24.0) + monkeypatch.setattr(indexer, "_visible_generation", lambda provider: "gen-1") + monkeypatch.setattr(indexer, "_count_visible_titles", lambda provider: 1) + monkeypatch.setattr(indexer, "_load_due_stage_rows", lambda **kwargs: []) + monkeypatch.setattr( + indexer, + "_count_remaining_stage_rows", + lambda session, **kwargs: 1, + ) + monkeypatch.setattr(indexer, "_set_progress", lambda *args, **kwargs: None) + monkeypatch.setattr( + indexer._writer, + "run", + lambda callback: callback(object()), + ) + monkeypatch.setattr( + indexer, + "_mark_stage_running", + lambda **kwargs: events.append("running"), + ) + monkeypatch.setattr( + indexer, + "_mark_stage_pending", + lambda session, **kwargs: events.append("pending"), + ) + monkeypatch.setattr( + indexer, + "_mark_stage_ready", + lambda *args, **kwargs: events.append("ready"), + ) + + indexer._run_row_stage( + provider="aniworld.to", + stage="detail_enrichment", + concurrency=1, + ) + + assert events == ["running", "pending"] diff --git a/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py b/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py index 
57e70e20..aef8ea81 100644 --- a/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py +++ b/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py @@ -1,6 +1,7 @@ import errno import threading import time +from concurrent.futures import Future from pathlib import Path from sqlmodel import Session @@ -323,3 +324,32 @@ def close(self): time.sleep(0.01) assert writes == [] + + +def test_start_scheduled_job_cleans_up_fast_finishing_runner(tmp_path, monkeypatch): + scheduler = _setup_scheduler(tmp_path, monkeypatch, strm_proxy_mode="direct") + + class ImmediateExecutor: + def submit(self, runner, job_id, req, stop_event): + fut = Future() + runner(job_id, req, stop_event) + fut.set_result(None) + return fut + + def fake_runner(job_id, req, stop_event): + del req, stop_event + with scheduler.RUNNING_LOCK: + assert job_id in scheduler.RUNNING + scheduler.RUNNING.pop(job_id, None) + + with scheduler.RUNNING_LOCK: + scheduler.RUNNING.clear() + + monkeypatch.setattr(scheduler, "init_executor", lambda: None) + monkeypatch.setattr(scheduler, "EXECUTOR", ImmediateExecutor()) + monkeypatch.setattr(scheduler, "_run_download", fake_runner) + + scheduler.start_scheduled_job("job-fast", {}) + + with scheduler.RUNNING_LOCK: + assert "job-fast" not in scheduler.RUNNING diff --git a/apps/api/tests/unit/utils/title_resolver/test_sto.py b/apps/api/tests/unit/utils/title_resolver/test_sto.py index 4648ab13..3d32d39f 100644 --- a/apps/api/tests/unit/utils/title_resolver/test_sto.py +++ b/apps/api/tests/unit/utils/title_resolver/test_sto.py @@ -122,7 +122,7 @@ def test_slug_from_query_rejects_low_confidence_overlap(monkeypatch) -> None: def test_slug_from_query_accepts_db_alias_match(monkeypatch) -> None: from app.db import ( - create_db_and_tables, + apply_migrations, engine, replace_provider_catalog_aliases, replace_provider_catalog_title, @@ -135,7 +135,7 @@ def test_slug_from_query_accepts_db_alias_match(monkeypatch) -> None: monkeypatch.setattr(tr, "_search_sto_slug", lambda 
_query: None) monkeypatch.setattr(tr, "get_catalog_readiness_error", lambda: None) monkeypatch.setattr(tr, "engine", engine) - create_db_and_tables() + apply_migrations() with Session(engine) as session: upsert_provider_index_status( From 4d90199e498d083c11024dba1f8a2352bb34563c Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Fri, 1 May 2026 20:51:26 +0200 Subject: [PATCH 40/45] fix: harden catalog indexing and qBittorrent state handling Isolate timed-out provider detail crawls so stuck tasks cannot starve later title enrichment work. Preserve queued torrent state semantics in the qBittorrent shim, dedupe duplicate episode-language inserts, and ensure title-index failures are still persisted even when writer shutdown also errors. Also block title refresh while enrichment rows are still unfinished and treat OperationalError during indexed title lookups as a catalog-not-ready fallback instead of bubbling the exception. --- apps/api/app/api/qbittorrent/common.py | 21 +++++ apps/api/app/api/qbittorrent/sync.py | 14 ++-- apps/api/app/api/qbittorrent/torrents.py | 13 +-- apps/api/app/catalog/indexer.py | 82 ++++++++++++++++--- apps/api/app/catalog/providers.py | 16 ++-- apps/api/app/db/models.py | 33 ++++++-- apps/api/app/utils/title_resolver.py | 25 +++++- .../integration/api/qbittorrent/test_more.py | 24 ++++++ .../api/qbittorrent/test_torrents.py | 24 ++++++ apps/api/tests/unit/catalog/test_indexer.py | 79 ++++++++++++++++++ apps/api/tests/unit/db/test_models.py | 38 +++++++++ .../utils/title_resolver/test_file_index.py | 41 ++++++++++ 12 files changed, 362 insertions(+), 48 deletions(-) diff --git a/apps/api/app/api/qbittorrent/common.py b/apps/api/app/api/qbittorrent/common.py index ea989e10..cfd4bdd3 100644 --- a/apps/api/app/api/qbittorrent/common.py +++ b/apps/api/app/api/qbittorrent/common.py @@ -15,6 +15,27 @@ def public_save_path() -> str: return QBIT_PUBLIC_SAVE_PATH or str(DOWNLOAD_DIR) +def coerce_torrent_state(*, stored_state: str | None, job_status: str | 
None) -> str: + """Map persisted task state and scheduler status to qBittorrent states.""" + if job_status == "completed": + return "uploading" + if job_status == "failed": + return "error" + if job_status == "cancelled": + return "pausedDL" + + normalized = (stored_state or "").strip().lower() + if normalized == "queued": + return "queuedDL" + if normalized == "paused": + return "pausedDL" + if normalized == "completed": + return "uploading" + if normalized == "error" or normalized == "failed": + return "error" + return "downloading" + + # Categories map compatible with qBittorrent format CATEGORIES: Dict[str, dict] = { "prowlarr": { diff --git a/apps/api/app/api/qbittorrent/sync.py b/apps/api/app/api/qbittorrent/sync.py index c92bc97f..df8a4706 100644 --- a/apps/api/app/api/qbittorrent/sync.py +++ b/apps/api/app/api/qbittorrent/sync.py @@ -8,7 +8,7 @@ from app.db import get_session, get_job from . import router -from .common import CATEGORIES, public_save_path +from .common import CATEGORIES, coerce_torrent_state, public_save_path from app.config import DOWNLOAD_DIR, QBIT_PUBLIC_SAVE_PATH @@ -27,14 +27,10 @@ def sync_maindata(session: Session = Depends(get_session)): for r in rows: job = get_job(session, r.job_id) if r.job_id else None progress = (job.progress or 0.0) / 100.0 if job else 0.0 - state = "downloading" - if job: - if job.status == "completed": - state = "uploading" - elif job.status == "failed": - state = "error" - elif job.status == "cancelled": - state = "pausedDL" + state = coerce_torrent_state( + stored_state=r.state, + job_status=job.status if job else None, + ) size_val = int(job.total_bytes or 0) if job else 0 save_path_val = ( diff --git a/apps/api/app/api/qbittorrent/torrents.py b/apps/api/app/api/qbittorrent/torrents.py index 9059ebc0..fc9f9f5c 100644 --- a/apps/api/app/api/qbittorrent/torrents.py +++ b/apps/api/app/api/qbittorrent/torrents.py @@ -23,7 +23,7 @@ from app.core.scheduler import cancel_job, schedule_download, 
start_scheduled_job from . import router -from .common import public_save_path +from .common import coerce_torrent_state, public_save_path @router.post("/torrents/add") @@ -184,7 +184,9 @@ def torrents_info( if category and (r.category or "") != category: continue job = get_job(session, r.job_id) if r.job_id else None - state = r.state + state = coerce_torrent_state( + stored_state=r.state, job_status=job.status if job else None + ) progress = 0.0 dlspeed = 0 eta = 0 @@ -197,19 +199,12 @@ def torrents_info( f"Job {job.id}: status={job.status}, progress={progress}, speed={dlspeed}, eta={eta}" ) if job.status == "completed": - state = "uploading" dlspeed = 0 if job.result_path and os.path.exists(job.result_path): try: size = int(os.path.getsize(job.result_path)) except Exception: pass - elif job.status == "failed": - state = "error" - elif job.status == "cancelled": - state = "pausedDL" - else: - state = "downloading" content_path = None save_path_val = r.save_path or (QBIT_PUBLIC_SAVE_PATH or str(DOWNLOAD_DIR)) diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index b03d6138..9de72e1b 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -545,12 +545,14 @@ def _pick_due_stage(self, status: ProviderIndexStatus) -> str | None: retry_after=getattr(status, "detail_next_retry_after", None), ): return "detail_enrichment" + return None if self._canonical_stage_has_due_work(status.provider): if self._stage_due( stage_status=getattr(status, "canonical_enrichment_status", "pending"), retry_after=getattr(status, "canonical_next_retry_after", None), ): return "canonical_enrichment" + return None if self._stage_due( stage_status=title_index_status, retry_after=getattr(status, "title_index_next_retry_after", None), @@ -689,17 +691,16 @@ def on_index_loaded(total_titles: int) -> None: logger.exception( "Provider catalog title index failed for {}: {}", provider, exc ) - if not writer_shutdown_signaled: - 
self._signal_title_index_writer_shutdown(provider=provider, queue=queue) - writer.join(timeout=5) - self._ensure_title_index_writer_stopped( + error_text = self._shutdown_title_index_writer( provider=provider, + queue=queue, + writer_shutdown_signaled=writer_shutdown_signaled, writer=writer, writer_failure=writer_failure, timeout_seconds=5, + error_text=str(exc), ) completed_at = utcnow() - error_text = str(exc) self._writer.run( lambda session, error=error_text: self._finish_title_index_failure( session, @@ -751,6 +752,42 @@ def _ensure_title_index_writer_stopped( f"writer thread did not finish within {timeout_seconds}s for {provider}{detail}" ) + def _shutdown_title_index_writer( + self, + *, + provider: str, + queue: Queue[TitleRecord | object], + writer_shutdown_signaled: bool, + writer: Thread, + writer_failure: list[BaseException], + timeout_seconds: int, + error_text: str, + ) -> str: + shutdown_errors: list[str] = [] + + if not writer_shutdown_signaled: + try: + self._signal_title_index_writer_shutdown(provider=provider, queue=queue) + except Exception as exc: + shutdown_errors.append(f"writer shutdown signal failed: {exc}") + try: + writer.join(timeout=timeout_seconds) + except Exception as exc: + shutdown_errors.append(f"writer join failed: {exc}") + try: + self._ensure_title_index_writer_stopped( + provider=provider, + writer=writer, + writer_failure=writer_failure, + timeout_seconds=timeout_seconds, + ) + except Exception as exc: + shutdown_errors.append(str(exc)) + + if not shutdown_errors: + return error_text + return " | ".join([error_text, *shutdown_errors]) + def _finish_title_index_success( self, session: Session, @@ -1426,19 +1463,38 @@ def _mark_stage_failed( upsert_provider_index_status(session, **payload) def _detail_stage_has_due_work(self, provider: str) -> bool: - return bool( - self._load_due_stage_rows( - provider=provider, stage="detail_enrichment", limit=1 - ) + return self._has_unfinished_stage_rows( + provider=provider, 
stage="detail_enrichment" ) def _canonical_stage_has_due_work(self, provider: str) -> bool: - return bool( - self._load_due_stage_rows( - provider=provider, stage="canonical_enrichment", limit=1 - ) + return self._has_unfinished_stage_rows( + provider=provider, stage="canonical_enrichment" ) + def _has_unfinished_stage_rows(self, *, provider: str, stage: str) -> bool: + generation = self._visible_generation(provider) + if generation is None: + return False + with Session(engine) as session: + rows = session.exec( + select(ProviderCatalogTitle).where( + (ProviderCatalogTitle.provider == provider) + & (ProviderCatalogTitle.indexed_generation == generation) + ) + ).all() + for row in rows: + state = session.get(ProviderTitleIndexState, (provider, row.slug)) + if state is None: + return True + if stage == "detail_enrichment": + if state.detail_status != "ready": + return True + continue + if state.canonical_status != "ready": + return True + return False + def _load_due_stage_rows( self, *, diff --git a/apps/api/app/catalog/providers.py b/apps/api/app/catalog/providers.py index 0940fc79..0e1cb93d 100644 --- a/apps/api/app/catalog/providers.py +++ b/apps/api/app/catalog/providers.py @@ -21,11 +21,6 @@ from app.utils.domain_resolver import get_megakino_base_url from app.utils.http_client import get as http_get -_TITLE_CRAWL_EXECUTOR = ThreadPoolExecutor( - max_workers=4, - thread_name_prefix="provider-title-crawl", -) - @dataclass(slots=True) class EpisodeLanguageRecord: @@ -100,12 +95,21 @@ def _run_with_timeout( submit_kwargs = dict(kwargs) if accepts_cancel_event: submit_kwargs["cancel_event"] = cancel_event - future = _TITLE_CRAWL_EXECUTOR.submit(func, *args, **submit_kwargs) + executor = ThreadPoolExecutor( + max_workers=1, + thread_name_prefix="provider-title-crawl", + ) + future = executor.submit(func, *args, **submit_kwargs) + timed_out = False try: return future.result(timeout=max(0.001, timeout_seconds)) except FutureTimeoutError as exc: + timed_out = True 
cancel_event.set() + future.cancel() raise TimeoutError(f"title crawl exceeded {int(timeout_seconds)}s") from exc + finally: + executor.shutdown(wait=not timed_out, cancel_futures=timed_out) def _normalize_provider_data(raw: Any, *, site: str) -> list[EpisodeLanguageRecord]: diff --git a/apps/api/app/db/models.py b/apps/api/app/db/models.py index 91686582..6c3f00d7 100644 --- a/apps/api/app/db/models.py +++ b/apps/api/app/db/models.py @@ -1030,12 +1030,14 @@ def replace_provider_catalog_episodes( ) ) for item in episodes: + season_number = int(item["season"]) + episode_number = int(item["episode"]) session.add( ProviderCatalogEpisode( provider=provider, slug=slug, - season=int(item["season"]), - episode=int(item["episode"]), + season=season_number, + episode=episode_number, indexed_generation=indexed_generation, title_primary=item.get("title_primary"), title_secondary=item.get("title_secondary"), @@ -1044,20 +1046,37 @@ def replace_provider_catalog_episodes( last_indexed_at=utcnow(), ) ) + deduped_languages: dict[str, dict[str, Any]] = {} for language_payload in item.get("languages", []): language = str(language_payload.get("language") or "").strip() if not language: continue + normalized_language = normalize_catalog_text(language) + key = normalized_language or language + bucket = deduped_languages.setdefault( + key, + { + "language": language, + "normalized_language": normalized_language, + "host_hints": set(), + }, + ) + bucket["host_hints"].update( + str(host_hint).strip() + for host_hint in language_payload.get("host_hints") or [] + if str(host_hint).strip() + ) + for payload in deduped_languages.values(): session.add( ProviderEpisodeLanguage( provider=provider, slug=slug, - season=int(item["season"]), - episode=int(item["episode"]), - language=language, + season=season_number, + episode=episode_number, + language=str(payload["language"]), indexed_generation=indexed_generation, - normalized_language=normalize_catalog_text(language), - 
host_hints=list(language_payload.get("host_hints") or []), + normalized_language=str(payload["normalized_language"]), + host_hints=sorted(payload["host_hints"]), last_indexed_at=utcnow(), ) ) diff --git a/apps/api/app/utils/title_resolver.py b/apps/api/app/utils/title_resolver.py index 324846ff..3b8b76af 100644 --- a/apps/api/app/utils/title_resolver.py +++ b/apps/api/app/utils/title_resolver.py @@ -501,8 +501,17 @@ def resolve_series_title( if not slug: logger.warning("No slug provided to resolve_series_title.") return None - with Session(engine) as session: - title = resolve_indexed_title(session, provider=site, slug=slug) + try: + with Session(engine) as session: + title = resolve_indexed_title(session, provider=site, slug=slug) + except OperationalError as exc: + logger.warning( + "Indexed title lookup unavailable for slug '{}' on {}: {}", + slug, + site, + exc, + ) + else: if title: logger.info(f"Resolved title for slug '{slug}' on {site}: {title}") return title @@ -539,8 +548,16 @@ def load_or_refresh_alternatives(site: str = "aniworld.to") -> Dict[str, List[st global _cached_alts readiness_error = get_catalog_readiness_error() if readiness_error is None: - with Session(engine) as session: - rows = list_indexed_titles_for_provider(session, provider=site) + try: + with Session(engine) as session: + rows = list_indexed_titles_for_provider(session, provider=site) + except OperationalError as exc: + logger.warning( + "Indexed alternatives lookup unavailable for {}: {}", + site, + exc, + ) + else: if rows: # The indexed request path no longer needs a full alternatives dump. # Keep a minimal compatibility shape for older helper call sites. 
diff --git a/apps/api/tests/integration/api/qbittorrent/test_more.py b/apps/api/tests/integration/api/qbittorrent/test_more.py index 81e113eb..fae5fb1f 100644 --- a/apps/api/tests/integration/api/qbittorrent/test_more.py +++ b/apps/api/tests/integration/api/qbittorrent/test_more.py @@ -88,3 +88,27 @@ def test_sync_maindata_state_mapping(client): assert ts["h1"]["state"] == "uploading" assert ts["h2"]["state"] == "error" assert ts["h3"]["state"] == "pausedDL" + + +def test_sync_maindata_preserves_queued_state(client): + from sqlmodel import Session + from app.db import engine, create_job, upsert_client_task + + with Session(engine) as s: + job = create_job(s) + upsert_client_task( + s, + hash="queued-hash", + name="Queued", + slug="slug", + season=1, + episode=4, + language="German Dub", + save_path=None, + category=None, + job_id=job.id, + state="queued", + ) + + data = client.get("/api/v2/sync/maindata").json() + assert data["torrents"]["queued-hash"]["state"] == "queuedDL" diff --git a/apps/api/tests/integration/api/qbittorrent/test_torrents.py b/apps/api/tests/integration/api/qbittorrent/test_torrents.py index 4afad448..0a4a05dc 100644 --- a/apps/api/tests/integration/api/qbittorrent/test_torrents.py +++ b/apps/api/tests/integration/api/qbittorrent/test_torrents.py @@ -108,6 +108,27 @@ def _schedule_download(req, *, autostart=True): assert calls == [("schedule", "deferred"), ("start", "job-1")] +def test_torrents_info_preserves_queued_state_for_paused_add(client): + from app.utils.magnet import build_magnet + + magnet = build_magnet( + title="Queued Title", + slug="queued-title", + season=1, + episode=1, + language="German Dub", + ) + + response = client.post( + "/api/v2/torrents/add", data={"urls": magnet, "paused": "true"} + ) + + assert response.status_code == 200 + + info = client.get("/api/v2/torrents/info").json() + assert info[0]["state"] == "queuedDL" + + def test_torrents_add_returns_500_when_start_fails(client, monkeypatch): from app.utils.magnet 
import build_magnet import app.api.qbittorrent.torrents as qb_torrents @@ -128,3 +149,6 @@ def test_torrents_add_returns_500_when_start_fails(client, monkeypatch): response = client.post("/api/v2/torrents/add", data={"urls": magnet}) assert response.status_code == 500 + + info = client.get("/api/v2/torrents/info").json() + assert info[0]["state"] == "error" diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py index 3b71e7b6..27fb5016 100644 --- a/apps/api/tests/unit/catalog/test_indexer.py +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -246,6 +246,85 @@ def test_pick_due_stage_prefers_detail_then_canonical(monkeypatch): assert indexer._pick_due_stage(status) == "detail_enrichment" +def test_pick_due_stage_blocks_title_refresh_while_detail_retry_is_backed_off( + monkeypatch, +): + from datetime import timedelta + + from app.catalog.indexer import ProviderCatalogIndexer + from app.db import utcnow + + indexer = ProviderCatalogIndexer() + monkeypatch.setattr(indexer, "_detail_stage_has_due_work", lambda provider: True) + monkeypatch.setattr( + indexer, "_canonical_stage_has_due_work", lambda provider: False + ) + + status = SimpleNamespace( + provider="aniworld.to", + status="partial", + latest_success_generation="gen-1", + title_index_status="ready", + detail_enrichment_status="failed", + detail_next_retry_after=utcnow() + timedelta(hours=1), + canonical_enrichment_status="pending", + canonical_next_retry_after=None, + next_refresh_after=None, + title_index_next_retry_after=None, + ) + + assert indexer._pick_due_stage(status) is None + + +def test_title_index_failure_persists_even_when_writer_shutdown_raises(monkeypatch): + import app.catalog.indexer as indexer_module + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + recorded_errors: list[str] = [] + + monkeypatch.setattr( + indexer_module, + "load_provider_title_index", + lambda provider, observer=None: (_ for _ 
in ()).throw(RuntimeError("boom")), + ) + monkeypatch.setattr(indexer, "_refresh_interval_hours", lambda provider: 24.0) + monkeypatch.setattr( + indexer, + "_signal_title_index_writer_shutdown", + lambda **kwargs: (_ for _ in ()).throw(RuntimeError("queue full")), + ) + monkeypatch.setattr( + indexer, + "_ensure_title_index_writer_stopped", + lambda **kwargs: (_ for _ in ()).throw(RuntimeError("writer still alive")), + ) + monkeypatch.setattr(indexer, "_set_progress", lambda *args, **kwargs: None) + + def fake_writer_run(callback): + session = object() + return callback(session) + + monkeypatch.setattr(indexer._writer, "run", fake_writer_run) + monkeypatch.setattr( + indexer, + "_finish_title_index_failure", + lambda session, **kwargs: recorded_errors.append(kwargs["error"]), + ) + monkeypatch.setattr( + indexer_module, + "upsert_provider_index_status", + lambda session, **kwargs: None, + ) + + indexer._run_title_index_stage("aniworld.to") + + assert recorded_errors + assert "boom" in recorded_errors[0] + assert "queue full" in recorded_errors[0] + assert "writer still alive" in recorded_errors[0] + + def test_progress_snapshot_exposes_staged_readiness(client): from app.catalog.indexer import get_catalog_indexer from app.db import engine, upsert_provider_index_status diff --git a/apps/api/tests/unit/db/test_models.py b/apps/api/tests/unit/db/test_models.py index 0f162b4f..c5246809 100644 --- a/apps/api/tests/unit/db/test_models.py +++ b/apps/api/tests/unit/db/test_models.py @@ -238,3 +238,41 @@ def test_replace_provider_catalog_title_keeps_live_generation_intact(client): ("demo-show", "gen-live", "Demo Show"), ("demo-show", "gen-staged", "Demo Show Updated"), } + + +def test_replace_provider_catalog_episodes_dedupes_languages_and_host_hints(client): + from sqlmodel import Session, select + from app.db import ( + ProviderEpisodeLanguage, + engine, + replace_provider_catalog_episodes, + ) + + with Session(engine) as s: + replace_provider_catalog_episodes( + s, + 
provider="aniworld.to", + slug="demo-show", + indexed_generation="gen-1", + episodes=[ + { + "season": 1, + "episode": 1, + "relative_path": "/anime/stream/demo-show/staffel-1/episode-1", + "title_primary": "Pilot", + "title_secondary": None, + "media_type_hint": "episode", + "languages": [ + {"language": "German Dub", "host_hints": ["VOE", "VOE"]}, + {"language": "German Dub", "host_hints": ["Filemoon"]}, + ], + } + ], + ) + s.commit() + + rows = s.exec(select(ProviderEpisodeLanguage)).all() + + assert len(rows) == 1 + assert rows[0].language == "German Dub" + assert rows[0].host_hints == ["Filemoon", "VOE"] diff --git a/apps/api/tests/unit/utils/title_resolver/test_file_index.py b/apps/api/tests/unit/utils/title_resolver/test_file_index.py index 804ca7c5..171a04ca 100644 --- a/apps/api/tests/unit/utils/title_resolver/test_file_index.py +++ b/apps/api/tests/unit/utils/title_resolver/test_file_index.py @@ -21,3 +21,44 @@ def test_load_index_from_file(tmp_path, monkeypatch): assert resolve_series_title("slug-one") == "Title One" assert resolve_series_title(None) is None + + +def test_resolve_series_title_falls_back_when_index_db_is_unavailable(monkeypatch): + import app.utils.title_resolver as tr + from sqlalchemy.exc import OperationalError + + monkeypatch.setattr( + tr, + "Session", + lambda _engine: (_ for _ in ()).throw( + OperationalError("stmt", {}, Exception("db down")) + ), + ) + monkeypatch.setattr( + tr, + "load_or_refresh_index", + lambda site="aniworld.to": {"slug-one": "Title One"}, + ) + + assert tr.resolve_series_title("slug-one") == "Title One" + + +def test_load_alternatives_falls_back_when_index_db_is_unavailable(monkeypatch): + import app.utils.title_resolver as tr + from sqlalchemy.exc import OperationalError + + monkeypatch.setattr(tr, "get_catalog_readiness_error", lambda: None) + monkeypatch.setattr( + tr, + "Session", + lambda _engine: (_ for _ in ()).throw( + OperationalError("stmt", {}, Exception("db down")) + ), + ) + monkeypatch.setattr( 
+ tr, "_cached_alts", {"aniworld.to": {"slug-one": ["Title One"]}} + ) + monkeypatch.setattr(tr, "_get_site_cfg", lambda site: {}) + monkeypatch.setattr(tr, "_should_refresh", lambda *args, **kwargs: False) + + assert tr.load_or_refresh_alternatives() == {"slug-one": ["Title One"]} From 5f8ac4ebbf80bf6038b3286d7830893c4fd75b26 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Fri, 22 May 2026 19:01:23 +0200 Subject: [PATCH 41/45] fix(catalog): prevent ambiguous usable provider mappings Collapse ambiguous provider episode title matches into a single conflict mapping, filter indexed title generations in SQL, and allow site-scoped DB title lookup before all providers are bootstrapped. --- apps/api/app/catalog/providers.py | 31 ++++++++++++-------- apps/api/app/db/models.py | 27 ++++++++++++------ apps/api/app/utils/title_resolver.py | 42 +++++++++++++--------------- 3 files changed, 57 insertions(+), 43 deletions(-) diff --git a/apps/api/app/catalog/providers.py b/apps/api/app/catalog/providers.py index 0e1cb93d..270e5741 100644 --- a/apps/api/app/catalog/providers.py +++ b/apps/api/app/catalog/providers.py @@ -430,19 +430,26 @@ def _build_tv_canonical_payload( candidate for score, candidate in scored if score >= top_score - 0.05 ] confidence = "high_confidence" if top_score >= 0.85 else "low_confidence" - for candidate in plausible: - episode_mappings.append( - { - "provider_season": provider_episode.season, - "provider_episode": provider_episode.episode, - "tvdb_id": match.tvdb_id, - "canonical_season": int(candidate["season"]), - "canonical_episode": int(candidate["episode"]), - "confidence": confidence, - "source": "title_match", - "rationale": f"title score={top_score:.2f}", - } + candidate = plausible[0] + rationale = f"title score={top_score:.2f}" + if len(plausible) > 1: + confidence = "conflict" + rationale = ( + f"ambiguous title matches; top score={top_score:.2f}; " + f"{len(plausible)} plausible candidates" ) + episode_mappings.append( + { + 
"provider_season": provider_episode.season, + "provider_episode": provider_episode.episode, + "tvdb_id": match.tvdb_id, + "canonical_season": int(candidate["season"]), + "canonical_episode": int(candidate["episode"]), + "confidence": confidence, + "source": "title_match", + "rationale": rationale, + } + ) return CanonicalPayload( series=series_payload, diff --git a/apps/api/app/db/models.py b/apps/api/app/db/models.py index 6c3f00d7..8ceed917 100644 --- a/apps/api/app/db/models.py +++ b/apps/api/app/db/models.py @@ -1452,24 +1452,33 @@ def search_indexed_provider_titles( visible_generations = _visible_generation_map(session, providers=providers) if not visible_generations: return [] + visible_generation_pairs = list(visible_generations.items()) stmt = select(ProviderCatalogTitle).where( - ProviderCatalogTitle.provider.in_(providers) + (ProviderCatalogTitle.provider.in_(providers)) + & ( + tuple_( + ProviderCatalogTitle.provider, + ProviderCatalogTitle.indexed_generation, + ).in_(visible_generation_pairs) + ) ) if media_type_hint is not None: stmt = stmt.where(ProviderCatalogTitle.media_type_hint == media_type_hint) - rows = [ - row - for row in session.exec(stmt).all() - if visible_generations.get(row.provider) == row.indexed_generation - ] + rows = session.exec(stmt).all() alias_rows = session.exec( - select(ProviderCatalogAlias).where(ProviderCatalogAlias.provider.in_(providers)) + select(ProviderCatalogAlias).where( + (ProviderCatalogAlias.provider.in_(providers)) + & ( + tuple_( + ProviderCatalogAlias.provider, + ProviderCatalogAlias.indexed_generation, + ).in_(visible_generation_pairs) + ) + ) ).all() aliases_by_key: dict[tuple[str, str, str], list[str]] = {} for alias in alias_rows: - if visible_generations.get(alias.provider) != alias.indexed_generation: - continue aliases_by_key.setdefault( (alias.provider, alias.slug, alias.indexed_generation), [] ).append(alias.normalized_alias) diff --git a/apps/api/app/utils/title_resolver.py 
b/apps/api/app/utils/title_resolver.py index 3b8b76af..14ab4f5c 100644 --- a/apps/api/app/utils/title_resolver.py +++ b/apps/api/app/utils/title_resolver.py @@ -837,29 +837,27 @@ def _search_sites(sites: List[str]) -> Optional[Tuple[str, str]]: result = _search_sites([site]) if result: return result - readiness_error = get_catalog_readiness_error() - if readiness_error is None: - try: - with Session(engine) as session: - rows = search_indexed_provider_titles( - session, - query=q, - providers=[site], - limit=1, - ) - if rows: - candidate = rows[0] - cand_score = _score_indexed_db_candidate( - session, query=q, candidate=candidate - ) - if cand_score >= _MIN_TITLE_MATCH_SCORE: - return (candidate.provider, candidate.slug) - except OperationalError as exc: - logger.debug( - "Skipping indexed DB lookup for {} because catalog tables are unavailable: {}", - site, - exc, + try: + with Session(engine) as session: + rows = search_indexed_provider_titles( + session, + query=q, + providers=[site], + limit=1, ) + if rows: + candidate = rows[0] + cand_score = _score_indexed_db_candidate( + session, query=q, candidate=candidate + ) + if cand_score >= _MIN_TITLE_MATCH_SCORE: + return (candidate.provider, candidate.slug) + except OperationalError as exc: + logger.debug( + "Skipping indexed DB lookup for {} because catalog tables are unavailable: {}", + site, + exc, + ) return None # 2) No specific site requested: try index-based lookup across primary sites From 3674446ef06d2e6992a73ab2e3a34b7d872bd04e Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Fri, 22 May 2026 19:09:03 +0200 Subject: [PATCH 42/45] fix(api): prevent stale workers from racing provider fallback Track timed-out direct-link lookup workers per episode and skip further provider fallback attempts while the original lookup is still running. Clear the catalog indexer stop flag before starting a new scheduler thread so reused interpreter lifecycles can restart indexing. 
--- apps/api/app/catalog/indexer.py | 1 + .../core/downloader/provider_resolution.py | 59 ++++++++++++++++++- apps/api/tests/unit/catalog/test_indexer.py | 17 ++++++ .../downloader/test_provider_resolution.py | 56 ++++++++++++++++++ 4 files changed, 132 insertions(+), 1 deletion(-) diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index 9de72e1b..14e3697c 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -192,6 +192,7 @@ def start(self) -> None: return if self._thread is not None and self._thread.is_alive(): return + self._stop_event.clear() self._thread = Thread( target=self._run_loop, name="provider-catalog-indexer", diff --git a/apps/api/app/core/downloader/provider_resolution.py b/apps/api/app/core/downloader/provider_resolution.py index 81956b41..9d0bfebf 100644 --- a/apps/api/app/core/downloader/provider_resolution.py +++ b/apps/api/app/core/downloader/provider_resolution.py @@ -2,6 +2,7 @@ import re import threading +import weakref from typing import List, Optional, Tuple, TYPE_CHECKING from loguru import logger @@ -14,6 +15,51 @@ from aniworld.models import Episode _AVAIL_RE = re.compile(r"Available languages:\s*\[([^\]]*)\]", re.IGNORECASE) +_DIRECT_LINK_TIMEOUT_LOCK = threading.Lock() +_DIRECT_LINK_TIMED_OUT_WORKERS: weakref.WeakKeyDictionary[ + object, list[threading.Thread] +] = weakref.WeakKeyDictionary() +_DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID: dict[int, list[threading.Thread]] = {} + + +class DirectLinkTimeoutError(TimeoutError): + def __init__(self, message: str, worker: threading.Thread) -> None: + super().__init__(message) + self.worker = worker + + +def _episode_has_active_timed_out_worker(ep: object) -> bool: + with _DIRECT_LINK_TIMEOUT_LOCK: + try: + workers = _DIRECT_LINK_TIMED_OUT_WORKERS.get(ep, []) + except TypeError: + workers = _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID.get(id(ep), []) + active_workers = [worker for worker in workers if worker.is_alive()] + if active_workers: + try: 
+ _DIRECT_LINK_TIMED_OUT_WORKERS[ep] = active_workers + except TypeError: + _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID[id(ep)] = active_workers + return True + try: + _DIRECT_LINK_TIMED_OUT_WORKERS.pop(ep, None) + except TypeError: + _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID.pop(id(ep), None) + return False + + +def _track_timed_out_worker(ep: object, worker: threading.Thread) -> None: + with _DIRECT_LINK_TIMEOUT_LOCK: + try: + existing_workers = _DIRECT_LINK_TIMED_OUT_WORKERS.get(ep, []) + except TypeError: + existing_workers = _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID.get(id(ep), []) + workers = [existing for existing in existing_workers if existing.is_alive()] + workers.append(worker) + try: + _DIRECT_LINK_TIMED_OUT_WORKERS[ep] = workers + except TypeError: + _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID[id(ep)] = workers def _run_with_timeout( @@ -38,7 +84,10 @@ def _target() -> None: worker.start() worker.join(timeout=max(0.001, timeout_seconds)) if worker.is_alive(): - raise TimeoutError(f"{operation} timed out after {timeout_seconds:.1f}s") + raise DirectLinkTimeoutError( + f"{operation} timed out after {timeout_seconds:.1f}s", + worker, + ) if "error" in outcome: raise outcome["error"] # type: ignore[misc] return outcome.get("result") @@ -87,6 +136,12 @@ def _try_get_direct(ep: Episode, provider_name: str, language: str) -> Optional[ LanguageUnavailableError: If the provider reports the requested language is not offered; the exception contains the list of available languages. 
""" language = normalize_language(language) + if _episode_has_active_timed_out_worker(ep): + logger.warning( + "Skipping provider '{}' because a timed-out direct-link lookup is still running for this episode.", + provider_name, + ) + return None logger.info("Trying provider '{}' for language '{}'", provider_name, language) try: url = _run_with_timeout( @@ -104,6 +159,8 @@ def _try_get_direct(ep: Episode, provider_name: str, language: str) -> Optional[ logger.warning("Provider '{}' returned no URL.", provider_name) except Exception as exc: msg = str(exc) + if isinstance(exc, DirectLinkTimeoutError): + _track_timed_out_worker(ep, exc.worker) if "No provider found for language" in msg: available = _parse_available_languages_from_error(msg) logger.error( diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py index 27fb5016..119e95af 100644 --- a/apps/api/tests/unit/catalog/test_indexer.py +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -20,6 +20,23 @@ def fake_run_due_once() -> None: assert calls == ["called"] +def test_catalog_scheduler_start_clears_previous_stop(monkeypatch): + import app.catalog.indexer as indexer_module + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + indexer._stop_event.set() + + monkeypatch.setattr(indexer_module, "ANIBRIDGE_TEST_MODE", False) + monkeypatch.setattr(indexer, "_ensure_status_rows", lambda: None) + monkeypatch.setattr(indexer, "_log_bootstrap_state", lambda: None) + monkeypatch.setattr(indexer, "_run_loop", lambda: None) + + indexer.start() + + assert not indexer._stop_event.is_set() + + def test_catalog_progress_tracks_crawl_and_persist_counts(): from app.catalog.indexer import ProviderCatalogIndexer diff --git a/apps/api/tests/unit/core/downloader/test_provider_resolution.py b/apps/api/tests/unit/core/downloader/test_provider_resolution.py index 6b9ece65..0c7d909f 100644 --- 
a/apps/api/tests/unit/core/downloader/test_provider_resolution.py +++ b/apps/api/tests/unit/core/downloader/test_provider_resolution.py @@ -28,6 +28,62 @@ def get_direct_link(self, provider_name: str, language: str) -> str: assert time.monotonic() - started_at < 0.15 +def test_try_get_direct_skips_while_timed_out_worker_is_running(monkeypatch): + import app.core.downloader.provider_resolution as provider_resolution + + class SlowEpisode: + def __init__(self) -> None: + self.calls = 0 + + def get_direct_link(self, provider_name: str, language: str) -> str: + self.calls += 1 + time.sleep(0.2) + return f"{provider_name}:{language}" + + monkeypatch.setattr( + provider_resolution, + "PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", + 0.05, + ) + + episode = SlowEpisode() + + assert provider_resolution._try_get_direct(episode, "VOE", "German Dub") is None + assert ( + provider_resolution._try_get_direct(episode, "Doodstream", "German Dub") is None + ) + assert episode.calls == 1 + + +def test_try_get_direct_handles_episode_without_weakref_support(monkeypatch): + import app.core.downloader.provider_resolution as provider_resolution + + class SlottedEpisode: + __slots__ = ("calls",) + + def __init__(self) -> None: + self.calls = 0 + + def get_direct_link(self, provider_name: str, language: str) -> str: + self.calls += 1 + time.sleep(0.2) + return f"{provider_name}:{language}" + + monkeypatch.setattr( + provider_resolution, + "PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", + 0.05, + ) + + episode = SlottedEpisode() + + assert provider_resolution._try_get_direct(episode, "VOE", "German Dub") is None + assert ( + provider_resolution._try_get_direct(episode, "Doodstream", "German Dub") is None + ) + assert episode.calls == 1 + + def test_try_get_direct_raises_for_missing_language(monkeypatch): import app.core.downloader.provider_resolution as provider_resolution from app.core.downloader.errors import LanguageUnavailableError From 337b9a2b1fb744d2ed6d94cfb51d69a3ef89fca3 Mon Sep 17 00:00:00 2001 
From: Zzackllack Date: Fri, 22 May 2026 19:17:33 +0200 Subject: [PATCH 43/45] fix(resolver): rescore multiple site-scoped DB candidates Query a small batch of indexed title candidates for site-scoped slug resolution and return the first candidate whose rescored title match meets the resolver threshold. --- apps/api/app/utils/title_resolver.py | 5 ++-- .../unit/utils/title_resolver/test_sto.py | 24 +++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/apps/api/app/utils/title_resolver.py b/apps/api/app/utils/title_resolver.py index 14ab4f5c..02f1ea80 100644 --- a/apps/api/app/utils/title_resolver.py +++ b/apps/api/app/utils/title_resolver.py @@ -843,10 +843,9 @@ def _search_sites(sites: List[str]) -> Optional[Tuple[str, str]]: session, query=q, providers=[site], - limit=1, + limit=5, ) - if rows: - candidate = rows[0] + for candidate in rows: cand_score = _score_indexed_db_candidate( session, query=q, candidate=candidate ) diff --git a/apps/api/tests/unit/utils/title_resolver/test_sto.py b/apps/api/tests/unit/utils/title_resolver/test_sto.py index 3d32d39f..a7195c8e 100644 --- a/apps/api/tests/unit/utils/title_resolver/test_sto.py +++ b/apps/api/tests/unit/utils/title_resolver/test_sto.py @@ -1,4 +1,5 @@ from urllib.parse import parse_qs, urlparse +from types import SimpleNamespace import requests from sqlmodel import Session @@ -169,3 +170,26 @@ def test_slug_from_query_accepts_db_alias_match(monkeypatch) -> None: "s.to", "the-rookie", ) + + +def test_site_scoped_db_lookup_checks_later_candidates(monkeypatch) -> None: + first = SimpleNamespace(provider="s.to", slug="weak-match") + second = SimpleNamespace(provider="s.to", slug="strong-match") + seen_limits: list[int] = [] + + def fake_search(session, *, query, providers, limit, **kwargs): + del session, query, providers, kwargs + seen_limits.append(limit) + return [first, second] + + def fake_score(session, *, query, candidate): + del session, query + return 0.1 if candidate is first 
else tr._MIN_TITLE_MATCH_SCORE + + monkeypatch.setattr(tr, "load_or_refresh_index", lambda _site: {}) + monkeypatch.setattr(tr, "load_or_refresh_alternatives", lambda _site: {}) + monkeypatch.setattr(tr, "search_indexed_provider_titles", fake_search) + monkeypatch.setattr(tr, "_score_indexed_db_candidate", fake_score) + + assert tr.slug_from_query("Strong Match", site="s.to") == ("s.to", "strong-match") + assert seen_limits == [5] From 3d23b78fe1a7d53a990359711a3fe5bb965e5c01 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Fri, 22 May 2026 20:23:46 +0200 Subject: [PATCH 44/45] fix(api): preserve fallback and scoped catalog readiness Allow provider fallback to continue after a host timeout while suppressing duplicate hung provider attempts. Gate movie Torznab searches on Megakino readiness only and persist row-stage retry deadlines when parking pending catalog enrichment stages. --- apps/api/app/api/torznab/api.py | 16 ++++- apps/api/app/catalog/indexer.py | 38 +++++++++++ .../core/downloader/provider_resolution.py | 64 ++++++++++++++----- .../tests/integration/api/torznab/test_api.py | 40 ++++++++++++ apps/api/tests/unit/catalog/test_indexer.py | 64 ++++++++++++++++++- .../downloader/test_provider_resolution.py | 51 +++++++++++---- 6 files changed, 243 insertions(+), 30 deletions(-) diff --git a/apps/api/app/api/torznab/api.py b/apps/api/app/api/torznab/api.py index 1805c8c6..28e44c4d 100644 --- a/apps/api/app/api/torznab/api.py +++ b/apps/api/app/api/torznab/api.py @@ -32,6 +32,7 @@ find_provider_episode_mapping, find_provider_episode_mappings_for_canonical_episode, find_provider_episode_mappings_for_canonical_season, + get_provider_index_status, get_indexed_episode_languages, get_session, list_indexed_provider_episodes, @@ -78,6 +79,19 @@ def _coerce_positive_int(value: object) -> Optional[int]: return _coerce_positive_int_impl(value) +def _require_provider_title_index_ready(session: Session, *, provider: str) -> None: + status = 
get_provider_index_status(session, provider=provider) + if status is not None and ( + status.title_index_status == "ready" or status.bootstrap_completed + ): + return + raise CatalogNotReadyError( + "Provider catalog bootstrap is still running. " + f"Pending providers: {provider} " + f"({status.title_index_status if status is not None else 'pending'})." + ) + + def _coerce_non_negative_int(value: object) -> Optional[int]: """Coerce a value into a non-negative integer.""" return _coerce_non_negative_int_impl(value) @@ -707,7 +721,7 @@ def torznab_api( import app.api.torznab as tn try: - require_catalog_ready() + _require_provider_title_index_ready(session, provider="megakino") except CatalogNotReadyError as exc: from fastapi import HTTPException diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py index 14e3697c..1779896b 100644 --- a/apps/api/app/catalog/indexer.py +++ b/apps/api/app/catalog/indexer.py @@ -1059,6 +1059,7 @@ def _run_row_stage(self, *, provider: str, stage: str, concurrency: int) -> None session, provider=provider, stage=stage, + generation=generation, refresh_interval_hours=refresh_interval_hours, ) ) @@ -1419,6 +1420,7 @@ def _mark_stage_pending( *, provider: str, stage: str, + generation: str, refresh_interval_hours: float, ) -> None: payload = { @@ -1429,12 +1431,48 @@ def _mark_stage_pending( "last_error_summary": "", "commit": False, } + retry_after = self._earliest_stage_retry_after( + session, + provider=provider, + stage=stage, + generation=generation, + ) if stage == "detail_enrichment": payload["detail_enrichment_status"] = "pending" + payload["detail_next_retry_after"] = retry_after else: payload["canonical_enrichment_status"] = "pending" + payload["canonical_next_retry_after"] = retry_after upsert_provider_index_status(session, **payload) + def _earliest_stage_retry_after( + self, + session: Session, + *, + provider: str, + stage: str, + generation: str, + ): + retry_column = ( + 
ProviderTitleIndexState.detail_next_retry_after + if stage == "detail_enrichment" + else ProviderTitleIndexState.canonical_next_retry_after + ) + rows = session.exec( + select(retry_column) + .join( + ProviderCatalogTitle, + (ProviderCatalogTitle.provider == ProviderTitleIndexState.provider) + & (ProviderCatalogTitle.slug == ProviderTitleIndexState.slug), + ) + .where( + (ProviderTitleIndexState.provider == provider) + & (ProviderCatalogTitle.indexed_generation == generation) + & (retry_column.is_not(None)) + ) + ).all() + return min((as_aware_utc(row) for row in rows if row is not None), default=None) + def _mark_stage_failed( self, session: Session, diff --git a/apps/api/app/core/downloader/provider_resolution.py b/apps/api/app/core/downloader/provider_resolution.py index 9d0bfebf..ea5162c2 100644 --- a/apps/api/app/core/downloader/provider_resolution.py +++ b/apps/api/app/core/downloader/provider_resolution.py @@ -17,9 +17,11 @@ _AVAIL_RE = re.compile(r"Available languages:\s*\[([^\]]*)\]", re.IGNORECASE) _DIRECT_LINK_TIMEOUT_LOCK = threading.Lock() _DIRECT_LINK_TIMED_OUT_WORKERS: weakref.WeakKeyDictionary[ - object, list[threading.Thread] + object, dict[tuple[str, str], list[threading.Thread]] ] = weakref.WeakKeyDictionary() -_DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID: dict[int, list[threading.Thread]] = {} +_DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID: dict[ + int, dict[tuple[str, str], list[threading.Thread]] +] = {} class DirectLinkTimeoutError(TimeoutError): @@ -28,38 +30,61 @@ def __init__(self, message: str, worker: threading.Thread) -> None: self.worker = worker -def _episode_has_active_timed_out_worker(ep: object) -> bool: +def _provider_attempt_has_active_timed_out_worker( + ep: object, + provider_name: str, + language: str, +) -> bool: + key = (provider_name, language) with _DIRECT_LINK_TIMEOUT_LOCK: try: - workers = _DIRECT_LINK_TIMED_OUT_WORKERS.get(ep, []) + workers_by_key = _DIRECT_LINK_TIMED_OUT_WORKERS.get(ep, {}) except TypeError: - workers = 
_DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID.get(id(ep), []) + workers_by_key = _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID.get(id(ep), {}) + workers = workers_by_key.get(key, []) active_workers = [worker for worker in workers if worker.is_alive()] if active_workers: + workers_by_key[key] = active_workers try: - _DIRECT_LINK_TIMED_OUT_WORKERS[ep] = active_workers + _DIRECT_LINK_TIMED_OUT_WORKERS[ep] = workers_by_key except TypeError: - _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID[id(ep)] = active_workers + _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID[id(ep)] = workers_by_key return True + workers_by_key.pop(key, None) try: - _DIRECT_LINK_TIMED_OUT_WORKERS.pop(ep, None) + if workers_by_key: + _DIRECT_LINK_TIMED_OUT_WORKERS[ep] = workers_by_key + else: + _DIRECT_LINK_TIMED_OUT_WORKERS.pop(ep, None) except TypeError: - _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID.pop(id(ep), None) + if workers_by_key: + _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID[id(ep)] = workers_by_key + else: + _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID.pop(id(ep), None) return False -def _track_timed_out_worker(ep: object, worker: threading.Thread) -> None: +def _track_timed_out_worker( + ep: object, + *, + provider_name: str, + language: str, + worker: threading.Thread, +) -> None: + key = (provider_name, language) with _DIRECT_LINK_TIMEOUT_LOCK: try: - existing_workers = _DIRECT_LINK_TIMED_OUT_WORKERS.get(ep, []) + workers_by_key = _DIRECT_LINK_TIMED_OUT_WORKERS.get(ep, {}) except TypeError: - existing_workers = _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID.get(id(ep), []) + workers_by_key = _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID.get(id(ep), {}) + existing_workers = workers_by_key.get(key, []) workers = [existing for existing in existing_workers if existing.is_alive()] workers.append(worker) + workers_by_key[key] = workers try: - _DIRECT_LINK_TIMED_OUT_WORKERS[ep] = workers + _DIRECT_LINK_TIMED_OUT_WORKERS[ep] = workers_by_key except TypeError: - _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID[id(ep)] = workers + _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID[id(ep)] = 
workers_by_key def _run_with_timeout( @@ -136,9 +161,9 @@ def _try_get_direct(ep: Episode, provider_name: str, language: str) -> Optional[ LanguageUnavailableError: If the provider reports the requested language is not offered; the exception contains the list of available languages. """ language = normalize_language(language) - if _episode_has_active_timed_out_worker(ep): + if _provider_attempt_has_active_timed_out_worker(ep, provider_name, language): logger.warning( - "Skipping provider '{}' because a timed-out direct-link lookup is still running for this episode.", + "Skipping provider '{}' because a timed-out direct-link lookup is still running for this episode and language.", provider_name, ) return None @@ -160,7 +185,12 @@ def _try_get_direct(ep: Episode, provider_name: str, language: str) -> Optional[ except Exception as exc: msg = str(exc) if isinstance(exc, DirectLinkTimeoutError): - _track_timed_out_worker(ep, exc.worker) + _track_timed_out_worker( + ep, + provider_name=provider_name, + language=language, + worker=exc.worker, + ) if "No provider found for language" in msg: available = _parse_available_languages_from_error(msg) logger.error( diff --git a/apps/api/tests/integration/api/torznab/test_api.py b/apps/api/tests/integration/api/torznab/test_api.py index 388ea21d..1dd22506 100644 --- a/apps/api/tests/integration/api/torznab/test_api.py +++ b/apps/api/tests/integration/api/torznab/test_api.py @@ -145,6 +145,46 @@ def test_search(client): assert root.find("channel") is not None +def test_movie_search_uses_megakino_readiness_only(client, monkeypatch): + import app.api.torznab.api as torznab_api + from app.db import engine, upsert_provider_index_status + + called: list[list[str]] = [] + + def fake_indexed_preview_results(**kwargs): + called.append(kwargs["providers"]) + + with Session(engine) as session: + upsert_provider_index_status( + session, + provider="megakino", + refresh_interval_hours=24.0, + status="ready", + 
latest_success_generation="gen-movie", + current_generation="gen-movie", + bootstrap_completed=True, + title_index_status="ready", + ) + upsert_provider_index_status( + session, + provider="aniworld.to", + refresh_interval_hours=24.0, + status="pending", + title_index_status="pending", + ) + + monkeypatch.setattr( + torznab_api, + "_indexed_preview_results", + fake_indexed_preview_results, + ) + + resp = client.get("/torznab/api", params={"t": "movie", "q": "movie"}) + + assert resp.status_code == 200 + assert called == [["megakino"]] + + def test_tvsearch_happy_path(client, monkeypatch): import app.api.torznab as tn diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py index 119e95af..e24ddfbf 100644 --- a/apps/api/tests/unit/catalog/test_indexer.py +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -1,4 +1,4 @@ -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from threading import Event, Thread from types import SimpleNamespace @@ -617,3 +617,65 @@ def test_run_row_stage_does_not_mark_ready_when_only_future_retries_remain( ) assert events == ["running", "pending"] + + +def test_mark_stage_pending_persists_earliest_row_retry(client): + from app.catalog.indexer import ProviderCatalogIndexer + from app.db import ( + engine, + get_provider_index_status, + replace_provider_catalog_title, + upsert_provider_index_status, + upsert_provider_title_index_state, + utcnow, + ) + from sqlmodel import Session + + retry_later = utcnow() + timedelta(hours=2) + retry_earlier = utcnow() + timedelta(minutes=30) + + with Session(engine) as session: + upsert_provider_index_status( + session, + provider="aniworld.to", + refresh_interval_hours=24.0, + status="partial", + latest_success_generation="gen-1", + current_generation="gen-1", + bootstrap_completed=True, + title_index_status="ready", + detail_enrichment_status="running", + ) + for slug, retry_after in ( + ("later", retry_later), + 
("earlier", retry_earlier), + ): + replace_provider_catalog_title( + session, + provider="aniworld.to", + slug=slug, + title=slug, + media_type_hint="series", + relative_path=f"/anime/stream/{slug}", + indexed_generation="gen-1", + ) + upsert_provider_title_index_state( + session, + provider="aniworld.to", + slug=slug, + detail_status="pending", + detail_next_retry_after=retry_after, + ) + + ProviderCatalogIndexer()._mark_stage_pending( + session, + provider="aniworld.to", + stage="detail_enrichment", + generation="gen-1", + refresh_interval_hours=24.0, + ) + status = get_provider_index_status(session, provider="aniworld.to") + + assert status is not None + assert status.detail_enrichment_status == "pending" + assert status.detail_next_retry_after == retry_earlier diff --git a/apps/api/tests/unit/core/downloader/test_provider_resolution.py b/apps/api/tests/unit/core/downloader/test_provider_resolution.py index 0c7d909f..048c5a60 100644 --- a/apps/api/tests/unit/core/downloader/test_provider_resolution.py +++ b/apps/api/tests/unit/core/downloader/test_provider_resolution.py @@ -28,16 +28,17 @@ def get_direct_link(self, provider_name: str, language: str) -> str: assert time.monotonic() - started_at < 0.15 -def test_try_get_direct_skips_while_timed_out_worker_is_running(monkeypatch): +def test_try_get_direct_only_skips_same_provider_while_worker_is_running(monkeypatch): import app.core.downloader.provider_resolution as provider_resolution class SlowEpisode: def __init__(self) -> None: - self.calls = 0 + self.calls: list[str] = [] def get_direct_link(self, provider_name: str, language: str) -> str: - self.calls += 1 - time.sleep(0.2) + self.calls.append(provider_name) + if provider_name == "VOE": + time.sleep(0.2) return f"{provider_name}:{language}" monkeypatch.setattr( @@ -48,11 +49,13 @@ def get_direct_link(self, provider_name: str, language: str) -> str: episode = SlowEpisode() + assert provider_resolution._try_get_direct(episode, "VOE", "German Dub") is None 
assert provider_resolution._try_get_direct(episode, "VOE", "German Dub") is None assert ( - provider_resolution._try_get_direct(episode, "Doodstream", "German Dub") is None + provider_resolution._try_get_direct(episode, "Doodstream", "German Dub") + == "Doodstream:German Dub" ) - assert episode.calls == 1 + assert episode.calls == ["VOE", "Doodstream"] def test_try_get_direct_handles_episode_without_weakref_support(monkeypatch): @@ -62,11 +65,12 @@ class SlottedEpisode: __slots__ = ("calls",) def __init__(self) -> None: - self.calls = 0 + self.calls: list[str] = [] def get_direct_link(self, provider_name: str, language: str) -> str: - self.calls += 1 - time.sleep(0.2) + self.calls.append(provider_name) + if provider_name == "VOE": + time.sleep(0.2) return f"{provider_name}:{language}" monkeypatch.setattr( @@ -77,11 +81,36 @@ def get_direct_link(self, provider_name: str, language: str) -> str: episode = SlottedEpisode() + assert provider_resolution._try_get_direct(episode, "VOE", "German Dub") is None assert provider_resolution._try_get_direct(episode, "VOE", "German Dub") is None assert ( - provider_resolution._try_get_direct(episode, "Doodstream", "German Dub") is None + provider_resolution._try_get_direct(episode, "Doodstream", "German Dub") + == "Doodstream:German Dub" ) - assert episode.calls == 1 + assert episode.calls == ["VOE", "Doodstream"] + + +def test_get_direct_url_with_fallback_continues_after_timeout(monkeypatch): + import app.core.downloader.provider_resolution as provider_resolution + + class Episode: + def get_direct_link(self, provider_name: str, language: str) -> str: + if provider_name == "VOE": + time.sleep(0.2) + return f"{provider_name}:{language}" + + monkeypatch.setattr( + provider_resolution, + "PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", + 0.05, + ) + monkeypatch.setattr(provider_resolution, "PROVIDER_ORDER", ["VOE", "Doodstream"]) + + assert provider_resolution.get_direct_url_with_fallback( + Episode(), + preferred=None, + language="German 
Dub", + ) == ("Doodstream:German Dub", "Doodstream") def test_try_get_direct_raises_for_missing_language(monkeypatch): From 61351bbcbfc1295ab597f8385f063598c25f3191 Mon Sep 17 00:00:00 2001 From: Zzackllack Date: Thu, 25 Jun 2026 04:15:39 +0200 Subject: [PATCH 45/45] fix(api): resolve catalog readiness and paused torrent resume Scope Torznab searches to ready provider indexes while preserving synthetic validation and optional fallback behavior. Persist download metadata for paused tasks and add qBittorrent-compatible resume handling. --- apps/api/app/api/qbittorrent/torrents.py | 57 ++++++++++++ apps/api/app/api/torznab/api.py | 87 +++++++++++++------ ...260625_0008_client_task_resume_metadata.py | 39 +++++++++ apps/api/app/db/models.py | 10 +++ .../api/qbittorrent/test_torrents.py | 56 ++++++++++++ .../tests/integration/api/torznab/test_api.py | 52 ++++++++++- .../api/torznab/test_indexed_catalog.py | 1 + .../api/torznab/test_specials_mapping.py | 1 + apps/api/tests/unit/db/test_models.py | 20 ++++- internal/agents/api.md | 4 +- 10 files changed, 296 insertions(+), 31 deletions(-) create mode 100644 apps/api/app/db/migrations/versions/20260625_0008_client_task_resume_metadata.py diff --git a/apps/api/app/api/qbittorrent/torrents.py b/apps/api/app/api/qbittorrent/torrents.py index fc9f9f5c..e339865e 100644 --- a/apps/api/app/api/qbittorrent/torrents.py +++ b/apps/api/app/api/qbittorrent/torrents.py @@ -121,6 +121,8 @@ def torrents_add( category=category, job_id=job_id, state="queued", + provider=provider, + mode=mode or None, ) logger.success( "Torrent task upserted for hash={}, state={}, site={}".format( @@ -145,6 +147,8 @@ def torrents_add( category=category, job_id=job_id, state="failed", + provider=provider, + mode=mode or None, ) return PlainTextResponse("Failed to start download.", status_code=500) upsert_client_task( @@ -160,11 +164,64 @@ def torrents_add( category=category, job_id=job_id, state="downloading", + provider=provider, + mode=mode or None, ) 
logger.debug(f"Started background worker for job_id: {job_id}") return PlainTextResponse("Ok.") +@router.post("/torrents/resume") +def torrents_resume( + session: Session = Depends(get_session), + hashes: str = Form(...), +): + """Start existing queued torrent jobs.""" + requested_hashes = [value.strip().lower() for value in hashes.split("|")] + if requested_hashes == ["all"]: + from app.db import ClientTask + from sqlmodel import select + + tasks = session.exec(select(ClientTask)).all() + else: + tasks = [ + task + for torrent_hash in requested_hashes + if (task := get_client_task(session, torrent_hash)) is not None + ] + + for task in tasks: + job = get_job(session, task.job_id) if task.job_id else None + if job is None or job.status != "queued": + continue + req = { + "slug": task.slug, + "season": task.season, + "episode": task.episode, + "language": task.language, + "site": task.site or "aniworld.to", + "title_hint": task.name, + } + if task.provider: + req["provider"] = task.provider + if task.mode: + req["mode"] = task.mode + try: + start_scheduled_job(job.id, req) + except Exception as exc: + logger.error("Failed to resume scheduled job {}: {}", job.id, exc) + task.state = "failed" + session.add(task) + session.commit() + return PlainTextResponse("Failed to resume download.", status_code=500) + task.state = "downloading" + session.add(task) + session.commit() + logger.debug("Resumed background worker for job_id: {}", job.id) + + return PlainTextResponse("Ok.") + + @router.get("/torrents/info") def torrents_info( session: Session = Depends(get_session), diff --git a/apps/api/app/api/torznab/api.py b/apps/api/app/api/torznab/api.py index 28e44c4d..051d2852 100644 --- a/apps/api/app/api/torznab/api.py +++ b/apps/api/app/api/torznab/api.py @@ -79,16 +79,24 @@ def _coerce_positive_int(value: object) -> Optional[int]: return _coerce_positive_int_impl(value) -def _require_provider_title_index_ready(session: Session, *, provider: str) -> None: - status = 
get_provider_index_status(session, provider=provider) - if status is not None and ( - status.title_index_status == "ready" or status.bootstrap_completed - ): - return +def _ready_provider_title_indexes( + session: Session, *, providers: list[str] +) -> list[str]: + ready = [] + pending = [] + for provider in providers: + status = get_provider_index_status(session, provider=provider) + if status is not None and status.title_index_status == "ready": + ready.append(provider) + continue + pending.append( + f"{provider} ({status.title_index_status if status is not None else 'pending'})" + ) + if ready: + return ready raise CatalogNotReadyError( "Provider catalog bootstrap is still running. " - f"Pending providers: {provider} " - f"({status.title_index_status if status is not None else 'pending'})." + f"Pending providers: {', '.join(pending)}." ) @@ -672,46 +680,68 @@ def torznab_api( if not q_str: return _rss_response(rss) - try: - require_catalog_ready() - except CatalogNotReadyError as exc: - from fastapi import HTTPException - - raise HTTPException(status_code=503, detail=str(exc)) from exc - if movie_preferred: + try: + movie_providers = _ready_provider_title_indexes( + session, providers=["megakino"] + ) + except CatalogNotReadyError as exc: + from fastapi import HTTPException + + raise HTTPException(status_code=503, detail=str(exc)) from exc count = _indexed_preview_results( tn_module=tn, session=session, q_str=q_str, channel=channel, cat_id=TORZNAB_CAT_MOVIE, - providers=["megakino"], + providers=movie_providers, limit=limit, strm_suffix=strm_suffix, ) if count == 0: + anime_providers = [ + site for site in CATALOG_SITES_LIST if site != "megakino" + ] + try: + anime_providers = _ready_provider_title_indexes( + session, providers=anime_providers + ) + except CatalogNotReadyError as exc: + logger.debug( + "Skipping anime fallback for movie-preferred search because " + "no anime title indexes are ready: {}", + exc, + ) + return _rss_response(rss) 
_indexed_preview_results( tn_module=tn, session=session, q_str=q_str, channel=channel, cat_id=TORZNAB_CAT_ANIME, - providers=[ - site for site in CATALOG_SITES_LIST if site != "megakino" - ], + providers=anime_providers, limit=limit, strm_suffix=strm_suffix, ) return _rss_response(rss) + anime_providers = [site for site in CATALOG_SITES_LIST if site != "megakino"] + try: + anime_providers = _ready_provider_title_indexes( + session, providers=anime_providers + ) + except CatalogNotReadyError as exc: + from fastapi import HTTPException + + raise HTTPException(status_code=503, detail=str(exc)) from exc _indexed_preview_results( tn_module=tn, session=session, q_str=q_str, channel=channel, cat_id=cat_id, - providers=[site for site in CATALOG_SITES_LIST if site != "megakino"], + providers=anime_providers, limit=limit, strm_suffix=strm_suffix, ) @@ -720,13 +750,6 @@ def torznab_api( if t in ("movie", "movie-search"): import app.api.torznab as tn - try: - _require_provider_title_index_ready(session, provider="megakino") - except CatalogNotReadyError as exc: - from fastapi import HTTPException - - raise HTTPException(status_code=503, detail=str(exc)) from exc - rss, channel = _rss_root() q_str = (q or "").strip() strm_suffix = " [STRM]" @@ -739,13 +762,21 @@ def torznab_api( ) return _rss_response(rss) if q_str: + try: + movie_providers = _ready_provider_title_indexes( + session, providers=["megakino"] + ) + except CatalogNotReadyError as exc: + from fastapi import HTTPException + + raise HTTPException(status_code=503, detail=str(exc)) from exc _indexed_preview_results( tn_module=tn, session=session, q_str=q_str, channel=channel, cat_id=TORZNAB_CAT_MOVIE, - providers=["megakino"], + providers=movie_providers, limit=limit, strm_suffix=strm_suffix, ) diff --git a/apps/api/app/db/migrations/versions/20260625_0008_client_task_resume_metadata.py b/apps/api/app/db/migrations/versions/20260625_0008_client_task_resume_metadata.py new file mode 100644 index 00000000..1f082bf6 --- 
/dev/null +++ b/apps/api/app/db/migrations/versions/20260625_0008_client_task_resume_metadata.py @@ -0,0 +1,39 @@ +"""Store download metadata required to resume paused client tasks. + +Revision ID: 20260625_0008 +Revises: 20260430_0007 +Create Date: 2026-06-25 03:20:00.000000 +""" + +from alembic import op +import sqlalchemy as sa + + +revision = "20260625_0008" +down_revision = "20260430_0007" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + inspector = sa.inspect(op.get_bind()) + existing_columns = { + column["name"] for column in inspector.get_columns("clienttask") + } + with op.batch_alter_table("clienttask") as batch_op: + if "provider" not in existing_columns: + batch_op.add_column(sa.Column("provider", sa.String(), nullable=True)) + if "mode" not in existing_columns: + batch_op.add_column(sa.Column("mode", sa.String(), nullable=True)) + + +def downgrade() -> None: + inspector = sa.inspect(op.get_bind()) + existing_columns = { + column["name"] for column in inspector.get_columns("clienttask") + } + with op.batch_alter_table("clienttask") as batch_op: + if "mode" in existing_columns: + batch_op.drop_column("mode") + if "provider" in existing_columns: + batch_op.drop_column("provider") diff --git a/apps/api/app/db/models.py b/apps/api/app/db/models.py index 8ceed917..682583da 100644 --- a/apps/api/app/db/models.py +++ b/apps/api/app/db/models.py @@ -171,6 +171,8 @@ class ClientTask(ModelBase, table=True): episode: int language: str site: Optional[str] = Field(default="aniworld.to", index=True) # Track source site + provider: Optional[str] = None + mode: Optional[str] = None job_id: Optional[str] = Field(default=None, index=True) save_path: Optional[str] = None category: Optional[str] = None @@ -1876,6 +1878,8 @@ def upsert_client_task( job_id: Optional[str], state: str = "queued", site: str = "aniworld.to", + provider: Optional[str] = None, + mode: Optional[str] = None, ) -> ClientTask: """ Insert or update a ClientTask record identified 
by its hash. @@ -1899,6 +1903,8 @@ def upsert_client_task( episode=episode, language=language, site=site, + provider=provider, + mode=mode, save_path=save_path, category=category, job_id=job_id, @@ -1913,6 +1919,10 @@ def upsert_client_task( rec.episode = episode rec.language = language rec.site = site + if provider is not None: + rec.provider = provider + if mode is not None: + rec.mode = mode rec.save_path = save_path rec.category = category rec.job_id = job_id diff --git a/apps/api/tests/integration/api/qbittorrent/test_torrents.py b/apps/api/tests/integration/api/qbittorrent/test_torrents.py index 0a4a05dc..0c616cc6 100644 --- a/apps/api/tests/integration/api/qbittorrent/test_torrents.py +++ b/apps/api/tests/integration/api/qbittorrent/test_torrents.py @@ -129,6 +129,62 @@ def test_torrents_info_preserves_queued_state_for_paused_add(client): assert info[0]["state"] == "queuedDL" +def test_torrents_resume_starts_paused_job_with_original_request(client, monkeypatch): + from app.utils.magnet import build_magnet + import app.api.qbittorrent.torrents as qb_torrents + from app.db import create_job, engine + from sqlmodel import Session + + started: list[tuple[str, dict]] = [] + + def create_queued_job(req, *, autostart=True): + del autostart + with Session(engine) as session: + return create_job(session, source_site=req["site"]).id + + monkeypatch.setattr(qb_torrents, "schedule_download", create_queued_job) + monkeypatch.setattr( + qb_torrents, + "start_scheduled_job", + lambda job_id, req: started.append((job_id, req)), + ) + magnet = build_magnet( + title="Queued STRM", + slug="queued-strm", + season=2, + episode=3, + language="German Dub", + provider="VOE", + mode="strm", + ) + + add_response = client.post( + "/api/v2/torrents/add", + data={"urls": magnet, "paused": "true"}, + ) + torrent_hash = client.get("/api/v2/torrents/info").json()[0]["hash"] + resume_response = client.post( + "/api/v2/torrents/resume", + data={"hashes": torrent_hash}, + ) + + assert 
add_response.status_code == 200 + assert resume_response.status_code == 200 + assert len(started) == 1 + assert started[0][1] == { + "slug": "queued-strm", + "season": 2, + "episode": 3, + "language": "German Dub", + "site": "aniworld.to", + "title_hint": "Queued STRM", + "provider": "VOE", + "mode": "strm", + } + info = client.get("/api/v2/torrents/info").json() + assert info[0]["state"] == "downloading" + + def test_torrents_add_returns_500_when_start_fails(client, monkeypatch): from app.utils.magnet import build_magnet import app.api.qbittorrent.torrents as qb_torrents diff --git a/apps/api/tests/integration/api/torznab/test_api.py b/apps/api/tests/integration/api/torznab/test_api.py index 1dd22506..69d559b0 100644 --- a/apps/api/tests/integration/api/torznab/test_api.py +++ b/apps/api/tests/integration/api/torznab/test_api.py @@ -138,7 +138,14 @@ def test_caps(client): ET.fromstring(resp.text) -def test_search(client): +def test_search(client, monkeypatch): + import app.api.torznab.api as torznab_api + + monkeypatch.setattr( + torznab_api, + "_ready_provider_title_indexes", + lambda session, *, providers: providers, + ) resp = client.get("/torznab/api", params={"t": "search", "q": "test"}) assert resp.status_code == 200 root = ET.fromstring(resp.text) @@ -185,6 +192,49 @@ def fake_indexed_preview_results(**kwargs): assert called == [["megakino"]] +def test_generic_search_ignores_unqueried_megakino_readiness(client, monkeypatch): + import app.api.torznab.api as torznab_api + from app.db import engine, upsert_provider_index_status + + called: list[list[str]] = [] + + def fake_indexed_preview_results(**kwargs): + called.append(kwargs["providers"]) + return 1 + + with Session(engine) as session: + for provider in ("aniworld.to", "s.to"): + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=24.0, + status="ready", + latest_success_generation=f"gen-{provider}", + current_generation=f"gen-{provider}", + bootstrap_completed=True, + 
title_index_status="ready", + ) + upsert_provider_index_status( + session, + provider="megakino", + refresh_interval_hours=24.0, + status="pending", + title_index_status="pending", + ) + + monkeypatch.setattr( + torznab_api, + "_indexed_preview_results", + fake_indexed_preview_results, + ) + + resp = client.get("/torznab/api", params={"t": "search", "q": "anime"}) + + assert resp.status_code == 200 + assert len(called) == 1 + assert set(called[0]) == {"aniworld.to", "s.to"} + + def test_tvsearch_happy_path(client, monkeypatch): import app.api.torznab as tn diff --git a/apps/api/tests/integration/api/torznab/test_indexed_catalog.py b/apps/api/tests/integration/api/torznab/test_indexed_catalog.py index d40cd41f..31f70dfa 100644 --- a/apps/api/tests/integration/api/torznab/test_indexed_catalog.py +++ b/apps/api/tests/integration/api/torznab/test_indexed_catalog.py @@ -27,6 +27,7 @@ def _seed_ready_catalog() -> None: current_generation=generation, latest_success_generation=generation, bootstrap_completed=True, + title_index_status="ready", ) replace_provider_catalog_title( session, diff --git a/apps/api/tests/integration/api/torznab/test_specials_mapping.py b/apps/api/tests/integration/api/torznab/test_specials_mapping.py index 5049c105..aee75a86 100644 --- a/apps/api/tests/integration/api/torznab/test_specials_mapping.py +++ b/apps/api/tests/integration/api/torznab/test_specials_mapping.py @@ -29,6 +29,7 @@ def _seed_special_mapping_catalog(*, languages: list[str]) -> None: current_generation=generation, latest_success_generation=generation, bootstrap_completed=True, + title_index_status="ready", ) replace_provider_catalog_title( session, diff --git a/apps/api/tests/unit/db/test_models.py b/apps/api/tests/unit/db/test_models.py index c5246809..7ab7bd24 100644 --- a/apps/api/tests/unit/db/test_models.py +++ b/apps/api/tests/unit/db/test_models.py @@ -136,8 +136,26 @@ def test_availability_and_clienttask_crud(client): category="anime", job_id="job-1", 
state="downloading", + provider="VOE", + mode="strm", ) - assert get_client_task(s, "abc") + upsert_client_task( + s, + hash="abc", + name="Updated Name", + slug="slug", + season=1, + episode=1, + language="German Dub", + save_path="/tmp", + category="anime", + job_id="job-1", + state="downloading", + ) + task = get_client_task(s, "abc") + assert task is not None + assert task.provider == "VOE" + assert task.mode == "strm" delete_client_task(s, "abc") assert get_client_task(s, "abc") is None diff --git a/internal/agents/api.md b/internal/agents/api.md index aca058b7..2b53298c 100644 --- a/internal/agents/api.md +++ b/internal/agents/api.md @@ -31,7 +31,9 @@ - Auth: `/auth/login`, `/auth/logout` set `SID` cookie `anibridge`. - Categories: `/torrents/categories` returns configured categories (default `AniBridge`). -- Torrents: `/torrents/add`, `/torrents/delete`, `/torrents/info` mimic qBittorrent responses. +- Torrents: `/torrents/add`, `/torrents/resume`, `/torrents/delete`, and + `/torrents/info` mimic qBittorrent responses. Paused adds retain the provider + and mode metadata required to start the original request when resumed. - Sync: `/sync/maindata` exposes job states for Sonarr integration. - Transfer: `/transfer/info`, `/transfer/speedLimitsMode`, etc., return safe defaults. - Deletion endpoint optionally removes files when `DELETE_FILES_ON_TORRENT_DELETE` is true.