diff --git a/apps/api/.env.example b/apps/api/.env.example index 37f298ab..915a4e63 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -78,6 +78,65 @@ MEGAKINO_BASE_URL=https://megakino1.to # Default: 12 MEGAKINO_TITLES_REFRESH_HOURS=12 +## Scheduled Provider Catalog Index +# What: Default refresh cadence (hours) for the persistent provider catalog index +# Default: 24 +PROVIDER_INDEX_REFRESH_HOURS=24 +# What: Provider-specific refresh cadence override for AniWorld +# Default: PROVIDER_INDEX_REFRESH_HOURS +PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD=24 +# What: Provider-specific refresh cadence override for s.to +# Default: PROVIDER_INDEX_REFRESH_HOURS +PROVIDER_INDEX_REFRESH_HOURS_STO=24 +# What: Provider-specific refresh cadence override for megakino +# Default: PROVIDER_INDEX_REFRESH_HOURS +PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO=24 +# What: Scheduler poll interval in seconds for checking due provider refreshes +# Default: 60 +PROVIDER_INDEX_SCHEDULER_POLL_SECONDS=60 +# What: Maximum number of provider refreshes allowed to run in parallel +# Default: 1 +PROVIDER_INDEX_GLOBAL_CONCURRENCY=1 +# What: Per-provider crawl worker count for AniWorld title refreshes +# Default: 4 +PROVIDER_INDEX_CONCURRENCY_ANIWORLD=4 +# What: Per-provider crawl worker count for s.to title refreshes +# Default: 4 +PROVIDER_INDEX_CONCURRENCY_STO=4 +# What: Per-provider crawl worker count for megakino title refreshes +# Default: 2 +PROVIDER_INDEX_CONCURRENCY_MEGAKINO=2 +# What: Hard timeout in seconds for one provider title crawl +# Default: 45 +PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS=45 +# What: Maximum number of completed title payloads allowed to wait in memory for DB persistence +# Default: 8 +PROVIDER_INDEX_QUEUE_SIZE=8 +# What: Max number of title payloads committed per SQLite writer batch +# Default: 32 +PROVIDER_INDEX_WRITER_BATCH_SIZE=32 +# What: Max seconds the writer may hold a partial batch before forcing a commit +# Default: 1.0 +PROVIDER_INDEX_WRITER_FLUSH_SECONDS=1.0 +# 
What: Abort a refresh when failed title crawls exceed this percentage of the discovered title set +# Default: 20 +PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT=20 +# What: Minimum seconds between repeated queue-backpressure log lines while crawlers are blocked on persistence +# Default: 15 +PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS=15 +# What: Maximum parallel canonical metadata lookups per provider stage run +# Default: 2 +CANONICAL_INDEX_CONCURRENCY=2 +# What: Max hot in-memory canonical search cache entries +# Default: 512 +CANONICAL_CACHE_MEMORY_MAX_SEARCH=512 +# What: Max hot in-memory canonical show cache entries +# Default: 256 +CANONICAL_CACHE_MEMORY_MAX_SHOW=256 +# What: TTL in seconds for hot in-memory canonical cache entries +# Default: 3600 +CANONICAL_CACHE_TTL_SECONDS=3600 + # What: Domain check interval in minutes (0 disables background checks) # Default: 100 MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN=100 @@ -108,11 +167,21 @@ PROVIDER_ORDER=VOE,Filemoon,Streamtape,Vidmoly,Doodstream,LoadX,Luluvdo,Vidoza # Default: 12 PROVIDER_REDIRECT_TIMEOUT_SECONDS=12 +# What: Hard timeout in seconds for one video host direct-link resolution +# attempt before AniBridge abandons that host and tries the next fallback. +# Default: 15 +PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS=15 + # What: Extra retry attempts when provider redirect resolution times out or # hits transient network errors # Default: 2 PROVIDER_REDIRECT_RETRIES=2 +# What: How often the background scheduler flushes coalesced download +# progress updates to SQLite while yt-dlp is running. +# Default: 0.5 +JOB_PROGRESS_FLUSH_SECONDS=0.5 + # What: Base cool-down in seconds before retrying a Serienstream redirect that # returned a Turnstile/captcha page. Retries back off linearly from this value. 
# Default: 300 diff --git a/apps/api/app/api/health.py b/apps/api/app/api/health.py index 29526a0a..835f70f5 100644 --- a/apps/api/app/api/health.py +++ b/apps/api/app/api/health.py @@ -2,9 +2,19 @@ from fastapi import APIRouter +from app.catalog import get_catalog_indexer + router = APIRouter() @router.get("/health") -async def healthcheck(): - return {"status": "ok"} +def healthcheck(): + return { + "status": "ok", + "catalog": get_catalog_indexer().get_progress_snapshot(), + } + + +@router.get("/health/catalog") +def catalog_healthcheck(): + return get_catalog_indexer().get_progress_snapshot() diff --git a/apps/api/app/api/qbittorrent/common.py b/apps/api/app/api/qbittorrent/common.py index ea989e10..cfd4bdd3 100644 --- a/apps/api/app/api/qbittorrent/common.py +++ b/apps/api/app/api/qbittorrent/common.py @@ -15,6 +15,27 @@ def public_save_path() -> str: return QBIT_PUBLIC_SAVE_PATH or str(DOWNLOAD_DIR) +def coerce_torrent_state(*, stored_state: str | None, job_status: str | None) -> str: + """Map persisted task state and scheduler status to qBittorrent states.""" + if job_status == "completed": + return "uploading" + if job_status == "failed": + return "error" + if job_status == "cancelled": + return "pausedDL" + + normalized = (stored_state or "").strip().lower() + if normalized == "queued": + return "queuedDL" + if normalized == "paused": + return "pausedDL" + if normalized == "completed": + return "uploading" + if normalized == "error" or normalized == "failed": + return "error" + return "downloading" + + # Categories map compatible with qBittorrent format CATEGORIES: Dict[str, dict] = { "prowlarr": { diff --git a/apps/api/app/api/qbittorrent/sync.py b/apps/api/app/api/qbittorrent/sync.py index c92bc97f..df8a4706 100644 --- a/apps/api/app/api/qbittorrent/sync.py +++ b/apps/api/app/api/qbittorrent/sync.py @@ -8,7 +8,7 @@ from app.db import get_session, get_job from . 
import router -from .common import CATEGORIES, public_save_path +from .common import CATEGORIES, coerce_torrent_state, public_save_path from app.config import DOWNLOAD_DIR, QBIT_PUBLIC_SAVE_PATH @@ -27,14 +27,10 @@ def sync_maindata(session: Session = Depends(get_session)): for r in rows: job = get_job(session, r.job_id) if r.job_id else None progress = (job.progress or 0.0) / 100.0 if job else 0.0 - state = "downloading" - if job: - if job.status == "completed": - state = "uploading" - elif job.status == "failed": - state = "error" - elif job.status == "cancelled": - state = "pausedDL" + state = coerce_torrent_state( + stored_state=r.state, + job_status=job.status if job else None, + ) size_val = int(job.total_bytes or 0) if job else 0 save_path_val = ( diff --git a/apps/api/app/api/qbittorrent/torrents.py b/apps/api/app/api/qbittorrent/torrents.py index 3cd0a2cf..e339865e 100644 --- a/apps/api/app/api/qbittorrent/torrents.py +++ b/apps/api/app/api/qbittorrent/torrents.py @@ -20,10 +20,10 @@ delete_client_task, get_job, ) -from app.core.scheduler import schedule_download, cancel_job +from app.core.scheduler import cancel_job, schedule_download, start_scheduled_job from . 
@router.post("/torrents/resume")
def torrents_resume(
    session: Session = Depends(get_session),
    hashes: str = Form(...),
):
    """Start the background download for queued torrent jobs.

    ``hashes`` is a ``|``-separated list of torrent hashes; each token is
    trimmed and lower-cased. The single token ``all`` selects every stored
    client task. Tasks without a job, or whose job status is not
    ``"queued"``, are skipped.

    On the first start failure the task is marked ``"failed"``, the change
    is committed, and an HTTP 500 response is returned immediately; later
    tasks in the selection are not processed. Otherwise each started task
    is committed as ``"downloading"`` and ``"Ok."`` is returned.
    """
    wanted = [token.strip().lower() for token in hashes.split("|")]
    if wanted == ["all"]:
        from app.db import ClientTask
        from sqlmodel import select

        selected = session.exec(select(ClientTask)).all()
    else:
        selected = []
        for wanted_hash in wanted:
            found = get_client_task(session, wanted_hash)
            if found is not None:
                selected.append(found)

    for task in selected:
        job = get_job(session, task.job_id) if task.job_id else None
        if job is None or job.status != "queued":
            continue

        # Rebuild the scheduler request from what was persisted on the task.
        req = dict(
            slug=task.slug,
            season=task.season,
            episode=task.episode,
            language=task.language,
            site=task.site or "aniworld.to",
            title_hint=task.name,
        )
        if task.provider:
            req["provider"] = task.provider
        if task.mode:
            req["mode"] = task.mode

        try:
            start_scheduled_job(job.id, req)
        except Exception as exc:
            logger.error("Failed to resume scheduled job {}: {}", job.id, exc)
            task.state = "failed"
            session.add(task)
            session.commit()
            return PlainTextResponse("Failed to resume download.", status_code=500)

        task.state = "downloading"
        session.add(task)
        session.commit()
        logger.debug("Resumed background worker for job_id: {}", job.id)

    return PlainTextResponse("Ok.")
def _ready_provider_title_indexes(
    session: Session, *, providers: list[str]
) -> list[str]:
    """Return the subset of ``providers`` whose title index is ``"ready"``.

    Each provider's index status row is looked up. Providers with no row
    are reported as ``pending``; others are reported with their current
    ``title_index_status``.

    Raises:
        CatalogNotReadyError: When none of the requested providers is ready
            (this includes an empty ``providers`` list). The message lists
            every non-ready provider.
    """
    usable: list[str] = []
    waiting: list[str] = []
    for name in providers:
        row = get_provider_index_status(session, provider=name)
        index_state = row.title_index_status if row is not None else "pending"
        if row is not None and index_state == "ready":
            usable.append(name)
        else:
            waiting.append(f"{name} ({index_state})")

    if not usable:
        raise CatalogNotReadyError(
            "Provider catalog bootstrap is still running. "
            f"Pending providers: {', '.join(waiting)}."
        )
    return usable
def _emit_indexed_mapped_episode(
    *,
    tn_module,
    session: Session,
    channel: ET.Element,
    provider: str,
    slug: str,
    title: str,
    canonical_season: int,
    canonical_episode: int,
    provider_season: int,
    provider_episode: int,
    cat_id: int,
    now: datetime,
    strm_suffix: str,
    max_items: int,
) -> int:
    """Append RSS items for one mapped episode and return how many were added.

    Release names and GUIDs use the canonical season/episode, while magnets
    carry the provider's own season/episode numbers. Languages come from the
    index; when none are stored, the site's default languages are used.
    ``STRM_FILES_MODE`` decides which variants are emitted per language:
    the regular item unless the mode is ``"only"``, and the STRM item when
    the mode is ``"only"`` or ``"both"``. Emission stops as soon as
    ``max_items`` items have been added.
    """
    indexed_languages = get_indexed_episode_languages(
        session,
        provider=provider,
        slug=slug,
        season=provider_season,
        episode=provider_episode,
    )
    language_values = [
        entry.language for entry in indexed_languages
    ] or _default_languages_for_site(provider)

    # (title suffix, guid suffix, extra build_magnet kwargs), in emit order.
    variants = []
    if STRM_FILES_MODE != "only":
        variants.append(("", "", {}))
    if STRM_FILES_MODE in ("only", "both"):
        variants.append((strm_suffix, ":strm", {"mode": "strm"}))

    produced = 0
    for language in language_values:
        release_title = tn_module.build_release_name(
            series_title=title,
            season=canonical_season,
            episode=canonical_episode,
            height=None,
            vcodec=None,
            language=language,
            site=provider,
        )
        guid_base = (
            f"{provider}:{slug}:S{canonical_season}E{canonical_episode}:{language}"
        )
        for title_suffix, guid_suffix, magnet_extra in variants:
            item_title = release_title + title_suffix
            magnet = tn_module.build_magnet(
                title=item_title,
                slug=slug,
                season=provider_season,
                episode=provider_episode,
                language=language,
                provider=None,
                site=provider,
                **magnet_extra,
            )
            _build_item(
                channel=channel,
                title=item_title,
                magnet=magnet,
                pubdate=now,
                cat_id=cat_id,
                guid_str=guid_base + guid_suffix,
                language=language,
            )
            produced += 1
            if produced >= max_items:
                return produced
    return produced
_handle_preview_search( - session, - q_str, - channel, - TORZNAB_CAT_ANIME, + anime_providers = [ + site for site in CATALOG_SITES_LIST if site != "megakino" + ] + try: + anime_providers = _ready_provider_title_indexes( + session, providers=anime_providers + ) + except CatalogNotReadyError as exc: + logger.debug( + "Skipping anime fallback for movie-preferred search because " + "no anime title indexes are ready: {}", + exc, + ) + return _rss_response(rss) + _indexed_preview_results( + tn_module=tn, + session=session, + q_str=q_str, + channel=channel, + cat_id=TORZNAB_CAT_ANIME, + providers=anime_providers, limit=limit, strm_suffix=strm_suffix, ) return _rss_response(rss) - special_count = _handle_special_search( - session, - q_str, - channel, - cat_id, - ids=SpecialIds( - tvdbid=tvdbid, - tmdbid=tmdbid, - imdbid=imdbid, - rid=rid, - tvmazeid=tvmazeid, - ), + anime_providers = [site for site in CATALOG_SITES_LIST if site != "megakino"] + try: + anime_providers = _ready_provider_title_indexes( + session, providers=anime_providers + ) + except CatalogNotReadyError as exc: + from fastapi import HTTPException + + raise HTTPException(status_code=503, detail=str(exc)) from exc + _indexed_preview_results( + tn_module=tn, + session=session, + q_str=q_str, + channel=channel, + cat_id=cat_id, + providers=anime_providers, limit=limit, strm_suffix=strm_suffix, ) - if special_count == 0: - _handle_preview_search( - session, - q_str, - channel, - cat_id, - limit=limit, - strm_suffix=strm_suffix, - ) return _rss_response(rss) if t in ("movie", "movie-search"): @@ -491,12 +762,21 @@ def torznab_api( ) return _rss_response(rss) if q_str: - _handle_preview_search( - session, - q_str, - channel, - TORZNAB_CAT_MOVIE, - site="megakino", + try: + movie_providers = _ready_provider_title_indexes( + session, providers=["megakino"] + ) + except CatalogNotReadyError as exc: + from fastapi import HTTPException + + raise HTTPException(status_code=503, detail=str(exc)) from exc + 
_indexed_preview_results( + tn_module=tn, + session=session, + q_str=q_str, + channel=channel, + cat_id=TORZNAB_CAT_MOVIE, + providers=movie_providers, limit=limit, strm_suffix=strm_suffix, ) @@ -509,6 +789,13 @@ def torznab_api( import app.api.torznab as tn + try: + require_catalog_ready() + except CatalogNotReadyError as exc: + from fastapi import HTTPException + + raise HTTPException(status_code=503, detail=str(exc)) from exc + if season is None: logger.debug("Returning empty RSS feed due to missing season.") return _empty_rss_response() @@ -530,100 +817,97 @@ def torznab_api( logger.debug("Returning empty RSS feed due to unresolved tvsearch query.") return _empty_rss_response() - result = tn._slug_from_query(q_str) - if not result: - logger.warning("No slug found for query '{}'. Returning empty RSS feed.", q_str) + canonical_series = find_canonical_series_by_ids_or_title( + session, + tvdb_id=tvdbid, + tmdb_id=tmdbid, + imdb_id=imdbid, + query=q_str, + ) + if canonical_series is None: + logger.warning( + "No canonical series found for query '{}'. 
Returning empty RSS feed.", + q_str, + ) return _empty_rss_response() - - site_found, slug = result - display_title = tn.resolve_series_title(slug, site_found) or q_str rss, channel = _rss_root() - count = 0 limit_i = max(1, int(limit)) now = datetime.now(timezone.utc) strm_suffix = " [STRM]" - ids = SpecialIds( - tvdbid=tvdbid, - tmdbid=tmdbid, - imdbid=imdbid, - rid=rid, - tvmazeid=tvmazeid, - ) if search_mode == "episode-search": assert ep_i is not None - emitted, limit_hit = emit_tvsearch_episode_items( - tn_module=tn, - session=session, - channel=channel, - slug=slug, - site_found=site_found, - display_title=display_title, - q_str=q_str, - request_season=season_i, - request_episode=ep_i, - ids=ids, - now=now, - strm_suffix=strm_suffix, - max_items=limit_i, - allow_live_probe=True, - ) - count += emitted - if limit_hit: - logger.info( - "tvsearch episode-search terminated due to limit hit (limit={})", - limit_i, + count = 0 + for mapping in find_provider_episode_mappings_for_canonical_episode( + session, + tvdb_id=canonical_series.tvdb_id, + canonical_season=season_i, + canonical_episode=ep_i, + providers=CATALOG_SITES_LIST, + ): + display_title = _indexed_display_title( + session=session, + provider=mapping.provider, + slug=mapping.slug, + fallback_title=canonical_series.title, + ) + remaining = limit_i - count + if remaining <= 0: + break + count += _emit_indexed_mapped_episode( + tn_module=tn, + session=session, + channel=channel, + provider=mapping.provider, + slug=mapping.slug, + title=display_title, + canonical_season=season_i, + canonical_episode=ep_i, + provider_season=mapping.provider_season, + provider_episode=mapping.provider_episode, + cat_id=TORZNAB_CAT_ANIME, + now=now, + strm_suffix=strm_suffix, + max_items=remaining, ) return _rss_response(rss) - fast_season_mode = TORZNAB_SEASON_SEARCH_MODE == "fast" - episode_numbers = resolve_season_episode_numbers( - tn_module=tn, - session=session, - slug=slug, - season_i=season_i, - site_found=site_found, 
- q_str=q_str, - ids=ids, - allow_fallback_probe=not fast_season_mode, + count = 0 + season_mappings = sorted( + find_provider_episode_mappings_for_canonical_season( + session, + tvdb_id=canonical_series.tvdb_id, + canonical_season=season_i, + providers=CATALOG_SITES_LIST, + ), + key=lambda item: (item.canonical_episode, item.provider, item.slug), ) - for episode_i in episode_numbers: + for mapping in season_mappings: remaining = limit_i - count if remaining <= 0: - logger.info( - "tvsearch season-search termination reason=limit hit limit={}", - limit_i, - ) break - - emitted, limit_hit = emit_tvsearch_episode_items( + display_title = _indexed_display_title( + session=session, + provider=mapping.provider, + slug=mapping.slug, + fallback_title=canonical_series.title, + ) + count += _emit_indexed_mapped_episode( tn_module=tn, session=session, channel=channel, - slug=slug, - site_found=site_found, - display_title=display_title, - q_str=q_str, - request_season=season_i, - request_episode=episode_i, - ids=ids, + provider=mapping.provider, + slug=mapping.slug, + title=display_title, + canonical_season=season_i, + canonical_episode=mapping.canonical_episode, + provider_season=mapping.provider_season, + provider_episode=mapping.provider_episode, + cat_id=TORZNAB_CAT_ANIME, now=now, strm_suffix=strm_suffix, max_items=remaining, - allow_live_probe=not fast_season_mode, - fast_episode_languages=None, ) - count += emitted - if limit_hit: - logger.info( - ( - "tvsearch season-search termination reason=limit hit " - "limit={} emitted_items={}" - ), - limit_i, - count, - ) - break logger.info("Returning RSS feed with {} items.", count) return _rss_response(rss) diff --git a/apps/api/app/catalog/__init__.py b/apps/api/app/catalog/__init__.py new file mode 100644 index 00000000..13743dcf --- /dev/null +++ b/apps/api/app/catalog/__init__.py @@ -0,0 +1,11 @@ +from .indexer import ( + get_catalog_indexer, + get_catalog_readiness_error, + require_catalog_ready, +) + +__all__ = [ + 
"get_catalog_indexer", + "get_catalog_readiness_error", + "require_catalog_ready", +] diff --git a/apps/api/app/catalog/exceptions.py b/apps/api/app/catalog/exceptions.py new file mode 100644 index 00000000..c2218131 --- /dev/null +++ b/apps/api/app/catalog/exceptions.py @@ -0,0 +1,5 @@ +from __future__ import annotations + + +class CatalogNotReadyError(RuntimeError): + """Raised when catalog-dependent routes are hit before bootstrap completes.""" diff --git a/apps/api/app/catalog/indexer.py b/apps/api/app/catalog/indexer.py new file mode 100644 index 00000000..1779896b --- /dev/null +++ b/apps/api/app/catalog/indexer.py @@ -0,0 +1,1902 @@ +from __future__ import annotations + +from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait +from dataclasses import dataclass +from datetime import timedelta +from queue import Empty, Full, Queue +from threading import Event, Lock, Semaphore, Thread +from time import monotonic +from uuid import uuid4 + +from loguru import logger +from sqlmodel import Session, select + +from app.catalog.exceptions import CatalogNotReadyError +from app.catalog.providers import ( + CanonicalPayload, + CatalogCrawlObserver, + EpisodeLanguageRecord, + EpisodeRecord, + TitleRecord, + crawl_provider_title_detail, + load_provider_title_index, + resolve_provider_canonical, +) +from app.config import ( + ANIBRIDGE_TEST_MODE, + CANONICAL_INDEX_CONCURRENCY, + CATALOG_SITES_LIST, + CATALOG_SITE_CONFIGS, + PROGRESS_STEP_PERCENT, + PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS, + PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT, + PROVIDER_INDEX_GLOBAL_CONCURRENCY, + PROVIDER_INDEX_QUEUE_SIZE, + PROVIDER_INDEX_SCHEDULER_POLL_SECONDS, + PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, + PROVIDER_INDEX_WRITER_BATCH_SIZE, + PROVIDER_INDEX_WRITER_FLUSH_SECONDS, +) +from app.db import ( + ProviderCatalogAlias, + ProviderCatalogEpisode, + ProviderCatalogTitle, + ProviderEpisodeLanguage, + ProviderIndexStatus, + ProviderTitleIndexState, + as_aware_utc, + 
delete_provider_generation, + engine, + get_provider_index_status, + is_catalog_bootstrap_ready, + is_provider_fully_ready, + list_provider_index_statuses, + prune_provider_generation, + replace_canonical_episodes, + replace_provider_catalog_aliases, + replace_provider_catalog_episodes, + replace_provider_catalog_title, + replace_provider_episode_mappings, + replace_provider_movie_mappings, + replace_provider_series_mappings, + upsert_canonical_series, + upsert_provider_index_status, + upsert_provider_title_index_state, + utcnow, +) +from app.utils.terminal import ProgressReporter, ProgressSnapshot + +_INDEXER: "ProviderCatalogIndexer | None" = None +_INDEXER_LOCK = Lock() +_QUEUE_SENTINEL = object() +_UNSET = object() +_STAGES = ("title_index", "detail_enrichment", "canonical_enrichment") + + +@dataclass(slots=True) +class ProviderCatalogProgress: + provider: str + phase: str = "pending" + stage: str = "title_index" + crawled_titles: int = 0 + persisted_titles: int = 0 + failed_titles: int = 0 + total_titles: int | None = None + current_slug: str = "" + queue_depth: int = 0 + last_logged_crawl_step: int = -1 + last_logged_persist_step: int = -1 + + @property + def processed_titles(self) -> int: + return self.persisted_titles + + @property + def writer_lag_titles(self) -> int: + return max(0, self.crawled_titles - self.persisted_titles) + + @property + def progress_percent(self) -> float | None: + if not self.total_titles: + return None + if self.total_titles <= 0: + return 100.0 + return round( + max(0.0, min(100.0, self.persisted_titles / self.total_titles * 100.0)), + 1, + ) + + @property + def crawl_percent(self) -> float | None: + if not self.total_titles: + return None + if self.total_titles <= 0: + return 100.0 + completed = self.crawled_titles + self.failed_titles + return round(max(0.0, min(100.0, completed / self.total_titles * 100.0)), 1) + + +class CatalogIndexWriteCoordinator: + def __init__(self) -> None: + self._lock = Lock() + + def run(self, 
def get_catalog_readiness_error() -> str | None:
    """Describe why catalog-dependent routes cannot be served yet.

    Returns:
        ``None`` when test mode is on and no status rows exist, when the
        bootstrap check passes for all catalog sites, or when no provider
        is pending. Otherwise a message listing each provider whose title
        index is neither ``"ready"`` nor marked bootstrap-completed,
        labelled with the ``title_index_status`` from the indexer's
        progress snapshot (``pending`` when that is missing or empty).
    """
    indexer = get_catalog_indexer()
    waiting: list[str] = []
    with Session(engine) as session:
        rows = list_provider_index_statuses(session)
        if ANIBRIDGE_TEST_MODE and not rows:
            return None
        if is_catalog_bootstrap_ready(session, providers=CATALOG_SITES_LIST):
            return None

        snapshot = indexer.get_progress_snapshot()
        progress_by_provider = {
            entry["provider"]: entry for entry in snapshot.get("providers", [])
        }
        for provider in CATALOG_SITES_LIST:
            row = get_provider_index_status(session, provider=provider)
            # A missing row counts as not ready.
            searchable = bool(
                row is not None
                and (row.title_index_status == "ready" or row.bootstrap_completed)
            )
            if searchable:
                continue
            details = progress_by_provider.get(provider, {})
            label = details.get("title_index_status") or "pending"
            waiting.append(f"{provider} ({label})")

    if not waiting:
        return None
    return (
        "Provider catalog bootstrap is still running. "
        f"Pending providers: {', '.join(waiting)}."
    )
in list_provider_index_statuses(session) + } + bootstrap_ready = is_catalog_bootstrap_ready( + session, providers=CATALOG_SITES_LIST + ) + with self._progress_lock: + runtime = { + provider: ProviderCatalogProgress( + provider=snapshot.provider, + phase=snapshot.phase, + stage=snapshot.stage, + crawled_titles=snapshot.crawled_titles, + persisted_titles=snapshot.persisted_titles, + failed_titles=snapshot.failed_titles, + total_titles=snapshot.total_titles, + current_slug=snapshot.current_slug, + queue_depth=snapshot.queue_depth, + last_logged_crawl_step=snapshot.last_logged_crawl_step, + last_logged_persist_step=snapshot.last_logged_persist_step, + ) + for provider, snapshot in self._progress.items() + } + providers: list[dict[str, object]] = [] + for provider in CATALOG_SITES_LIST: + status = statuses.get(provider) + progress = runtime.get(provider, ProviderCatalogProgress(provider=provider)) + providers.append( + { + "provider": provider, + "status": status.status if status is not None else "pending", + "active_stage": status.active_stage if status is not None else None, + "bootstrap_completed": ( + bool(status.bootstrap_completed) + if status is not None + else False + ), + "title_index_status": ( + status.title_index_status if status is not None else "pending" + ), + "detail_enrichment_status": ( + status.detail_enrichment_status + if status is not None + else "pending" + ), + "canonical_enrichment_status": ( + status.canonical_enrichment_status + if status is not None + else "pending" + ), + "search_ready": bool( + status is not None + and ( + status.title_index_status == "ready" + or status.bootstrap_completed + ) + ), + "full_ready": is_provider_fully_ready(status), + "phase": progress.phase, + "stage": progress.stage, + "processed_titles": progress.processed_titles, + "crawled_titles": progress.crawled_titles, + "persisted_titles": progress.persisted_titles, + "failed_titles": progress.failed_titles, + "total_titles": progress.total_titles, + 
"progress_percent": progress.progress_percent, + "crawl_progress_percent": progress.crawl_percent, + "queue_depth": progress.queue_depth, + "writer_lag_titles": progress.writer_lag_titles, + "current_slug": progress.current_slug or None, + "latest_success_generation": ( + status.latest_success_generation if status is not None else None + ), + "staging_generation": ( + status.current_generation + if status is not None + and status.current_generation + != status.latest_success_generation + else None + ), + "last_error_summary": ( + status.last_error_summary if status is not None else "" + ), + "latest_started_at": ( + status.latest_started_at.isoformat() + if status is not None and status.latest_started_at is not None + else None + ), + "latest_completed_at": ( + status.latest_completed_at.isoformat() + if status is not None and status.latest_completed_at is not None + else None + ), + "title_index_ready_at": ( + status.title_index_ready_at.isoformat() + if status is not None + and status.title_index_ready_at is not None + else None + ), + "detail_ready_at": ( + status.detail_ready_at.isoformat() + if status is not None and status.detail_ready_at is not None + else None + ), + "canonical_ready_at": ( + status.canonical_ready_at.isoformat() + if status is not None and status.canonical_ready_at is not None + else None + ), + } + ) + return { + "bootstrap_ready": bootstrap_ready, + "bootstrapping": not bootstrap_ready, + "full_ready": all( + is_provider_fully_ready(statuses.get(provider)) + for provider in CATALOG_SITES_LIST + ), + "providers": providers, + } + + def _run_loop(self) -> None: + while not self._stop_event.is_set(): + try: + self.run_due_once() + except Exception as exc: + logger.exception("Provider catalog scheduler loop failed: {}", exc) + if self._stop_event.wait(PROVIDER_INDEX_SCHEDULER_POLL_SECONDS): + break + + def _run_provider_refresh(self, provider: str) -> None: + try: + self._refresh_provider(provider) + finally: + self._active.release() + with 
self._workers_lock: + self._workers.pop(provider, None) + + def _ensure_status_rows(self) -> None: + for provider in CATALOG_SITES_LIST: + self._set_progress(provider, phase="pending", stage="title_index") + hours = self._refresh_interval_hours(provider) + with Session(engine) as session: + status = get_provider_index_status(session, provider=provider) + if status is None: + logger.warning( + "Provider catalog bootstrap: no persisted index state for {}. Initial bootstrap required.", + provider, + ) + self._writer.run( + lambda session, provider=provider, hours=hours: ( + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=hours, + status="pending", + title_index_status="pending", + detail_enrichment_status="pending", + canonical_enrichment_status="pending", + bootstrap_completed=False, + commit=False, + ) + ) + ) + continue + stale_generation = self._stale_generation(status) + if stale_generation is not None: + logger.warning( + "Provider catalog bootstrap: found interrupted staging generation for {} generation={} status={} cursor_slug={}. 
Cleaning it up before retry.", + provider, + stale_generation, + status.status, + getattr(status, "cursor_title_slug", None) or None, + ) + self._writer.run( + lambda session, provider=provider, generation=stale_generation, hours=hours: ( + self._cleanup_stale_generation( + session, + provider=provider, + generation=generation, + refresh_interval_hours=hours, + ) + ) + ) + continue + if self._needs_stage_backfill(status): + self._writer.run( + lambda session, provider=provider, status=status, hours=hours: ( + self._backfill_legacy_stage_state( + session, + provider=provider, + status=status, + refresh_interval_hours=hours, + ) + ) + ) + + def _needs_stage_backfill(self, status: ProviderIndexStatus) -> bool: + return bool( + getattr(status, "bootstrap_completed", False) + and getattr(status, "latest_success_generation", None) + and getattr(status, "title_index_status", "pending") == "pending" + ) + + def _backfill_legacy_stage_state( + self, + session: Session, + *, + provider: str, + status: ProviderIndexStatus, + refresh_interval_hours: float, + ) -> None: + ready_at = ( + getattr(status, "latest_completed_at", None) + or getattr(status, "latest_success_at", None) + or utcnow() + ) + full_ready = getattr(status, "status", None) == "ready" + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + title_index_status="ready", + title_index_ready_at=getattr(status, "title_index_ready_at", None) + or ready_at, + title_index_next_retry_after=None, + detail_enrichment_status="ready" if full_ready else "pending", + detail_ready_at=getattr(status, "detail_ready_at", None) + or (ready_at if full_ready else None), + detail_next_retry_after=None, + canonical_enrichment_status="ready" if full_ready else "pending", + canonical_ready_at=getattr(status, "canonical_ready_at", None) + or (ready_at if full_ready else None), + canonical_next_retry_after=None, + commit=False, + ) + + def _cleanup_stale_generation( + self, + 
def _stage_due(
    self,
    *,
    stage_status: str,
    retry_after,
    refresh_after=_UNSET,
) -> bool:
    """Decide whether a stage with the given bookkeeping may run right now.

    Rules, in order:

    * a ``"running"`` stage is never due;
    * a ``retry_after`` that lies in the future blocks the stage;
    * a supplied, non-``None`` ``refresh_after`` in the future blocks it too;
    * otherwise ``"pending"`` and ``"failed"`` stages are due;
    * any other status is due only when ``refresh_after`` was supplied,
      in which case ``None`` counts as due and a timestamp is due once it
      is no longer in the future.

    Args:
        stage_status: Persisted status string of the stage.
        retry_after: Earliest retry time, or ``None`` for no back-off.
        refresh_after: Next scheduled refresh time; leave at the ``_UNSET``
            default for stages that have no periodic refresh.
    """
    if stage_status == "running":
        return False

    now = utcnow()
    refresh_tracked = refresh_after is not _UNSET

    if retry_after is not None and as_aware_utc(retry_after) > now:
        return False
    if (
        refresh_tracked
        and refresh_after is not None
        and as_aware_utc(refresh_after) > now
    ):
        return False

    if stage_status in {"pending", "failed"}:
        return True
    if not refresh_tracked:
        return False
    return refresh_after is None or as_aware_utc(refresh_after) <= now
def _refresh_provider(self, provider: str) -> None:
    """Run whichever indexing stage is currently due for ``provider``.

    The persisted status row is read, ``_pick_due_stage`` chooses the
    stage, and the matching stage runner is invoked. Nothing happens when
    the provider has no status row or no stage is due.
    """
    with Session(engine) as session:
        status = get_provider_index_status(session, provider=provider)
    if status is None:
        return

    stage_runners = {
        "title_index": self._run_title_index_stage,
        "detail_enrichment": self._run_detail_enrichment_stage,
        "canonical_enrichment": self._run_canonical_enrichment_stage,
    }
    run_stage = stage_runners.get(self._pick_due_stage(status))
    if run_stage is not None:
        run_stage(provider)
refresh_interval_hours=refresh_interval_hours, + status="running", + active_stage="title_index", + current_generation=generation, + latest_started_at=utcnow(), + latest_completed_at=None, + title_index_status="running", + title_index_next_retry_after=None, + last_error_summary="", + cursor_title_slug="", + commit=False, + ) + ) + + reporter = ProgressReporter( + label=f"Catalog {provider}", + unit="title", + unit_scale=False, + ) + writer = Thread( + target=self._title_index_writer_loop, + name=f"provider-index-writer-{provider}", + args=( + provider, + generation, + refresh_interval_hours, + queue, + reporter, + writer_failure, + ), + daemon=True, + ) + writer.start() + + def on_index_loaded(total_titles: int) -> None: + self._set_progress( + provider, + phase="title_index", + stage="title_index", + total_titles=total_titles, + reset_log_steps=True, + ) + reporter.update( + ProgressSnapshot(downloaded=0, total=total_titles, status="title_index") + ) + + observer = CatalogCrawlObserver(on_index_loaded=on_index_loaded) + writer_shutdown_signaled = False + + try: + rows = load_provider_title_index(provider, observer=observer) + if self._get_total_titles(provider) is None: + on_index_loaded(len(rows)) + for row in rows: + self._enqueue_title_record(provider, queue, row, writer_failure) + self._advance_crawl_progress( + provider, + current_slug=row.slug, + queue_depth=queue.qsize(), + ) + self._signal_title_index_writer_shutdown(provider=provider, queue=queue) + writer_shutdown_signaled = True + writer.join(timeout=30) + self._ensure_title_index_writer_stopped( + provider=provider, + writer=writer, + writer_failure=writer_failure, + timeout_seconds=30, + ) + if writer_failure: + raise RuntimeError(str(writer_failure[0])) + completed_at = utcnow() + self._writer.run( + lambda session: self._finish_title_index_success( + session, + provider=provider, + generation=generation, + refresh_interval_hours=refresh_interval_hours, + completed_at=completed_at, + ) + ) + 
def _signal_title_index_writer_shutdown(
    self,
    *,
    provider: str,
    queue: Queue[TitleRecord | object],
    timeout_seconds: float = 5.0,
) -> None:
    """Ask the title-index writer thread to stop by enqueueing the sentinel.

    The queue is bounded, so it can legitimately be full at the moment the
    producer finishes while the writer is still committing a batch. The
    sentinel is therefore enqueued with a bounded blocking ``put`` rather
    than ``put_nowait``; a healthy writer gets time to drain a slot instead
    of the whole run being failed. The wait is capped so that a dead
    writer cannot block the caller indefinitely.

    Args:
        provider: Provider key, used only for the log and error messages.
        queue: Queue consumed by the writer thread.
        timeout_seconds: Maximum time to wait for a free queue slot.
            Negative values are treated as ``0`` (fail immediately if full).

    Raises:
        RuntimeError: The queue stayed full for ``timeout_seconds``.
    """
    try:
        queue.put(_QUEUE_SENTINEL, timeout=max(0.0, timeout_seconds))
    except Full as exc:
        logger.error(
            "Provider catalog title index writer queue is full during shutdown for {}",
            provider,
        )
        raise RuntimeError(
            f"writer shutdown queue is full for provider {provider}"
        ) from exc
def _finish_title_index_success(
    self,
    session: Session,
    *,
    provider: str,
    generation: str,
    refresh_interval_hours: float,
    completed_at,
) -> None:
    """Record a successful title-index run for ``generation``.

    ``prune_provider_generation`` is called with ``generation`` as the one
    to keep, then the provider status row is updated: the generation
    becomes both current and latest-successful, the next refresh is
    scheduled ``refresh_interval_hours`` after ``completed_at``, the title
    index is marked ready, and both enrichment stages are reset to
    pending. The upsert is issued with ``commit=False``.
    """
    prune_provider_generation(
        session, provider=provider, keep_generation=generation
    )
    refresh_due_at = completed_at + timedelta(hours=refresh_interval_hours)

    # Both enrichment stages start over for the freshly indexed generation.
    enrichment_reset: dict[str, object] = {}
    for prefix in ("detail", "canonical"):
        enrichment_reset[f"{prefix}_enrichment_status"] = "pending"
        enrichment_reset[f"{prefix}_ready_at"] = None
        enrichment_reset[f"{prefix}_next_retry_after"] = None

    upsert_provider_index_status(
        session,
        provider=provider,
        refresh_interval_hours=refresh_interval_hours,
        status="partial",
        active_stage=None,
        current_generation=generation,
        latest_success_generation=generation,
        latest_completed_at=completed_at,
        latest_success_at=completed_at,
        next_refresh_after=refresh_due_at,
        bootstrap_completed=True,
        title_index_status="ready",
        title_index_ready_at=completed_at,
        title_index_next_retry_after=None,
        failure_count=0,
        last_error_summary="",
        cursor_title_slug="",
        commit=False,
        **enrichment_reset,
    )
def _title_index_writer_loop(
    self,
    provider: str,
    generation: str,
    refresh_interval_hours: float,
    queue: Queue[TitleRecord | object],
    reporter: ProgressReporter,
    writer_failure: list[BaseException],
) -> None:
    """Drain ``queue`` and persist title records in batches until the sentinel.

    A batch is flushed when any of the following holds:

    * it reached ``PROVIDER_INDEX_WRITER_BATCH_SIZE`` records;
    * it has been held for ``PROVIDER_INDEX_WRITER_FLUSH_SECONDS`` since
      its first record arrived - checked after every received record, so
      the limit also applies while records keep arriving;
    * the queue produced nothing within the current wait;
    * ``_QUEUE_SENTINEL`` was received, after which the loop returns.

    Any exception (``BaseException`` included) ends the loop and is
    appended to ``writer_failure`` for the producer thread to inspect; it
    is not re-raised here.
    """
    batch: list[TitleRecord] = []
    batch_started_at = monotonic()

    def flush() -> None:
        # Persist whatever is buffered and start a fresh batch.
        nonlocal batch
        if batch:
            self._flush_title_index_batch(
                provider=provider,
                generation=generation,
                refresh_interval_hours=refresh_interval_hours,
                batch=batch,
                queue_depth=queue.qsize(),
                reporter=reporter,
            )
        batch = []

    try:
        while True:
            held_for = monotonic() - batch_started_at
            # Never wait less than 0.1s so an overdue timer cannot turn
            # the loop into a busy spin.
            wait_seconds = max(
                0.1, PROVIDER_INDEX_WRITER_FLUSH_SECONDS - held_for
            )
            try:
                item = queue.get(timeout=wait_seconds)
            except Empty:
                item = None
            if item is None:
                flush()
                batch_started_at = monotonic()
                continue
            if item is _QUEUE_SENTINEL:
                flush()
                return
            if not batch:
                # The hold timer measures how long records sit in the
                # batch, so it starts with the first buffered record.
                batch_started_at = monotonic()
            batch.append(item)
            overdue = (
                monotonic() - batch_started_at
                >= PROVIDER_INDEX_WRITER_FLUSH_SECONDS
            )
            if overdue or len(batch) >= PROVIDER_INDEX_WRITER_BATCH_SIZE:
                flush()
                batch_started_at = monotonic()
    except BaseException as exc:
        writer_failure.append(exc)
queue_depth: int, + reporter: ProgressReporter, + ) -> None: + if not batch: + return + last_slug = batch[-1].slug + + def _persist(session: Session) -> None: + for record in batch: + now = utcnow() + upsert_provider_title_index_state( + session, + provider=provider, + slug=record.slug, + attempted_at=now, + succeeded_at=now, + failure_count=0, + last_error_summary="", + detail_status="pending", + detail_next_retry_after=None, + detail_failure_count=0, + detail_last_error_summary=None, + canonical_status="pending", + canonical_next_retry_after=None, + canonical_failure_count=0, + canonical_last_error_summary=None, + commit=False, + ) + replace_provider_catalog_title( + session, + provider=record.provider, + slug=record.slug, + title=record.title, + media_type_hint=record.media_type_hint, + relative_path=record.relative_path, + indexed_generation=generation, + ) + replace_provider_catalog_aliases( + session, + provider=record.provider, + slug=record.slug, + aliases=record.aliases, + indexed_generation=generation, + ) + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=refresh_interval_hours, + status="running", + active_stage="title_index", + current_generation=generation, + cursor_title_slug=last_slug, + last_error_summary="", + commit=False, + ) + + self._writer.run(_persist) + self._advance_persist_progress( + provider, + current_slug=last_slug, + count=len(batch), + queue_depth=queue_depth, + ) + reporter.update( + ProgressSnapshot( + downloaded=self._get_persisted_titles(provider), + total=self._get_total_titles(provider), + status="title_index", + ) + ) + + def _run_detail_enrichment_stage(self, provider: str) -> None: + self._run_row_stage( + provider=provider, + stage="detail_enrichment", + concurrency=self._provider_concurrency(provider), + ) + + def _run_canonical_enrichment_stage(self, provider: str) -> None: + self._run_row_stage( + provider=provider, + stage="canonical_enrichment", + 
concurrency=CANONICAL_INDEX_CONCURRENCY, + ) + + def _run_row_stage(self, *, provider: str, stage: str, concurrency: int) -> None: + refresh_interval_hours = self._refresh_interval_hours(provider) + generation = self._visible_generation(provider) + if generation is None: + return + self._mark_stage_running( + provider=provider, + stage=stage, + refresh_interval_hours=refresh_interval_hours, + ) + total_titles = self._count_visible_titles(provider) + self._set_progress( + provider, + phase=stage, + stage=stage, + crawled_titles=0, + persisted_titles=0, + failed_titles=0, + total_titles=total_titles, + current_slug="", + queue_depth=0, + reset_log_steps=True, + ) + failure_limit = max( + 1, + int( + max(1, total_titles) * PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT / 100.0 + ), + ) + failure_count = 0 + while not self._stop_event.is_set(): + due_rows = self._load_due_stage_rows( + provider=provider, + stage=stage, + limit=max(1, concurrency * 2), + ) + if not due_rows: + remaining_rows = self._writer.run( + lambda session: self._count_remaining_stage_rows( + session, + provider=provider, + stage=stage, + generation=generation, + ) + ) + if remaining_rows: + self._writer.run( + lambda session: self._mark_stage_pending( + session, + provider=provider, + stage=stage, + generation=generation, + refresh_interval_hours=refresh_interval_hours, + ) + ) + return + completed_at = utcnow() + self._writer.run( + lambda session: self._mark_stage_ready( + session, + provider=provider, + stage=stage, + refresh_interval_hours=refresh_interval_hours, + completed_at=completed_at, + ) + ) + self._set_progress( + provider, + phase=f"{stage}_ready", + stage=stage, + current_slug="", + ) + return + executor = ThreadPoolExecutor(max_workers=max(1, concurrency)) + pending: dict[ + Future, tuple[ProviderCatalogTitle, ProviderTitleIndexState] + ] = {} + try: + for title_row, state in due_rows: + pending[ + executor.submit( + self._run_stage_job, + provider=provider, + stage=stage, + 
title_row=title_row, + ) + ] = (title_row, state) + while pending: + done, not_done = wait( + pending.keys(), + timeout=1.0, + return_when=FIRST_COMPLETED, + ) + if not done: + continue + for future in done: + title_row, state = pending.pop(future) + try: + payload = future.result() + self._persist_stage_success( + provider=provider, + stage=stage, + title_row=title_row, + payload=payload, + ) + self._advance_persist_progress( + provider, + current_slug=title_row.slug, + count=1, + queue_depth=len(not_done), + ) + except Exception as exc: + failure_count += 1 + error_text = str(exc) + self._persist_stage_failure( + provider=provider, + stage=stage, + title_row=title_row, + state=state, + error=error_text, + ) + self._advance_failed_progress( + provider, + current_slug=title_row.slug, + queue_depth=len(not_done), + ) + if failure_count >= failure_limit: + for remaining in not_done: + remaining.cancel() + completed_at = utcnow() + self._writer.run( + lambda session, error=error_text: ( + self._mark_stage_failed( + session, + provider=provider, + stage=stage, + refresh_interval_hours=refresh_interval_hours, + completed_at=completed_at, + error=error, + ) + ) + ) + return + finally: + executor.shutdown(wait=False, cancel_futures=True) + + def _run_stage_job( + self, + *, + provider: str, + stage: str, + title_row: ProviderCatalogTitle, + ): + self._advance_crawl_progress( + provider, + current_slug=title_row.slug, + queue_depth=0, + ) + aliases = self._load_aliases(provider=provider, slug=title_row.slug) + if stage == "detail_enrichment": + return crawl_provider_title_detail( + provider_key=provider, + slug=title_row.slug, + title=title_row.title, + aliases=aliases, + timeout_seconds=float( + CATALOG_SITE_CONFIGS[provider].get( + "provider_index_title_timeout_seconds", + PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, + ) + ), + ) + episodes = self._load_episode_records(provider=provider, slug=title_row.slug) + return resolve_provider_canonical( + provider_key=provider, + 
slug=title_row.slug, + title=title_row.title, + aliases=aliases, + media_type_hint=title_row.media_type_hint, + episodes=episodes, + ) + + def _persist_stage_success( + self, + *, + provider: str, + stage: str, + title_row: ProviderCatalogTitle, + payload, + ) -> None: + generation = self._visible_generation(provider) + now = utcnow() + + def _persist(session: Session) -> None: + if stage == "detail_enrichment": + detail_record: TitleRecord = payload + replace_provider_catalog_title( + session, + provider=provider, + slug=title_row.slug, + title=detail_record.title, + media_type_hint=detail_record.media_type_hint, + relative_path=detail_record.relative_path, + indexed_generation=generation, + ) + replace_provider_catalog_episodes( + session, + provider=provider, + slug=title_row.slug, + episodes=[ + { + "season": episode.season, + "episode": episode.episode, + "relative_path": episode.relative_path, + "title_primary": episode.title_primary, + "title_secondary": episode.title_secondary, + "media_type_hint": episode.media_type_hint, + "languages": [ + { + "language": language.language, + "host_hints": language.host_hints, + } + for language in episode.languages + ], + } + for episode in detail_record.episodes + ], + indexed_generation=generation, + ) + upsert_provider_title_index_state( + session, + provider=provider, + slug=title_row.slug, + detail_status="ready", + detail_attempted_at=now, + detail_succeeded_at=now, + detail_next_retry_after=None, + detail_failure_count=0, + detail_last_error_summary=None, + commit=False, + ) + else: + canonical: CanonicalPayload = payload + if canonical.series is not None: + series = canonical.series + upsert_canonical_series( + session, + tvdb_id=int(series["tvdb_id"]), + title=str(series["title"]), + tmdb_id=series.get("tmdb_id"), + imdb_id=series.get("imdb_id"), + tvmaze_id=series.get("tvmaze_id"), + anilist_id=series.get("anilist_id"), + mal_id=series.get("mal_id"), + aliases=list(series.get("aliases") or []), + ) + 
def _persist_stage_failure(
    self,
    *,
    provider: str,
    stage: str,
    title_row: ProviderCatalogTitle,
    state: ProviderTitleIndexState,
    error: str,
) -> None:
    """Record a failed stage attempt for one title through the write coordinator.

    The title's ``detail_*`` fields are updated for the
    ``"detail_enrichment"`` stage and its ``canonical_*`` fields for any
    other stage: status ``"failed"``, attempt time now, next retry one
    refresh interval from now, failure count taken from ``state`` plus
    one, and the error text truncated to 500 characters.
    """
    refresh_interval_hours = self._refresh_interval_hours(provider)
    retry_at = utcnow() + timedelta(hours=refresh_interval_hours)
    prefix = "detail" if stage == "detail_enrichment" else "canonical"

    def _record_failure(session: Session) -> None:
        previous_failures = getattr(state, f"{prefix}_failure_count")
        failure_fields = {
            f"{prefix}_status": "failed",
            f"{prefix}_attempted_at": utcnow(),
            f"{prefix}_next_retry_after": retry_at,
            f"{prefix}_failure_count": previous_failures + 1,
            f"{prefix}_last_error_summary": error[:500],
        }
        upsert_provider_title_index_state(
            session,
            provider=provider,
            slug=title_row.slug,
            commit=False,
            **failure_fields,
        )

    self._writer.run(_record_failure)
def _mark_stage_ready(
    self,
    session: Session,
    *,
    provider: str,
    stage: str,
    refresh_interval_hours: float,
    completed_at,
) -> None:
    """Mark ``stage`` as ready for ``provider`` and derive the overall status.

    The current status row is read through a separate session and deep
    copied; the stage's new values are applied to that copy so that
    ``is_provider_fully_ready`` can be asked about the state as it will
    look after this update. The overall status becomes ``"ready"`` when
    that check passes and ``"partial"`` otherwise. The upsert goes through
    ``session`` with ``commit=False``.
    """
    prefix = "detail" if stage == "detail_enrichment" else "canonical"
    stage_fields = {
        f"{prefix}_enrichment_status": "ready",
        f"{prefix}_ready_at": completed_at,
        f"{prefix}_next_retry_after": None,
    }

    with Session(engine) as read_session:
        persisted = get_provider_index_status(read_session, provider=provider)
        projected = (
            None if persisted is None else persisted.model_copy(deep=True)
        )

    if projected is not None:
        # Apply the pending change to the copy only; the persisted row is
        # updated by the upsert below.
        for field_name, value in stage_fields.items():
            setattr(projected, field_name, value)

    overall_status = "partial"
    if projected is not None and is_provider_fully_ready(projected):
        overall_status = "ready"

    upsert_provider_index_status(
        session,
        provider=provider,
        refresh_interval_hours=refresh_interval_hours,
        status=overall_status,
        active_stage=None,
        latest_completed_at=completed_at,
        last_error_summary="",
        commit=False,
        **stage_fields,
    )
def _has_unfinished_stage_rows(self, *, provider: str, stage: str) -> bool:
    """Report whether any title of the visible generation still lacks *stage*.

    A title counts as unfinished when it has no ``ProviderTitleIndexState``
    row at all, or when the status column belonging to *stage* is anything
    other than ``"ready"``. ``"detail_enrichment"`` checks ``detail_status``;
    every other stage name checks ``canonical_status``.

    Returns ``False`` when the provider has no visible generation.
    """
    visible = self._visible_generation(provider)
    if visible is None:
        return False

    status_attr = (
        "detail_status" if stage == "detail_enrichment" else "canonical_status"
    )
    in_visible_generation = (ProviderCatalogTitle.provider == provider) & (
        ProviderCatalogTitle.indexed_generation == visible
    )
    with Session(engine) as session:
        titles = session.exec(
            select(ProviderCatalogTitle).where(in_visible_generation)
        ).all()
        for title_row in titles:
            state = session.get(ProviderTitleIndexState, (provider, title_row.slug))
            # A missing state row means the stage has never run for this title.
            if state is None or getattr(state, status_attr) != "ready":
                return True
    return False
def _load_aliases(self, *, provider: str, slug: str) -> list[str]:
    """Return the alias strings stored for *slug* in the visible generation.

    Yields an empty list when ``_visible_generation`` reports no generation
    for *provider*.
    """
    visible = self._visible_generation(provider)
    if visible is None:
        return []

    statement = select(ProviderCatalogAlias).where(
        (ProviderCatalogAlias.provider == provider)
        & (ProviderCatalogAlias.slug == slug)
        & (ProviderCatalogAlias.indexed_generation == visible)
    )
    aliases: list[str] = []
    with Session(engine) as session:
        for alias_row in session.exec(statement).all():
            aliases.append(alias_row.alias)
    return aliases
def _count_visible_titles(self, provider: str) -> int:
    """Count the catalog titles that belong to the provider's visible generation.

    Returns ``0`` when ``_visible_generation`` reports no generation.
    """
    visible = self._visible_generation(provider)
    if visible is None:
        return 0

    statement = select(ProviderCatalogTitle).where(
        (ProviderCatalogTitle.provider == provider)
        & (ProviderCatalogTitle.indexed_generation == visible)
    )
    with Session(engine) as session:
        matching_titles = session.exec(statement).all()
    return len(matching_titles)
def _enqueue_title_record(
    self,
    provider: str,
    queue: Queue[TitleRecord | object],
    record: TitleRecord,
    writer_failure: list[BaseException],
) -> None:
    """Put *record* on the writer queue, waiting out backpressure.

    The put is retried in one-second slices until it succeeds. Between
    attempts the shared *writer_failure* list is checked so a dead writer
    aborts the producer instead of blocking it forever.

    Args:
        provider: Provider key used for progress reporting and log lines.
        queue: Bounded queue drained by the writer.
        record: Title payload to hand over.
        writer_failure: List the writer appends its exception to when it fails.

    Raises:
        RuntimeError: If *writer_failure* is non-empty; the writer's exception
            is attached as ``__cause__``.
    """
    # Function-scope import: the parameter named ``queue`` shadows the module
    # inside this function, and only ``Full`` is needed here.
    from queue import Full

    last_backpressure_log = 0.0
    while True:
        if writer_failure:
            raise RuntimeError(str(writer_failure[0])) from writer_failure[0]
        try:
            queue.put(record, timeout=1.0)
        except Full:
            # Only a full queue is backpressure. Anything else is a real error
            # and must propagate instead of being retried.
            depth = queue.qsize()
            self._set_progress(provider, queue_depth=depth)
            now = monotonic()
            if now - last_backpressure_log >= PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS:
                logger.warning(
                    "Provider catalog {}: writer backpressure queue_depth={} lag_titles={}",
                    provider,
                    depth,
                    self._get_writer_lag(provider),
                )
                last_backpressure_log = now
            continue
        # The record is queued exactly once; progress reporting happens outside
        # the try so a reporting error can never trigger a second put.
        self._set_progress(provider, queue_depth=queue.qsize())
        return
crawled_titles is not None: + snapshot.crawled_titles = crawled_titles + if persisted_titles is not None: + snapshot.persisted_titles = persisted_titles + if failed_titles is not None: + snapshot.failed_titles = failed_titles + if total_titles is not _UNSET: + snapshot.total_titles = total_titles + if current_slug is not None: + snapshot.current_slug = current_slug + if queue_depth is not None: + snapshot.queue_depth = queue_depth + if reset_log_steps: + snapshot.last_logged_crawl_step = -1 + snapshot.last_logged_persist_step = -1 + + def _advance_crawl_progress( + self, + provider: str, + *, + current_slug: str, + queue_depth: int, + ) -> None: + with self._progress_lock: + snapshot = self._progress.setdefault( + provider, ProviderCatalogProgress(provider=provider) + ) + snapshot.crawled_titles += 1 + snapshot.current_slug = current_slug + snapshot.queue_depth = queue_depth + self._maybe_log_progress(snapshot, kind="crawl") + + def _advance_failed_progress( + self, + provider: str, + *, + current_slug: str, + queue_depth: int, + ) -> None: + with self._progress_lock: + snapshot = self._progress.setdefault( + provider, ProviderCatalogProgress(provider=provider) + ) + snapshot.failed_titles += 1 + snapshot.current_slug = current_slug + snapshot.queue_depth = queue_depth + self._maybe_log_progress(snapshot, kind="crawl") + + def _advance_persist_progress( + self, + provider: str, + *, + current_slug: str, + count: int, + queue_depth: int, + ) -> None: + with self._progress_lock: + snapshot = self._progress.setdefault( + provider, ProviderCatalogProgress(provider=provider) + ) + snapshot.persisted_titles += count + snapshot.current_slug = current_slug + snapshot.queue_depth = queue_depth + self._maybe_log_progress(snapshot, kind="persist") + + def _maybe_log_progress( + self, snapshot: ProviderCatalogProgress, *, kind: str + ) -> None: + percent = ( + snapshot.crawl_percent if kind == "crawl" else snapshot.progress_percent + ) + if percent is None: + return + step = 
def _get_total_titles(self, provider: str) -> int | None:
    """Return the ``total_titles`` value of the provider's progress snapshot.

    Returns ``None`` when no snapshot exists for *provider* (and also when the
    snapshot's own ``total_titles`` is ``None``).
    """
    with self._progress_lock:
        entry = self._progress.get(provider)
        if entry is None:
            return None
        return entry.total_titles
class TtlLruCache(Generic[TKey, TValue]):
    """Thread-safe, size-bounded LRU cache whose entries expire after a TTL.

    Deadlines are measured on the monotonic clock, so wall-clock adjustments
    (NTP steps, manual changes) can neither extend nor cut short an entry's
    lifetime. All access to the underlying ``OrderedDict`` happens under one
    lock.
    """

    def __init__(self, *, max_entries: int, ttl_seconds: int) -> None:
        """Create an empty cache.

        Args:
            max_entries: Maximum number of entries kept; clamped to at least 1.
            ttl_seconds: Lifetime of an entry in seconds; clamped to at least 1.
        """
        # Function-scope import keeps the block self-contained without
        # touching the module's import list.
        from time import monotonic

        self._clock = monotonic
        self._max_entries = max(1, int(max_entries))
        self._ttl_seconds = max(1, int(ttl_seconds))
        # key -> (deadline on self._clock, value); least recently used first.
        self._entries: OrderedDict[TKey, tuple[float, TValue]] = OrderedDict()
        self._lock = Lock()

    def get(self, key: TKey) -> TValue | None:
        """Return the live value for *key*, or ``None`` if absent or expired.

        A hit marks the entry as most recently used; an expired entry is
        removed on access.
        """
        now = self._clock()
        with self._lock:
            entry = self._entries.get(key)
            if entry is None:
                return None
            expires_at, payload = entry
            if expires_at <= now:
                self._entries.pop(key, None)
                return None
            self._entries.move_to_end(key)
            return payload

    def set(self, key: TKey, value: TValue) -> None:
        """Store *value* under *key* with a fresh TTL, evicting LRU overflow."""
        expires_at = self._clock() + self._ttl_seconds
        with self._lock:
            self._entries[key] = (expires_at, value)
            self._entries.move_to_end(key)
            # Drop least recently used entries until the bound holds again.
            while len(self._entries) > self._max_entries:
                self._entries.popitem(last=False)

    def size(self) -> int:
        """Return the number of stored entries, including not-yet-purged expired ones."""
        with self._lock:
            return len(self._entries)
def _candidate_terms(
    *,
    title: str,
    aliases: list[str],
    imdb_id: Optional[str],
    tmdb_id: Optional[int],
) -> list[tuple[str, str]]:
    """Build the ordered, de-duplicated ``(term, source)`` search list.

    Order is: explicit IMDb id, explicit TMDB id, the title, then each
    non-blank alias (stripped) that differs from the title. When the same
    term appears more than once, only its first occurrence, and therefore
    its first source label, is kept.
    """
    # dicts keep insertion order, and setdefault never overwrites, so the
    # first source recorded for a term wins.
    source_by_term: dict[str, str] = {}
    if imdb_id:
        source_by_term.setdefault(f"imdb:{imdb_id}", "explicit_imdb")
    if tmdb_id:
        source_by_term.setdefault(f"tmdb:{tmdb_id}", "explicit_tmdb")
    if title:
        source_by_term.setdefault(title, "title")
    for raw_alias in aliases:
        cleaned = (raw_alias or "").strip()
        if not cleaned or cleaned == title:
            continue
        source_by_term.setdefault(cleaned, "alias")
    return list(source_by_term.items())
logger.debug("SkyHook search failed for '{}': {}", term, exc) + continue + if not isinstance(raw_payload, list): + continue + payload = [item for item in raw_payload if isinstance(item, dict)] + _search_cache.set(term, [dict(item) for item in payload]) + for item in payload: + copied = dict(item) + copied["_ab_source"] = source + copied["_ab_term"] = term + candidates.append(copied) + + best_match: Optional[tuple[float, dict[str, Any]]] = None + for item in candidates: + candidate_title = str(item.get("title") or "").strip() + candidate_tvdb = item.get("tvdbId") + if not candidate_title or not isinstance(candidate_tvdb, int): + continue + scores = [_score_title(title, candidate_title)] + scores.extend(_score_title(alias, candidate_title) for alias in aliases) + score = max(scores or [0.0]) + if item.get("_ab_source") in {"explicit_imdb", "explicit_tmdb"}: + score = max(score, 0.99) + current = (score, item) + if best_match is None or current[0] > best_match[0]: + best_match = current + + if best_match is None or best_match[0] < 0.45: + return None + + score, item = best_match + tvdb_id = int(item["tvdbId"]) + payload = _show_cache.get(tvdb_id) + if payload is None: + try: + response = http_get( + SKYHOOK_SHOW_URL.format(tvdb_id=tvdb_id), + timeout=SKYHOOK_TIMEOUT_SECONDS, + ) + response.raise_for_status() + raw_payload = response.json() + except Exception as exc: + logger.debug("SkyHook show fetch failed for tvdb {}: {}", tvdb_id, exc) + return None + if not isinstance(raw_payload, dict): + return None + payload = dict(raw_payload) + _show_cache.set(tvdb_id, dict(payload)) + + if score >= 0.99: + confidence = "confirmed" + elif score >= 0.85: + confidence = "high_confidence" + else: + confidence = "low_confidence" + + return TvCanonicalMatch( + tvdb_id=tvdb_id, + title=str(payload.get("title") or item.get("title") or title).strip(), + confidence=confidence, + source=str(item.get("_ab_source") or "title"), + rationale=f"score={score:.2f} 
@dataclass(slots=True)
class EpisodeRecord:
    """One provider episode (or film entry) as parsed from a season listing."""

    # Season and episode numbers as reported by the provider.
    season: int
    episode: int
    # URL path of the episode page, including the query string when present.
    relative_path: str
    # Episode titles; the s.to parser leaves both as None.
    title_primary: Optional[str]
    title_secondary: Optional[str]
    # "movie" or "episode", as set by the season parsers.
    media_type_hint: str
    # Languages offered for this episode; empty when none were detected.
    languages: list[EpisodeLanguageRecord] = field(default_factory=list)
def _run_with_timeout(
    timeout_seconds: float, func: Callable[..., Any], *args, **kwargs
):
    """Run ``func(*args, **kwargs)`` on a worker thread with a hard timeout.

    If *func* declares a ``cancel_event`` parameter or accepts ``**kwargs``,
    a fresh ``threading.Event`` is passed as ``cancel_event``. The event is
    set when the timeout fires so a cooperative callee can stop early.

    Returns:
        Whatever *func* returns.

    Raises:
        TimeoutError: If *func* does not finish within *timeout_seconds*
            (the wait is never shorter than 1 ms).
        Exception: Anything *func* itself raises is re-raised unchanged.
    """
    cancel_event = Event()
    parameters = inspect.signature(func).parameters
    wants_cancel_event = "cancel_event" in parameters
    if not wants_cancel_event:
        wants_cancel_event = any(
            parameter.kind == inspect.Parameter.VAR_KEYWORD
            for parameter in parameters.values()
        )
    call_kwargs = dict(kwargs)
    if wants_cancel_event:
        call_kwargs["cancel_event"] = cancel_event

    pool = ThreadPoolExecutor(
        max_workers=1,
        thread_name_prefix="provider-title-crawl",
    )
    pending = pool.submit(func, *args, **call_kwargs)
    try:
        outcome = pending.result(timeout=max(0.001, timeout_seconds))
    except FutureTimeoutError as exc:
        cancel_event.set()
        pending.cancel()
        # Do not wait for the stuck worker; it is left to finish on its own.
        pool.shutdown(wait=False, cancel_futures=True)
        raise TimeoutError(f"title crawl exceeded {int(timeout_seconds)}s") from exc
    except BaseException:
        pool.shutdown(wait=True, cancel_futures=False)
        raise
    pool.shutdown(wait=True, cancel_futures=False)
    return outcome
def _dedupe_languages(
    languages: list[EpisodeLanguageRecord],
) -> list[EpisodeLanguageRecord]:
    """Merge records that share a language label.

    Host hints of same-language records are unioned. The result has one
    record per language, ordered by language, each with sorted host hints.
    """
    hosts_by_language: dict[str, set[str]] = {}
    for record in languages:
        if record.language not in hosts_by_language:
            hosts_by_language[record.language] = set()
        hosts_by_language[record.language].update(record.host_hints)

    merged: list[EpisodeLanguageRecord] = []
    for language in sorted(hosts_by_language):
        merged.append(
            EpisodeLanguageRecord(
                language=language,
                host_hints=sorted(hosts_by_language[language]),
            )
        )
    return merged
"deutsche sprache" in text + or "deutsch/german" in text + ): + languages.append( + EpisodeLanguageRecord( + language="German Dub", + host_hints=host_hints, + ) + ) + return _dedupe_languages(languages) + + +def _host_hints_from_row(row: BeautifulSoup) -> list[str]: + names: list[str] = [] + for icon in row.select("i.icon"): + classes = [cls for cls in icon.get("class", []) if cls != "icon"] + if classes: + names.append(str(classes[-1])) + continue + title = str(icon.get("title") or "").strip() + if title: + names.append(title.replace("Hoster ", "").strip()) + return sorted(dict.fromkeys(name for name in names if name)) + + +def _parse_aniworld_season_rows(season) -> list[EpisodeRecord]: + soup = BeautifulSoup(season._html, "html.parser") + episodes: list[EpisodeRecord] = [] + for row in soup.select('tr[itemtype="http://schema.org/Episode"]'): + link = row.select_one('a[itemprop="url"]') + if link is None: + continue + href = str(link.get("href") or "").strip() + if not href: + continue + relative_path = _relative_path(href) + episode_number = 0 + number_meta = row.select_one('meta[itemprop="episodeNumber"]') + if number_meta is not None: + content = str(number_meta.get("content") or "").strip() + if content.isdigit(): + episode_number = int(content) + if episode_number <= 0: + match = re.search(r"(?:episode|film)-(\d+)", href) + if match: + episode_number = int(match.group(1)) + if episode_number <= 0: + continue + title_primary = None + title_secondary = None + title_cell = row.select_one("td.seasonEpisodeTitle") + if title_cell is not None: + strong = title_cell.select_one("strong") + span = title_cell.select_one("span") + title_primary = ( + strong.get_text(" ", strip=True) if strong is not None else None + ) + title_secondary = ( + span.get_text(" ", strip=True) if span is not None else None + ) + host_hints = _host_hints_from_row(row) + languages = _aniworld_languages_from_flags(host_hints, row) + episodes.append( + EpisodeRecord( + season=int(getattr(season, 
"season_number", 0) or 0), + episode=episode_number, + relative_path=relative_path, + title_primary=title_primary, + title_secondary=title_secondary, + media_type_hint="movie" + if getattr(season, "are_movies", False) + else "episode", + languages=languages, + ) + ) + return episodes + + +def _parse_sto_season_rows(season) -> list[EpisodeRecord]: + html = season._html + season_number = int(getattr(season, "season_number", 0) or 0) + pattern = re.compile( + r'href="(?P(?:https?://(?:serienstream|s)\.to)?/serie/[^"\s]+/staffel-' + + str(season_number) + + r"/episode-(?P\d+))/?\"" + ) + episodes: list[EpisodeRecord] = [] + seen: set[tuple[int, str]] = set() + for match in pattern.finditer(html): + episode_number = int(match.group("episode")) + href = match.group("href") + relative_path = _relative_path(href) + key = (episode_number, relative_path) + if key in seen: + continue + seen.add(key) + episodes.append( + EpisodeRecord( + season=season_number, + episode=episode_number, + relative_path=relative_path, + title_primary=None, + title_secondary=None, + media_type_hint="episode", + languages=[], + ) + ) + return episodes + + +def _score_episode_title(left: str, right: str) -> float: + from app.db import normalize_catalog_text + + a = normalize_catalog_text(left) + b = normalize_catalog_text(right) + if not a or not b: + return 0.0 + if a == b: + return 1.0 + return SequenceMatcher(None, a, b).ratio() + + +def _build_tv_canonical_payload( + *, + provider: str, + slug: str, + title: str, + aliases: list[str], + imdb_id: Optional[str], + mal_id: Optional[int], + episodes: list[EpisodeRecord], +) -> CanonicalPayload: + match = resolve_tv_canonical_match( + title=title, + aliases=aliases, + imdb_id=imdb_id, + tmdb_id=None, + ) + if match is None: + return CanonicalPayload() + + payload = match.payload + raw_episodes = payload.get("episodes") + if not isinstance(raw_episodes, list): + raw_episodes = [] + canonical_episodes: list[dict[str, Any]] = [] + for item in 
raw_episodes: + if not isinstance(item, dict): + continue + season_number = item.get("seasonNumber") + episode_number = item.get("episodeNumber") + episode_title = str(item.get("title") or "").strip() + if ( + not isinstance(season_number, int) + or not isinstance(episode_number, int) + or not episode_title + ): + continue + canonical_episodes.append( + { + "season": season_number, + "episode": episode_number, + "title": episode_title, + } + ) + + series_payload = { + "tvdb_id": match.tvdb_id, + "title": match.title, + "tmdb_id": payload.get("tmdbId") + if isinstance(payload.get("tmdbId"), int) + else None, + "imdb_id": imdb_id or str(payload.get("imdbId") or "").strip() or None, + "tvmaze_id": payload.get("tvMazeId") + if isinstance(payload.get("tvMazeId"), int) + else None, + "anilist_id": None, + "mal_id": mal_id, + "aliases": aliases, + } + series_mappings = [ + { + "tvdb_id": match.tvdb_id, + "confidence": match.confidence, + "source": match.source, + "rationale": match.rationale, + } + ] + + by_number = {(item["season"], item["episode"]): item for item in canonical_episodes} + by_season: dict[int, list[dict[str, Any]]] = {} + for item in canonical_episodes: + by_season.setdefault(int(item["season"]), []).append(item) + + episode_mappings: list[dict[str, Any]] = [] + for provider_episode in episodes: + direct = by_number.get((provider_episode.season, provider_episode.episode)) + if direct is not None: + episode_mappings.append( + { + "provider_season": provider_episode.season, + "provider_episode": provider_episode.episode, + "tvdb_id": match.tvdb_id, + "canonical_season": direct["season"], + "canonical_episode": direct["episode"], + "confidence": "confirmed", + "source": "direct_numbering", + "rationale": "season+episode match", + } + ) + continue + + candidate_pool = by_season.get(provider_episode.season, canonical_episodes) + scored: list[tuple[float, dict[str, Any]]] = [] + search_titles = [ + value + for value in [ + provider_episode.title_primary, + 
def _fallback_title_record(
    *,
    provider_key: str,
    slug: str,
    title: str,
    aliases: list[str],
) -> TitleRecord:
    """Build a detail-less ``TitleRecord`` from index data alone.

    The record has no episodes and an empty canonical payload. megakino
    titles are hinted as ``"movie"``, all others as ``"series"``. The
    relative path follows the provider's URL scheme and falls back to
    ``/<slug>``.
    """
    path_prefixes = {
        "aniworld.to": "/anime/stream/",
        "s.to": "/serie/",
    }
    path_prefix = path_prefixes.get(provider_key, "/")
    if provider_key == "megakino":
        hint = "movie"
    else:
        hint = "series"
    return TitleRecord(
        provider=provider_key,
        slug=slug,
        title=title,
        aliases=aliases,
        media_type_hint=hint,
        relative_path=f"{path_prefix}{slug}",
        episodes=[],
        canonical=CanonicalPayload(),
    )
None, +) -> list[TitleRecord]: + provider = get_provider(provider_key) + if provider is None: + return [] + + if provider_key == "megakino": + client = get_default_megakino_client() + entries = client.load_index() + if observer is not None and observer.on_index_loaded is not None: + observer.on_index_loaded(len(entries)) + rows = [ + TitleRecord( + provider=provider_key, + slug=entry.slug, + title=entry.slug.replace("-", " ").title(), + aliases=[], + media_type_hint="movie" if entry.kind == "film" else "series", + relative_path=_relative_path(entry.url), + episodes=[], + canonical=CanonicalPayload(), + ) + for entry in entries.values() + ] + rows.sort(key=lambda item: item.slug) + return rows + + logger.info("Provider catalog {}: loading title index", provider_key) + index = provider.load_or_refresh_index() + alternatives = provider.load_or_refresh_alternatives() + if observer is not None and observer.on_index_loaded is not None: + observer.on_index_loaded(len(index)) + rows = [] + for slug, title in index.items(): + aliases = list(dict.fromkeys(alternatives.get(slug, []) or [title])) + rows.append( + _fallback_title_record( + provider_key=provider_key, + slug=slug, + title=title, + aliases=aliases, + ) + ) + rows.sort(key=lambda item: item.slug) + return rows + + +def _crawl_aniworld_like_detail( + *, + provider_key: str, + slug: str, + title: str, + aliases: list[str], + cancel_event: Event | None = None, +) -> TitleRecord: + if cancel_event is not None and cancel_event.is_set(): + raise TimeoutError("title crawl cancelled before start") + base_url = str(CATALOG_SITE_CONFIGS[provider_key]["base_url"]).rstrip("/") + if provider_key == "aniworld.to": + from aniworld.models import AniworldSeries + + relative_root = f"/anime/stream/{slug}" + series = AniworldSeries(f"{base_url}{relative_root}") + else: + from aniworld.models import SerienstreamSeries + + relative_root = f"/serie/{slug}" + series = SerienstreamSeries(f"{base_url}{relative_root}") + + episodes: 
list[EpisodeRecord] = [] + for season in series.seasons: + if cancel_event is not None and cancel_event.is_set(): + raise TimeoutError(f"title crawl cancelled for {provider_key}:{slug}") + if provider_key == "aniworld.to": + episodes.extend(_parse_aniworld_season_rows(season)) + else: + episodes.extend(_parse_sto_season_rows(season)) + + return TitleRecord( + provider=provider_key, + slug=slug, + title=series.title or title, + aliases=aliases, + media_type_hint="series", + relative_path=relative_root, + episodes=episodes, + canonical=CanonicalPayload(), + ) + + +def crawl_provider_title_detail( + *, + provider_key: str, + slug: str, + title: str, + aliases: list[str], + timeout_seconds: float, +) -> TitleRecord: + if provider_key == "megakino": + return _fallback_title_record( + provider_key=provider_key, + slug=slug, + title=title, + aliases=aliases, + ) + return _run_with_timeout( + timeout_seconds, + _crawl_aniworld_like_detail, + provider_key=provider_key, + slug=slug, + title=title, + aliases=aliases, + ) + + +def resolve_provider_canonical( + *, + provider_key: str, + slug: str, + title: str, + aliases: list[str], + media_type_hint: str, + episodes: list[EpisodeRecord], + imdb_id: Optional[str] = None, + mal_id: Optional[int] = None, +) -> CanonicalPayload: + if provider_key == "megakino" or media_type_hint == "movie": + return CanonicalPayload() + return _build_tv_canonical_payload( + provider=provider_key, + slug=slug, + title=title, + aliases=aliases, + imdb_id=imdb_id, + mal_id=mal_id, + episodes=episodes, + ) + + +def _parse_megakino_page_metadata(url: str) -> tuple[str | None, int | None]: + base_url = get_megakino_base_url().rstrip("/") + response = http_get(url, timeout=20, headers={"Referer": base_url}) + response.raise_for_status() + soup = BeautifulSoup(response.text, "html.parser") + title = None + title_node = soup.find("h1") + if title_node: + title = title_node.get_text(" ", strip=True) + year = None + for node in soup.find_all(["a", "span", 
"div"]): + text = node.get_text(" ", strip=True) + if text and text.isdigit() and len(text) == 4 and 1900 <= int(text) <= 2100: + year = int(text) + break + return title, year + + +def crawl_provider_catalog( + provider_key: str, + *, + observer: CatalogCrawlObserver | None = None, +) -> list[TitleRecord]: + rows = load_provider_title_index(provider_key, observer=observer) + for row in rows: + if observer is not None and observer.on_title_crawled is not None: + observer.on_title_crawled(row.slug) + return rows diff --git a/apps/api/app/config.py b/apps/api/app/config.py index a79480d3..f66d331a 100644 --- a/apps/api/app/config.py +++ b/apps/api/app/config.py @@ -66,6 +66,21 @@ def _as_non_negative_int(val: str | None, default: int) -> int: return parsed +def _as_non_negative_float(val: str | None, default: float) -> float: + """Parse *val* as a non-negative float, returning *default* on failure.""" + if val is None: + return default + try: + parsed = float(val.strip()) + except TypeError, ValueError: + logger.warning("Invalid non-negative float value {!r}; using {}", val, default) + return default + if parsed < 0: + logger.warning("Negative value {} is not allowed; using {}", parsed, default) + return default + return parsed + + # Always-on public IP monitor. 
PUBLIC_IP_CHECK_ENABLED = _as_bool(os.getenv("PUBLIC_IP_CHECK_ENABLED", None), False) PUBLIC_IP_CHECK_INTERVAL_MIN = int(os.getenv("PUBLIC_IP_CHECK_INTERVAL_MIN", "30") or 0) @@ -188,6 +203,23 @@ def _ensure_dir(candidates: list[Path], label: str) -> Path: DOWNLOAD_DIR = _ensure_dir(download_candidates, "DOWNLOAD_DIR") DATA_DIR = _ensure_dir(data_candidates, "DATA_DIR") + +def _ensure_runtime_home() -> Path: + runtime_home = DATA_DIR / "home" + runtime_home.mkdir(parents=True, exist_ok=True) + current_home = os.environ.get("HOME", "").strip() + if not current_home or current_home == "/nonexistent": + os.environ["HOME"] = str(runtime_home) + os.environ.setdefault("XDG_CONFIG_HOME", str(runtime_home / ".config")) + os.environ.setdefault("XDG_CACHE_HOME", str(runtime_home / ".cache")) + Path(os.environ["XDG_CONFIG_HOME"]).mkdir(parents=True, exist_ok=True) + Path(os.environ["XDG_CACHE_HOME"]).mkdir(parents=True, exist_ok=True) + logger.debug("RUNTIME_HOME using: {}", os.environ["HOME"]) + return runtime_home + + +RUNTIME_HOME = _ensure_runtime_home() + # Optional override: path reported to clients (e.g. Sonarr) as qBittorrent save path. # Useful when AniBridge runs on host but Sonarr runs in a container with a different mount point. 
# Normalize to absolute for reporting if it points into container @@ -229,6 +261,99 @@ def _ensure_dir(candidates: list[Path], label: str) -> Path: MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN = int( os.getenv("MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN", "100") ) +PROVIDER_INDEX_REFRESH_HOURS = _as_non_negative_float( + os.getenv("PROVIDER_INDEX_REFRESH_HOURS"), 24.0 +) +PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD = _as_non_negative_float( + os.getenv( + "PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD", str(PROVIDER_INDEX_REFRESH_HOURS) + ), + PROVIDER_INDEX_REFRESH_HOURS, +) +PROVIDER_INDEX_REFRESH_HOURS_STO = _as_non_negative_float( + os.getenv("PROVIDER_INDEX_REFRESH_HOURS_STO", str(PROVIDER_INDEX_REFRESH_HOURS)), + PROVIDER_INDEX_REFRESH_HOURS, +) +PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO = _as_non_negative_float( + os.getenv( + "PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO", str(PROVIDER_INDEX_REFRESH_HOURS) + ), + PROVIDER_INDEX_REFRESH_HOURS, +) +PROVIDER_INDEX_SCHEDULER_POLL_SECONDS = _as_non_negative_int( + os.getenv("PROVIDER_INDEX_SCHEDULER_POLL_SECONDS"), 60 +) +if PROVIDER_INDEX_SCHEDULER_POLL_SECONDS < 5: + PROVIDER_INDEX_SCHEDULER_POLL_SECONDS = 5 +PROVIDER_INDEX_GLOBAL_CONCURRENCY = _as_non_negative_int( + os.getenv("PROVIDER_INDEX_GLOBAL_CONCURRENCY"), 1 +) +if PROVIDER_INDEX_GLOBAL_CONCURRENCY < 1: + PROVIDER_INDEX_GLOBAL_CONCURRENCY = 1 +PROVIDER_INDEX_CONCURRENCY_ANIWORLD = _as_non_negative_int( + os.getenv("PROVIDER_INDEX_CONCURRENCY_ANIWORLD"), 4 +) +if PROVIDER_INDEX_CONCURRENCY_ANIWORLD < 1: + PROVIDER_INDEX_CONCURRENCY_ANIWORLD = 1 +PROVIDER_INDEX_CONCURRENCY_STO = _as_non_negative_int( + os.getenv("PROVIDER_INDEX_CONCURRENCY_STO"), 4 +) +if PROVIDER_INDEX_CONCURRENCY_STO < 1: + PROVIDER_INDEX_CONCURRENCY_STO = 1 +PROVIDER_INDEX_CONCURRENCY_MEGAKINO = _as_non_negative_int( + os.getenv("PROVIDER_INDEX_CONCURRENCY_MEGAKINO"), 2 +) +if PROVIDER_INDEX_CONCURRENCY_MEGAKINO < 1: + PROVIDER_INDEX_CONCURRENCY_MEGAKINO = 1 +PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS = max( + 5, + 
_as_non_negative_int(os.getenv("PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS"), 45), +) +PROVIDER_INDEX_QUEUE_SIZE = max( + 1, + _as_non_negative_int(os.getenv("PROVIDER_INDEX_QUEUE_SIZE"), 8), +) +PROVIDER_INDEX_WRITER_BATCH_SIZE = max( + 1, + _as_non_negative_int(os.getenv("PROVIDER_INDEX_WRITER_BATCH_SIZE"), 32), +) +PROVIDER_INDEX_WRITER_FLUSH_SECONDS = max( + 0.1, + _as_non_negative_float(os.getenv("PROVIDER_INDEX_WRITER_FLUSH_SECONDS"), 1.0), +) +PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT = min( + 100.0, + max( + 0.0, + _as_non_negative_float( + os.getenv("PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT"), + 20.0, + ), + ), +) +PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS = max( + 1.0, + _as_non_negative_float( + os.getenv("PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS"), + 15.0, + ), +) +CANONICAL_INDEX_CONCURRENCY = max( + 1, + _as_non_negative_int(os.getenv("CANONICAL_INDEX_CONCURRENCY"), 2), +) +CANONICAL_CACHE_MEMORY_MAX_SEARCH = max( + 1, + _as_non_negative_int(os.getenv("CANONICAL_CACHE_MEMORY_MAX_SEARCH"), 512), +) +CANONICAL_CACHE_MEMORY_MAX_SHOW = max( + 1, + _as_non_negative_int(os.getenv("CANONICAL_CACHE_MEMORY_MAX_SHOW"), 256), +) +CANONICAL_CACHE_TTL_SECONDS = max( + 1, + _as_non_negative_int(os.getenv("CANONICAL_CACHE_TTL_SECONDS"), 3600), +) logger.debug( f"ANIWORLD_ALPHABET_HTML={ANIWORLD_ALPHABET_HTML}, ANIWORLD_ALPHABET_URL={ANIWORLD_ALPHABET_URL}" @@ -239,6 +364,40 @@ def _ensure_dir(candidates: list[Path], label: str) -> Path: logger.debug(f"MEGAKINO_BASE_URL={MEGAKINO_BASE_URL}") logger.debug(f"MEGAKINO_TITLES_REFRESH_HOURS={MEGAKINO_TITLES_REFRESH_HOURS}") logger.debug(f"MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN={MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN}") +logger.debug("PROVIDER_INDEX_REFRESH_HOURS={}", PROVIDER_INDEX_REFRESH_HOURS) +logger.debug( + "Provider index refresh overrides: aniworld={} sto={} megakino={}", + PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD, + PROVIDER_INDEX_REFRESH_HOURS_STO, + PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO, +) +logger.debug( + "Provider index 
scheduler: poll_seconds={} global_concurrency={} per_provider=({}, {}, {})", + PROVIDER_INDEX_SCHEDULER_POLL_SECONDS, + PROVIDER_INDEX_GLOBAL_CONCURRENCY, + PROVIDER_INDEX_CONCURRENCY_ANIWORLD, + PROVIDER_INDEX_CONCURRENCY_STO, + PROVIDER_INDEX_CONCURRENCY_MEGAKINO, +) +logger.debug( + "Provider index title timeout: {}s", + PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, +) +logger.debug( + "Provider index writer: queue_size={} batch_size={} flush_seconds={} failure_threshold_percent={} backpressure_log_seconds={}", + PROVIDER_INDEX_QUEUE_SIZE, + PROVIDER_INDEX_WRITER_BATCH_SIZE, + PROVIDER_INDEX_WRITER_FLUSH_SECONDS, + PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT, + PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS, +) +logger.debug( + "Canonical index/cache: concurrency={} search_cache_max={} show_cache_max={} ttl_seconds={}", + CANONICAL_INDEX_CONCURRENCY, + CANONICAL_CACHE_MEMORY_MAX_SEARCH, + CANONICAL_CACHE_MEMORY_MAX_SHOW, + CANONICAL_CACHE_TTL_SECONDS, +) # TTL (Stunden) für Live-Index; 0 = nie neu laden (nur einmal pro Prozess) ANIWORLD_TITLES_REFRESH_HOURS = float(os.getenv("ANIWORLD_TITLES_REFRESH_HOURS", "24")) @@ -266,6 +425,9 @@ def _ensure_dir(candidates: list[Path], label: str) -> Path: "alphabet_html": ANIWORLD_ALPHABET_HTML, "alphabet_url": ANIWORLD_ALPHABET_URL, "titles_refresh_hours": ANIWORLD_TITLES_REFRESH_HOURS, + "provider_index_refresh_hours": PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD, + "provider_index_concurrency": PROVIDER_INDEX_CONCURRENCY_ANIWORLD, + "provider_index_title_timeout_seconds": PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, "default_languages": ["German Dub", "German Sub", "English Sub"], "release_group": RELEASE_GROUP_ANIWORLD, }, @@ -274,6 +436,9 @@ def _ensure_dir(candidates: list[Path], label: str) -> Path: "alphabet_html": STO_ALPHABET_HTML, "alphabet_url": STO_ALPHABET_URL, "titles_refresh_hours": STO_TITLES_REFRESH_HOURS, + "provider_index_refresh_hours": PROVIDER_INDEX_REFRESH_HOURS_STO, + "provider_index_concurrency": PROVIDER_INDEX_CONCURRENCY_STO, + 
"provider_index_title_timeout_seconds": PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, "default_languages": ["German Dub", "English Dub"], "release_group": RELEASE_GROUP_STO, }, @@ -282,6 +447,9 @@ def _ensure_dir(candidates: list[Path], label: str) -> Path: "alphabet_html": None, "alphabet_url": None, "titles_refresh_hours": MEGAKINO_TITLES_REFRESH_HOURS, + "provider_index_refresh_hours": PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO, + "provider_index_concurrency": PROVIDER_INDEX_CONCURRENCY_MEGAKINO, + "provider_index_title_timeout_seconds": PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS, "default_languages": ["Deutsch", "German Dub"], "release_group": "megakino", }, @@ -354,11 +522,32 @@ def _normalize_video_host_name(name: str) -> str | None: PROVIDER_REDIRECT_TIMEOUT_SECONDS = 1 logger.debug("PROVIDER_REDIRECT_TIMEOUT_SECONDS={}", PROVIDER_REDIRECT_TIMEOUT_SECONDS) +try: + PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS = float( + os.getenv("PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", "15") + ) +except ValueError: + PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS = 15.0 +if PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS <= 0: + PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS = 15.0 +logger.debug( + "PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS={}", + PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS, +) + PROVIDER_REDIRECT_RETRIES = _as_non_negative_int( os.getenv("PROVIDER_REDIRECT_RETRIES"), 2 ) logger.debug("PROVIDER_REDIRECT_RETRIES={}", PROVIDER_REDIRECT_RETRIES) +JOB_PROGRESS_FLUSH_SECONDS = _as_non_negative_float( + os.getenv("JOB_PROGRESS_FLUSH_SECONDS"), + 0.5, +) +if JOB_PROGRESS_FLUSH_SECONDS <= 0: + JOB_PROGRESS_FLUSH_SECONDS = 0.5 +logger.debug("JOB_PROGRESS_FLUSH_SECONDS={}", JOB_PROGRESS_FLUSH_SECONDS) + PROVIDER_CHALLENGE_BACKOFF_SECONDS = _as_non_negative_int( os.getenv("PROVIDER_CHALLENGE_BACKOFF_SECONDS"), 300 ) diff --git a/apps/api/app/core/downloader/download.py b/apps/api/app/core/downloader/download.py index f7b11970..51ccc728 100644 --- a/apps/api/app/core/downloader/download.py +++ b/apps/api/app/core/downloader/download.py @@ 
-172,7 +172,12 @@ def _attempt_download( direct, chosen = get_direct_url_with_fallback( ep, preferred=provider, language=language ) - logger.info("Chosen provider: {}, direct URL: {}", chosen, direct) + logger.info( + "Resolved initial download host: preferred={} resolved={} direct_url={}", + provider, + chosen, + "", + ) base_hint = title_hint if not base_hint and slug and season is not None and episode is not None: @@ -193,7 +198,11 @@ def _attempt_download( ) except Exception as exc: msg = str(exc) - logger.warning("Primary download failed: {}", msg) + logger.warning( + "Download failed after resolving host {}: {}", + chosen, + msg, + ) tried_alt = False providers_left = [ @@ -206,7 +215,13 @@ def _attempt_download( direct3, chosen3 = get_direct_url_with_fallback( ep, preferred=provider_name, language=language ) - logger.info("Retrying download via alternate provider {}", chosen3) + logger.info( + "Retrying download after {} failed: next_preferred={} resolved={} direct_url={}", + chosen, + provider_name, + chosen3, + "", + ) temp_path, info = _ydl_download( direct3, dest_dir, @@ -219,7 +234,8 @@ def _attempt_download( break except Exception as exc3: logger.warning( - "Alternate provider {} failed to download: {}", + "Retry attempt failed: next_preferred={} resolved_or_attempted={} error={}", + provider_name, provider_name, exc3, ) diff --git a/apps/api/app/core/downloader/provider_resolution.py b/apps/api/app/core/downloader/provider_resolution.py index 74185941..ea5162c2 100644 --- a/apps/api/app/core/downloader/provider_resolution.py +++ b/apps/api/app/core/downloader/provider_resolution.py @@ -1,11 +1,13 @@ from __future__ import annotations import re +import threading +import weakref from typing import List, Optional, Tuple, TYPE_CHECKING from loguru import logger -from app.config import PROVIDER_ORDER +from app.config import PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS, PROVIDER_ORDER from .errors import DownloadError, LanguageUnavailableError from .language import 
normalize_language @@ -13,6 +15,107 @@ from aniworld.models import Episode _AVAIL_RE = re.compile(r"Available languages:\s*\[([^\]]*)\]", re.IGNORECASE) +_DIRECT_LINK_TIMEOUT_LOCK = threading.Lock() +_DIRECT_LINK_TIMED_OUT_WORKERS: weakref.WeakKeyDictionary[ + object, dict[tuple[str, str], list[threading.Thread]] +] = weakref.WeakKeyDictionary() +_DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID: dict[ + int, dict[tuple[str, str], list[threading.Thread]] +] = {} + + +class DirectLinkTimeoutError(TimeoutError): + def __init__(self, message: str, worker: threading.Thread) -> None: + super().__init__(message) + self.worker = worker + + +def _provider_attempt_has_active_timed_out_worker( + ep: object, + provider_name: str, + language: str, +) -> bool: + key = (provider_name, language) + with _DIRECT_LINK_TIMEOUT_LOCK: + try: + workers_by_key = _DIRECT_LINK_TIMED_OUT_WORKERS.get(ep, {}) + except TypeError: + workers_by_key = _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID.get(id(ep), {}) + workers = workers_by_key.get(key, []) + active_workers = [worker for worker in workers if worker.is_alive()] + if active_workers: + workers_by_key[key] = active_workers + try: + _DIRECT_LINK_TIMED_OUT_WORKERS[ep] = workers_by_key + except TypeError: + _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID[id(ep)] = workers_by_key + return True + workers_by_key.pop(key, None) + try: + if workers_by_key: + _DIRECT_LINK_TIMED_OUT_WORKERS[ep] = workers_by_key + else: + _DIRECT_LINK_TIMED_OUT_WORKERS.pop(ep, None) + except TypeError: + if workers_by_key: + _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID[id(ep)] = workers_by_key + else: + _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID.pop(id(ep), None) + return False + + +def _track_timed_out_worker( + ep: object, + *, + provider_name: str, + language: str, + worker: threading.Thread, +) -> None: + key = (provider_name, language) + with _DIRECT_LINK_TIMEOUT_LOCK: + try: + workers_by_key = _DIRECT_LINK_TIMED_OUT_WORKERS.get(ep, {}) + except TypeError: + workers_by_key = 
_DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID.get(id(ep), {}) + existing_workers = workers_by_key.get(key, []) + workers = [existing for existing in existing_workers if existing.is_alive()] + workers.append(worker) + workers_by_key[key] = workers + try: + _DIRECT_LINK_TIMED_OUT_WORKERS[ep] = workers_by_key + except TypeError: + _DIRECT_LINK_TIMED_OUT_WORKERS_BY_ID[id(ep)] = workers_by_key + + +def _run_with_timeout( + callback, + *args, + timeout_seconds: float, + operation: str, +): + outcome: dict[str, object] = {} + + def _target() -> None: + try: + outcome["result"] = callback(*args) + except BaseException as exc: # pragma: no cover - re-raised on caller thread + outcome["error"] = exc + + worker = threading.Thread( + target=_target, + name=f"{operation}-timeout-guard", + daemon=True, + ) + worker.start() + worker.join(timeout=max(0.001, timeout_seconds)) + if worker.is_alive(): + raise DirectLinkTimeoutError( + f"{operation} timed out after {timeout_seconds:.1f}s", + worker, + ) + if "error" in outcome: + raise outcome["error"] # type: ignore[misc] + return outcome.get("result") def _parse_available_languages_from_error(msg: str) -> List[str]: @@ -58,9 +161,21 @@ def _try_get_direct(ep: Episode, provider_name: str, language: str) -> Optional[ LanguageUnavailableError: If the provider reports the requested language is not offered; the exception contains the list of available languages. 
""" language = normalize_language(language) + if _provider_attempt_has_active_timed_out_worker(ep, provider_name, language): + logger.warning( + "Skipping provider '{}' because a timed-out direct-link lookup is still running for this episode and language.", + provider_name, + ) + return None logger.info("Trying provider '{}' for language '{}'", provider_name, language) try: - url = ep.get_direct_link(provider_name, language) # Lib-API + url = _run_with_timeout( + ep.get_direct_link, + provider_name, + language, + timeout_seconds=PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS, + operation=f"{provider_name.lower()}-direct-link", + ) if url: logger.success( "Found direct URL from provider '{}': {}", provider_name, url @@ -69,6 +184,13 @@ def _try_get_direct(ep: Episode, provider_name: str, language: str) -> Optional[ logger.warning("Provider '{}' returned no URL.", provider_name) except Exception as exc: msg = str(exc) + if isinstance(exc, DirectLinkTimeoutError): + _track_timed_out_worker( + ep, + provider_name=provider_name, + language=language, + worker=exc.worker, + ) if "No provider found for language" in msg: available = _parse_available_languages_from_error(msg) logger.error( diff --git a/apps/api/app/core/lifespan.py b/apps/api/app/core/lifespan.py index 0c0c0631..0185cc15 100644 --- a/apps/api/app/core/lifespan.py +++ b/apps/api/app/core/lifespan.py @@ -30,6 +30,7 @@ ) from app.core.scheduler import init_executor, shutdown_executor +from app.catalog import get_catalog_indexer from app.db import ( engine, dispose_engine, @@ -129,16 +130,20 @@ async def lifespan(app: FastAPI): if cleaned: logger.warning(f"Reset {cleaned} dangling jobs to 'failed'") init_executor() - - # Start background workers - cleanup_stop = threading.Event() - ip_stop = threading.Event() - megakino_stop = threading.Event() if "megakino" in CATALOG_SITE_CONFIGS and not ANIBRIDGE_TEST_MODE: try: resolve_megakino_base_url() except Exception as e: logger.warning(f"megakino domain resolution failed: {e}") 
+ try: + get_catalog_indexer().start() + except Exception as e: + logger.warning("provider catalog indexer start failed: {}", e) + + # Start background workers + cleanup_stop = threading.Event() + ip_stop = threading.Event() + megakino_stop = threading.Event() try: _start_ttl_cleanup_thread(cleanup_stop) except Exception as e: @@ -159,6 +164,10 @@ async def lifespan(app: FastAPI): finally: # Shutdown services shutdown_executor() + try: + get_catalog_indexer().stop() + except Exception as e: + logger.warning("provider catalog indexer stop failed: {}", e) try: from app.api import strm as strm_api diff --git a/apps/api/app/core/scheduler.py b/apps/api/app/core/scheduler.py index 25eeb87b..fcb2c203 100644 --- a/apps/api/app/core/scheduler.py +++ b/apps/api/app/core/scheduler.py @@ -1,12 +1,18 @@ from __future__ import annotations import threading from concurrent.futures import ThreadPoolExecutor, Future -from typing import Dict, Tuple, Optional +from dataclasses import dataclass +from typing import Dict, Optional, Tuple from loguru import logger from sqlmodel import Session import errno -from app.config import MAX_CONCURRENCY, DOWNLOAD_DIR, STRM_PROXY_MODE +from app.config import ( + DOWNLOAD_DIR, + JOB_PROGRESS_FLUSH_SECONDS, + MAX_CONCURRENCY, + STRM_PROXY_MODE, +) from app.utils.strm import allocate_unique_strm_path, build_strm_content from app.core.strm_proxy import StrmIdentity, resolve_direct_url, build_stream_url from app.utils.terminal import ( @@ -23,10 +29,82 @@ # global executor + registry EXECUTOR: Optional[ThreadPoolExecutor] = None -RUNNING: Dict[str, Tuple[Future, threading.Event]] = {} +RUNNING: Dict[str, Tuple[Future | None, threading.Event]] = {} RUNNING_LOCK = threading.Lock() +@dataclass(slots=True) +class JobProgressSnapshot: + downloaded_bytes: int + total_bytes: int | None + speed: float | None + eta: int | None + progress: float + + +class JobProgressWriter: + def __init__(self, job_id: str, flush_interval_seconds: float) -> None: + self._job_id 
= job_id + self._flush_interval_seconds = flush_interval_seconds + self._lock = threading.Lock() + self._wake_event = threading.Event() + self._stop_event = threading.Event() + self._pending: JobProgressSnapshot | None = None + self._thread = threading.Thread( + target=self._run, + name=f"job-progress-writer-{job_id}", + daemon=True, + ) + + def start(self) -> None: + self._thread.start() + + def publish(self, snapshot: JobProgressSnapshot) -> None: + with self._lock: + self._pending = snapshot + self._wake_event.set() + + def close(self, *, flush: bool) -> None: + self._stop_event.set() + if not flush: + with self._lock: + self._pending = None + self._wake_event.set() + self._thread.join(timeout=5) + + def _drain_pending(self) -> JobProgressSnapshot | None: + with self._lock: + snapshot = self._pending + self._pending = None + return snapshot + + def _flush_snapshot(self, snapshot: JobProgressSnapshot) -> None: + with Session(engine) as session: + update_job( + session, + self._job_id, + status="downloading", + downloaded_bytes=snapshot.downloaded_bytes, + total_bytes=snapshot.total_bytes, + speed=snapshot.speed, + eta=snapshot.eta, + progress=snapshot.progress, + ) + + def _run(self) -> None: + while True: + self._wake_event.wait(self._flush_interval_seconds) + self._wake_event.clear() + snapshot = self._drain_pending() + if snapshot is not None: + self._flush_snapshot(snapshot) + if self._stop_event.is_set(): + final_snapshot = self._drain_pending() + if final_snapshot is not None: + self._flush_snapshot(final_snapshot) + return + + def init_executor() -> None: global EXECUTOR if EXECUTOR is None: @@ -49,70 +127,73 @@ def shutdown_executor() -> None: def _progress_updater(job_id: str, stop_event: threading.Event): - from sqlmodel import Session - from app.db import engine, update_job - reporter: ProgressReporter | None = None last_db_n = -1 + callback_lock = threading.Lock() + writer = JobProgressWriter( + job_id=job_id, + 
flush_interval_seconds=JOB_PROGRESS_FLUSH_SECONDS, + ) + writer.start() def _cb(d: dict): nonlocal reporter, last_db_n - if stop_event.is_set(): - if reporter: - reporter.close() - raise Exception("Cancelled") - - status = d.get("status") - downloaded = int(d.get("downloaded_bytes") or 0) - total = d.get("total_bytes") or d.get("total_bytes_estimate") - speed = d.get("speed") - eta = d.get("eta") - - # Initialize reporter lazily (label contains job id) - if reporter is None: - reporter = ProgressReporter(label=f"Job {job_id}") - - # Render progress to terminal (TTY bar or stepped logs) - reporter.update( - ProgressSnapshot( - downloaded=downloaded, - total=int(total) if total else None, - speed=float(speed) if speed else None, - eta=int(eta) if eta else None, - status=str(status) if status else None, + with callback_lock: + if stop_event.is_set(): + if reporter: + reporter.close() + raise Exception("Cancelled") + + status = d.get("status") + downloaded = int(d.get("downloaded_bytes") or 0) + total = d.get("total_bytes") or d.get("total_bytes_estimate") + speed = d.get("speed") + eta = d.get("eta") + total_i = int(total) if total else None + speed_f = float(speed) if speed else None + eta_i = int(eta) if eta else None + + if reporter is None: + reporter = ProgressReporter(label=f"Job {job_id}") + + reporter.update( + ProgressSnapshot( + downloaded=downloaded, + total=total_i, + speed=speed_f, + eta=eta_i, + status=str(status) if status else None, + ) ) - ) - - # Throttle DB writes to ~1% steps (or on finish) - progress = 0.0 - should_write = True - if total: - try: - total_i = int(total) - step = max(1, total_i // 100) - should_write = downloaded == total_i or downloaded // step != last_db_n - last_db_n = downloaded // step - progress = max(0.0, min(100.0, downloaded / total_i * 100.0)) - except Exception: - should_write = True - if should_write: - with Session(engine) as s: - update_job( - s, - job_id, - status="downloading" if status != "finished" else 
"downloading", - downloaded_bytes=downloaded, - total_bytes=int(total) if total else None, - speed=float(speed) if speed else None, - eta=int(eta) if eta else None, - progress=progress, + progress = 0.0 + should_write = True + if total_i: + try: + step = max(1, total_i // 100) + should_write = ( + downloaded == total_i or downloaded // step != last_db_n + ) + last_db_n = downloaded // step + progress = max(0.0, min(100.0, downloaded / total_i * 100.0)) + except Exception: + should_write = True + + if should_write: + writer.publish( + JobProgressSnapshot( + downloaded_bytes=downloaded, + total_bytes=total_i, + speed=speed_f, + eta=eta_i, + progress=progress, + ) ) - if status == "finished" and reporter is not None: - reporter.close() + if status == "finished" and reporter is not None: + reporter.close() - return _cb + return _cb, writer def _run_download(job_id: str, req: dict, stop_event: threading.Event): @@ -136,6 +217,7 @@ def _run_download(job_id: str, req: dict, stop_event: threading.Event): - 'site' (optional, defaults to "aniworld.to") stop_event (threading.Event): Event that, when set, requests cancellation of the download. 
""" + progress_cb, progress_writer = _progress_updater(job_id, stop_event) try: with Session(engine) as s: site = req.get("site", "aniworld.to") @@ -152,16 +234,18 @@ def _run_download(job_id: str, req: dict, stop_event: threading.Event): ), dest_dir=DOWNLOAD_DIR, title_hint=req.get("title_hint"), - progress_cb=_progress_updater(job_id, stop_event), + progress_cb=progress_cb, stop_event=stop_event, site=req.get("site", "aniworld.to"), ) + progress_writer.close(flush=True) with Session(engine) as s: update_job( s, job_id, status="completed", progress=100.0, result_path=str(dest) ) except OSError as e: + progress_writer.close(flush=False) with Session(engine) as s: if e.errno in (errno.EACCES, errno.EROFS): update_job( @@ -173,6 +257,7 @@ def _run_download(job_id: str, req: dict, stop_event: threading.Event): else: update_job(s, job_id, status="failed", message=str(e)) except Exception as e: + progress_writer.close(flush=False) msg = str(e) status = "failed" if "Cancel" in msg or "cancel" in msg: @@ -309,7 +394,42 @@ def _run_strm(job_id: str, req: dict, stop_event: threading.Event) -> None: RUNNING.pop(job_id, None) -def schedule_download(req: dict) -> str: +def start_scheduled_job(job_id: str, req: dict) -> None: + init_executor() + if EXECUTOR is None: + raise RuntimeError("executor not available") + + stop_event = threading.Event() + mode = str(req.get("mode") or "").strip().lower() + runner = _run_strm if mode == "strm" else _run_download + with RUNNING_LOCK: + if job_id in RUNNING: + raise RuntimeError(f"job already running: {job_id}") + RUNNING[job_id] = (None, stop_event) + try: + fut = EXECUTOR.submit(runner, job_id, req, stop_event) + except Exception: + with RUNNING_LOCK: + current = RUNNING.get(job_id) + if current is not None and current[1] is stop_event: + RUNNING.pop(job_id, None) + raise + with RUNNING_LOCK: + current = RUNNING.get(job_id) + if current is None or current[1] is not stop_event: + stop_event.set() + fut.cancel() + return + 
RUNNING[job_id] = (fut, stop_event) + + +def create_scheduled_job(req: dict) -> str: + with Session(engine) as s: + job = create_job(s, source_site=req.get("site") or "aniworld.to") + return job.id + + +def schedule_download(req: dict, *, autostart: bool = True) -> str: """ Schedule a background download job and return its job identifier. @@ -330,20 +450,10 @@ def schedule_download(req: dict) -> str: Raises: RuntimeError: If the thread pool executor is unavailable after initialization. """ - init_executor() - if EXECUTOR is None: - raise RuntimeError("executor not available") - - with Session(engine) as s: - job = create_job(s, source_site=req.get("site") or "aniworld.to") - - stop_event = threading.Event() - mode = str(req.get("mode") or "").strip().lower() - runner = _run_strm if mode == "strm" else _run_download - fut = EXECUTOR.submit(runner, job.id, req, stop_event) - with RUNNING_LOCK: - RUNNING[job.id] = (fut, stop_event) - return job.id + job_id = create_scheduled_job(req) + if autostart: + start_scheduled_job(job_id, req) + return job_id def cancel_job(job_id: str) -> None: @@ -353,4 +463,10 @@ def cancel_job(job_id: str) -> None: return fut, ev = item ev.set() + if fut is None: + with RUNNING_LOCK: + current = RUNNING.get(job_id) + if current is not None and current[1] is ev and current[0] is None: + RUNNING.pop(job_id, None) + return fut.cancel() diff --git a/apps/api/app/db/__init__.py b/apps/api/app/db/__init__.py index 379ec44e..71c12b5e 100644 --- a/apps/api/app/db/__init__.py +++ b/apps/api/app/db/__init__.py @@ -3,9 +3,14 @@ This package contains the SQLite/SQLModel data models and related utilities that used to live in `app/models.py`. Functionality and public API are preserved; imports should now use `from app.db import ...`. +Runtime export is intentionally dynamic; see `__init__.pyi` for Pylance food. """ from . 
import models as _models +# TODO: Clean up this runtime facade and static type hinting +# It's a bit hacky but has never caused any issues +# we should replace the dynamic globals().update(...) with a normal star import +# or eventually explicitly define what this package exports __all__ = [name for name in dir(_models) if not name.startswith("_")] globals().update({name: getattr(_models, name) for name in __all__}) diff --git a/apps/api/app/db/__init__.pyi b/apps/api/app/db/__init__.pyi new file mode 100644 index 00000000..61cf7267 --- /dev/null +++ b/apps/api/app/db/__init__.pyi @@ -0,0 +1,6 @@ +"""Static mirror of the dynamic exports in `__init__.py`. + +Python is fine with `globals().update(...)`; Pylance needs a little snack. +""" + +from .models import * # noqa: F403 diff --git a/apps/api/app/db/migrations/versions/20260428_0004_provider_catalog_index.py b/apps/api/app/db/migrations/versions/20260428_0004_provider_catalog_index.py new file mode 100644 index 00000000..cc38710e --- /dev/null +++ b/apps/api/app/db/migrations/versions/20260428_0004_provider_catalog_index.py @@ -0,0 +1,610 @@ +"""Add provider catalog index and canonical mapping tables + +Revision ID: 20260428_0004 +Revises: 20260204_0003 +Create Date: 2026-04-28 00:00:00.000000 +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + +revision = "20260428_0004" +down_revision = "20260204_0003" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + conn = op.get_bind() + inspector = sa.inspect(conn) + + if not inspector.has_table("providerindexstatus"): + op.create_table( + "providerindexstatus", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("refresh_interval_hours", sa.Float(), nullable=False), + sa.Column("status", sa.String(), nullable=False), + sa.Column("current_generation", sa.String(), nullable=True), + sa.Column("latest_started_at", 
sa.DateTime(), nullable=True), + sa.Column("latest_completed_at", sa.DateTime(), nullable=True), + sa.Column("latest_success_at", sa.DateTime(), nullable=True), + sa.Column("next_refresh_after", sa.DateTime(), nullable=True), + sa.Column("bootstrap_completed", sa.Boolean(), nullable=False), + sa.Column("failure_count", sa.Integer(), nullable=False), + sa.Column("last_error_summary", sa.String(), nullable=True), + sa.Column("cursor_title_slug", sa.String(), nullable=True), + sa.Column("updated_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint("provider", name="pk_providerindexstatus"), + ) + op.create_index( + "ix_providerindexstatus_status", + "providerindexstatus", + ["status"], + unique=False, + ) + op.create_index( + "ix_providerindexstatus_latest_started_at", + "providerindexstatus", + ["latest_started_at"], + unique=False, + ) + op.create_index( + "ix_providerindexstatus_latest_completed_at", + "providerindexstatus", + ["latest_completed_at"], + unique=False, + ) + op.create_index( + "ix_providerindexstatus_latest_success_at", + "providerindexstatus", + ["latest_success_at"], + unique=False, + ) + op.create_index( + "ix_providerindexstatus_next_refresh_after", + "providerindexstatus", + ["next_refresh_after"], + unique=False, + ) + op.create_index( + "ix_providerindexstatus_bootstrap_completed", + "providerindexstatus", + ["bootstrap_completed"], + unique=False, + ) + op.create_index( + "ix_providerindexstatus_updated_at", + "providerindexstatus", + ["updated_at"], + unique=False, + ) + + if not inspector.has_table("providertitleindexstate"): + op.create_table( + "providertitleindexstate", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("last_attempted_at", sa.DateTime(), nullable=True), + sa.Column("last_success_at", sa.DateTime(), nullable=True), + sa.Column("failure_count", sa.Integer(), nullable=False), + sa.Column("last_error_summary", sa.String(), nullable=True), + 
sa.Column("updated_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", "slug", name="pk_providertitleindexstate" + ), + ) + op.create_index( + "ix_providertitleindexstate_last_attempted_at", + "providertitleindexstate", + ["last_attempted_at"], + unique=False, + ) + op.create_index( + "ix_providertitleindexstate_last_success_at", + "providertitleindexstate", + ["last_success_at"], + unique=False, + ) + op.create_index( + "ix_providertitleindexstate_updated_at", + "providertitleindexstate", + ["updated_at"], + unique=False, + ) + + if not inspector.has_table("providercatalogtitle"): + op.create_table( + "providercatalogtitle", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("title", sa.String(), nullable=False), + sa.Column("normalized_title", sa.String(), nullable=False), + sa.Column("media_type_hint", sa.String(), nullable=False), + sa.Column("relative_path", sa.String(), nullable=False), + sa.Column("indexed_generation", sa.String(), nullable=False), + sa.Column("last_indexed_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint("provider", "slug", name="pk_providercatalogtitle"), + ) + op.create_index( + "ix_providercatalogtitle_title", + "providercatalogtitle", + ["title"], + unique=False, + ) + op.create_index( + "ix_providercatalogtitle_normalized_title", + "providercatalogtitle", + ["normalized_title"], + unique=False, + ) + op.create_index( + "ix_providercatalogtitle_media_type_hint", + "providercatalogtitle", + ["media_type_hint"], + unique=False, + ) + op.create_index( + "ix_providercatalogtitle_indexed_generation", + "providercatalogtitle", + ["indexed_generation"], + unique=False, + ) + op.create_index( + "ix_providercatalogtitle_last_indexed_at", + "providercatalogtitle", + ["last_indexed_at"], + unique=False, + ) + + if not inspector.has_table("providercatalogalias"): + op.create_table( + "providercatalogalias", + sa.Column("provider", sa.String(), 
nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("alias", sa.String(), nullable=False), + sa.Column("normalized_alias", sa.String(), nullable=False), + sa.Column("indexed_generation", sa.String(), nullable=False), + sa.Column("last_indexed_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", "slug", "alias", name="pk_providercatalogalias" + ), + ) + op.create_index( + "ix_providercatalogalias_normalized_alias", + "providercatalogalias", + ["normalized_alias"], + unique=False, + ) + op.create_index( + "ix_providercatalogalias_indexed_generation", + "providercatalogalias", + ["indexed_generation"], + unique=False, + ) + op.create_index( + "ix_providercatalogalias_last_indexed_at", + "providercatalogalias", + ["last_indexed_at"], + unique=False, + ) + + if not inspector.has_table("providercatalogepisode"): + op.create_table( + "providercatalogepisode", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("season", sa.Integer(), nullable=False), + sa.Column("episode", sa.Integer(), nullable=False), + sa.Column("title_primary", sa.String(), nullable=True), + sa.Column("title_secondary", sa.String(), nullable=True), + sa.Column("relative_path", sa.String(), nullable=False), + sa.Column("media_type_hint", sa.String(), nullable=False), + sa.Column("indexed_generation", sa.String(), nullable=False), + sa.Column("last_indexed_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", + "slug", + "season", + "episode", + name="pk_providercatalogepisode", + ), + ) + op.create_index( + "ix_providercatalogepisode_media_type_hint", + "providercatalogepisode", + ["media_type_hint"], + unique=False, + ) + op.create_index( + "ix_providercatalogepisode_indexed_generation", + "providercatalogepisode", + ["indexed_generation"], + unique=False, + ) + op.create_index( + "ix_providercatalogepisode_last_indexed_at", + "providercatalogepisode", + 
["last_indexed_at"], + unique=False, + ) + + if not inspector.has_table("providerepisodelanguage"): + op.create_table( + "providerepisodelanguage", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("season", sa.Integer(), nullable=False), + sa.Column("episode", sa.Integer(), nullable=False), + sa.Column("language", sa.String(), nullable=False), + sa.Column("normalized_language", sa.String(), nullable=False), + sa.Column("host_hints", sa.JSON(), nullable=True), + sa.Column("indexed_generation", sa.String(), nullable=False), + sa.Column("last_indexed_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", + "slug", + "season", + "episode", + "language", + name="pk_providerepisodelanguage", + ), + ) + op.create_index( + "ix_providerepisodelanguage_normalized_language", + "providerepisodelanguage", + ["normalized_language"], + unique=False, + ) + op.create_index( + "ix_providerepisodelanguage_indexed_generation", + "providerepisodelanguage", + ["indexed_generation"], + unique=False, + ) + op.create_index( + "ix_providerepisodelanguage_last_indexed_at", + "providerepisodelanguage", + ["last_indexed_at"], + unique=False, + ) + + if not inspector.has_table("canonicalseries"): + op.create_table( + "canonicalseries", + sa.Column("tvdb_id", sa.Integer(), nullable=False), + sa.Column("title", sa.String(), nullable=False), + sa.Column("normalized_title", sa.String(), nullable=False), + sa.Column("tmdb_id", sa.Integer(), nullable=True), + sa.Column("imdb_id", sa.String(), nullable=True), + sa.Column("tvmaze_id", sa.Integer(), nullable=True), + sa.Column("anilist_id", sa.Integer(), nullable=True), + sa.Column("mal_id", sa.Integer(), nullable=True), + sa.Column("last_synced_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint("tvdb_id", name="pk_canonicalseries"), + ) + op.create_index( + "ix_canonicalseries_title", "canonicalseries", ["title"], unique=False + ) + op.create_index( + 
"ix_canonicalseries_normalized_title", + "canonicalseries", + ["normalized_title"], + unique=False, + ) + op.create_index( + "ix_canonicalseries_tmdb_id", "canonicalseries", ["tmdb_id"], unique=False + ) + op.create_index( + "ix_canonicalseries_imdb_id", "canonicalseries", ["imdb_id"], unique=False + ) + op.create_index( + "ix_canonicalseries_tvmaze_id", + "canonicalseries", + ["tvmaze_id"], + unique=False, + ) + op.create_index( + "ix_canonicalseries_anilist_id", + "canonicalseries", + ["anilist_id"], + unique=False, + ) + op.create_index( + "ix_canonicalseries_mal_id", "canonicalseries", ["mal_id"], unique=False + ) + op.create_index( + "ix_canonicalseries_last_synced_at", + "canonicalseries", + ["last_synced_at"], + unique=False, + ) + + if not inspector.has_table("canonicalseriesalias"): + op.create_table( + "canonicalseriesalias", + sa.Column("tvdb_id", sa.Integer(), nullable=False), + sa.Column("alias", sa.String(), nullable=False), + sa.Column("normalized_alias", sa.String(), nullable=False), + sa.PrimaryKeyConstraint("tvdb_id", "alias", name="pk_canonicalseriesalias"), + ) + op.create_index( + "ix_canonicalseriesalias_normalized_alias", + "canonicalseriesalias", + ["normalized_alias"], + unique=False, + ) + + if not inspector.has_table("canonicalepisode"): + op.create_table( + "canonicalepisode", + sa.Column("tvdb_id", sa.Integer(), nullable=False), + sa.Column("season", sa.Integer(), nullable=False), + sa.Column("episode", sa.Integer(), nullable=False), + sa.Column("title", sa.String(), nullable=False), + sa.Column("normalized_title", sa.String(), nullable=False), + sa.Column("last_synced_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "tvdb_id", "season", "episode", name="pk_canonicalepisode" + ), + ) + op.create_index( + "ix_canonicalepisode_title", + "canonicalepisode", + ["title"], + unique=False, + ) + op.create_index( + "ix_canonicalepisode_normalized_title", + "canonicalepisode", + ["normalized_title"], + unique=False, + ) + 
op.create_index( + "ix_canonicalepisode_last_synced_at", + "canonicalepisode", + ["last_synced_at"], + unique=False, + ) + + if not inspector.has_table("canonicalmovie"): + op.create_table( + "canonicalmovie", + sa.Column("tmdb_id", sa.Integer(), nullable=False), + sa.Column("title", sa.String(), nullable=False), + sa.Column("normalized_title", sa.String(), nullable=False), + sa.Column("release_year", sa.Integer(), nullable=False), + sa.Column("imdb_id", sa.String(), nullable=True), + sa.Column("tvdb_id", sa.Integer(), nullable=True), + sa.Column("last_synced_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint("tmdb_id", name="pk_canonicalmovie"), + ) + op.create_index( + "ix_canonicalmovie_title", "canonicalmovie", ["title"], unique=False + ) + op.create_index( + "ix_canonicalmovie_normalized_title", + "canonicalmovie", + ["normalized_title"], + unique=False, + ) + op.create_index( + "ix_canonicalmovie_release_year", + "canonicalmovie", + ["release_year"], + unique=False, + ) + op.create_index( + "ix_canonicalmovie_imdb_id", "canonicalmovie", ["imdb_id"], unique=False + ) + op.create_index( + "ix_canonicalmovie_tvdb_id", "canonicalmovie", ["tvdb_id"], unique=False + ) + op.create_index( + "ix_canonicalmovie_last_synced_at", + "canonicalmovie", + ["last_synced_at"], + unique=False, + ) + + if not inspector.has_table("providerseriesmapping"): + op.create_table( + "providerseriesmapping", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("tvdb_id", sa.Integer(), nullable=False), + sa.Column("confidence", sa.String(), nullable=False), + sa.Column("source", sa.String(), nullable=False), + sa.Column("rationale", sa.String(), nullable=True), + sa.Column("last_verified_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", "slug", "tvdb_id", name="pk_providerseriesmapping" + ), + ) + op.create_index( + "ix_providerseriesmapping_confidence", + "providerseriesmapping", + 
["confidence"], + unique=False, + ) + op.create_index( + "ix_providerseriesmapping_source", + "providerseriesmapping", + ["source"], + unique=False, + ) + op.create_index( + "ix_providerseriesmapping_last_verified_at", + "providerseriesmapping", + ["last_verified_at"], + unique=False, + ) + + if not inspector.has_table("providerepisodemapping"): + op.create_table( + "providerepisodemapping", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("provider_season", sa.Integer(), nullable=False), + sa.Column("provider_episode", sa.Integer(), nullable=False), + sa.Column("tvdb_id", sa.Integer(), nullable=False), + sa.Column("canonical_season", sa.Integer(), nullable=False), + sa.Column("canonical_episode", sa.Integer(), nullable=False), + sa.Column("confidence", sa.String(), nullable=False), + sa.Column("source", sa.String(), nullable=False), + sa.Column("rationale", sa.String(), nullable=True), + sa.Column("last_verified_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", + "slug", + "provider_season", + "provider_episode", + "tvdb_id", + "canonical_season", + "canonical_episode", + name="pk_providerepisodemapping", + ), + ) + op.create_index( + "ix_providerepisodemapping_confidence", + "providerepisodemapping", + ["confidence"], + unique=False, + ) + op.create_index( + "ix_providerepisodemapping_source", + "providerepisodemapping", + ["source"], + unique=False, + ) + op.create_index( + "ix_providerepisodemapping_last_verified_at", + "providerepisodemapping", + ["last_verified_at"], + unique=False, + ) + + if not inspector.has_table("providermoviemapping"): + op.create_table( + "providermoviemapping", + sa.Column("provider", sa.String(), nullable=False), + sa.Column("slug", sa.String(), nullable=False), + sa.Column("tmdb_id", sa.Integer(), nullable=False), + sa.Column("confidence", sa.String(), nullable=False), + sa.Column("source", sa.String(), nullable=False), + 
sa.Column("rationale", sa.String(), nullable=True), + sa.Column("last_verified_at", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint( + "provider", "slug", "tmdb_id", name="pk_providermoviemapping" + ), + ) + op.create_index( + "ix_providermoviemapping_confidence", + "providermoviemapping", + ["confidence"], + unique=False, + ) + op.create_index( + "ix_providermoviemapping_source", + "providermoviemapping", + ["source"], + unique=False, + ) + op.create_index( + "ix_providermoviemapping_last_verified_at", + "providermoviemapping", + ["last_verified_at"], + unique=False, + ) + + +def downgrade() -> None: + conn = op.get_bind() + inspector = sa.inspect(conn) + + table_indexes = { + "providermoviemapping": [ + "ix_providermoviemapping_last_verified_at", + "ix_providermoviemapping_source", + "ix_providermoviemapping_confidence", + ], + "providerepisodemapping": [ + "ix_providerepisodemapping_last_verified_at", + "ix_providerepisodemapping_source", + "ix_providerepisodemapping_confidence", + ], + "providerseriesmapping": [ + "ix_providerseriesmapping_last_verified_at", + "ix_providerseriesmapping_source", + "ix_providerseriesmapping_confidence", + ], + "canonicalmovie": [ + "ix_canonicalmovie_last_synced_at", + "ix_canonicalmovie_tvdb_id", + "ix_canonicalmovie_imdb_id", + "ix_canonicalmovie_release_year", + "ix_canonicalmovie_normalized_title", + "ix_canonicalmovie_title", + ], + "canonicalepisode": [ + "ix_canonicalepisode_last_synced_at", + "ix_canonicalepisode_normalized_title", + "ix_canonicalepisode_title", + ], + "canonicalseriesalias": ["ix_canonicalseriesalias_normalized_alias"], + "canonicalseries": [ + "ix_canonicalseries_last_synced_at", + "ix_canonicalseries_mal_id", + "ix_canonicalseries_anilist_id", + "ix_canonicalseries_tvmaze_id", + "ix_canonicalseries_imdb_id", + "ix_canonicalseries_tmdb_id", + "ix_canonicalseries_normalized_title", + "ix_canonicalseries_title", + ], + "providerepisodelanguage": [ + "ix_providerepisodelanguage_last_indexed_at", 
+ "ix_providerepisodelanguage_indexed_generation", + "ix_providerepisodelanguage_normalized_language", + ], + "providercatalogepisode": [ + "ix_providercatalogepisode_last_indexed_at", + "ix_providercatalogepisode_indexed_generation", + "ix_providercatalogepisode_media_type_hint", + ], + "providercatalogalias": [ + "ix_providercatalogalias_last_indexed_at", + "ix_providercatalogalias_indexed_generation", + "ix_providercatalogalias_normalized_alias", + ], + "providercatalogtitle": [ + "ix_providercatalogtitle_last_indexed_at", + "ix_providercatalogtitle_indexed_generation", + "ix_providercatalogtitle_media_type_hint", + "ix_providercatalogtitle_normalized_title", + "ix_providercatalogtitle_title", + ], + "providertitleindexstate": [ + "ix_providertitleindexstate_updated_at", + "ix_providertitleindexstate_last_success_at", + "ix_providertitleindexstate_last_attempted_at", + ], + "providerindexstatus": [ + "ix_providerindexstatus_updated_at", + "ix_providerindexstatus_bootstrap_completed", + "ix_providerindexstatus_next_refresh_after", + "ix_providerindexstatus_latest_success_at", + "ix_providerindexstatus_latest_completed_at", + "ix_providerindexstatus_latest_started_at", + "ix_providerindexstatus_status", + ], + } + for table, indexes in table_indexes.items(): + if not inspector.has_table(table): + continue + for index in indexes: + op.drop_index(index, table_name=table) + op.drop_table(table) diff --git a/apps/api/app/db/migrations/versions/20260429_0005_provider_mapping_generations.py b/apps/api/app/db/migrations/versions/20260429_0005_provider_mapping_generations.py new file mode 100644 index 00000000..68164112 --- /dev/null +++ b/apps/api/app/db/migrations/versions/20260429_0005_provider_mapping_generations.py @@ -0,0 +1,420 @@ +"""Make provider mappings generation-aware + +Revision ID: 20260429_0005 +Revises: 20260428_0004 +Create Date: 2026-04-29 00:00:00.000000 +""" + +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + + 
+revision = "20260429_0005" +down_revision = "20260428_0004" +branch_labels = None +depends_on = None + + +def _rebuild_provider_mapping_table( + *, + table_name: str, + create_sql: str, + copy_sql: str, + index_sql: list[str], +) -> None: + conn = op.get_bind() + inspector = sa.inspect(conn) + if not inspector.has_table(table_name): + return + columns = {column["name"] for column in inspector.get_columns(table_name)} + if "indexed_generation" in columns: + return + + temp_table = f"{table_name}_v2" + op.execute(sa.text(f"DROP TABLE IF EXISTS {temp_table}")) + op.execute(sa.text(create_sql)) + op.execute(sa.text(copy_sql)) + op.drop_table(table_name) + op.rename_table(temp_table, table_name) + for statement in index_sql: + op.execute(sa.text(statement)) + + +def upgrade() -> None: + _rebuild_provider_mapping_table( + table_name="providerseriesmapping", + create_sql=""" + CREATE TABLE providerseriesmapping_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + tvdb_id INTEGER NOT NULL, + indexed_generation VARCHAR NOT NULL, + confidence VARCHAR NOT NULL, + source VARCHAR NOT NULL, + rationale VARCHAR, + last_verified_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, tvdb_id, indexed_generation) + ) + """, + copy_sql=""" + INSERT INTO providerseriesmapping_v2 ( + provider, + slug, + tvdb_id, + indexed_generation, + confidence, + source, + rationale, + last_verified_at + ) + SELECT + mapping.provider, + mapping.slug, + mapping.tvdb_id, + COALESCE(status.latest_success_generation, status.current_generation, 'legacy'), + mapping.confidence, + mapping.source, + mapping.rationale, + mapping.last_verified_at + FROM providerseriesmapping AS mapping + LEFT JOIN providerindexstatus AS status + ON status.provider = mapping.provider + """, + index_sql=[ + "CREATE INDEX ix_providerseriesmapping_confidence ON providerseriesmapping (confidence)", + "CREATE INDEX ix_providerseriesmapping_source ON providerseriesmapping (source)", + "CREATE INDEX 
ix_providerseriesmapping_last_verified_at ON providerseriesmapping (last_verified_at)", + "CREATE INDEX ix_providerseriesmapping_indexed_generation ON providerseriesmapping (indexed_generation)", + ], + ) + _rebuild_provider_mapping_table( + table_name="providerepisodemapping", + create_sql=""" + CREATE TABLE providerepisodemapping_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + provider_season INTEGER NOT NULL, + provider_episode INTEGER NOT NULL, + tvdb_id INTEGER NOT NULL, + canonical_season INTEGER NOT NULL, + canonical_episode INTEGER NOT NULL, + indexed_generation VARCHAR NOT NULL, + confidence VARCHAR NOT NULL, + source VARCHAR NOT NULL, + rationale VARCHAR, + last_verified_at DATETIME NOT NULL, + PRIMARY KEY ( + provider, + slug, + provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode, + indexed_generation + ) + ) + """, + copy_sql=""" + INSERT INTO providerepisodemapping_v2 ( + provider, + slug, + provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode, + indexed_generation, + confidence, + source, + rationale, + last_verified_at + ) + SELECT + mapping.provider, + mapping.slug, + mapping.provider_season, + mapping.provider_episode, + mapping.tvdb_id, + mapping.canonical_season, + mapping.canonical_episode, + COALESCE(status.latest_success_generation, status.current_generation, 'legacy'), + mapping.confidence, + mapping.source, + mapping.rationale, + mapping.last_verified_at + FROM providerepisodemapping AS mapping + LEFT JOIN providerindexstatus AS status + ON status.provider = mapping.provider + """, + index_sql=[ + "CREATE INDEX ix_providerepisodemapping_confidence ON providerepisodemapping (confidence)", + "CREATE INDEX ix_providerepisodemapping_source ON providerepisodemapping (source)", + "CREATE INDEX ix_providerepisodemapping_last_verified_at ON providerepisodemapping (last_verified_at)", + "CREATE INDEX ix_providerepisodemapping_indexed_generation ON 
providerepisodemapping (indexed_generation)", + ], + ) + _rebuild_provider_mapping_table( + table_name="providermoviemapping", + create_sql=""" + CREATE TABLE providermoviemapping_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + tmdb_id INTEGER NOT NULL, + indexed_generation VARCHAR NOT NULL, + confidence VARCHAR NOT NULL, + source VARCHAR NOT NULL, + rationale VARCHAR, + last_verified_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, tmdb_id, indexed_generation) + ) + """, + copy_sql=""" + INSERT INTO providermoviemapping_v2 ( + provider, + slug, + tmdb_id, + indexed_generation, + confidence, + source, + rationale, + last_verified_at + ) + SELECT + mapping.provider, + mapping.slug, + mapping.tmdb_id, + COALESCE(status.latest_success_generation, status.current_generation, 'legacy'), + mapping.confidence, + mapping.source, + mapping.rationale, + mapping.last_verified_at + FROM providermoviemapping AS mapping + LEFT JOIN providerindexstatus AS status + ON status.provider = mapping.provider + """, + index_sql=[ + "CREATE INDEX ix_providermoviemapping_confidence ON providermoviemapping (confidence)", + "CREATE INDEX ix_providermoviemapping_source ON providermoviemapping (source)", + "CREATE INDEX ix_providermoviemapping_last_verified_at ON providermoviemapping (last_verified_at)", + "CREATE INDEX ix_providermoviemapping_indexed_generation ON providermoviemapping (indexed_generation)", + ], + ) + + +def downgrade() -> None: + conn = op.get_bind() + inspector = sa.inspect(conn) + + downgrade_specs = { + "providerseriesmapping": { + "create_sql": """ + CREATE TABLE providerseriesmapping_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + tvdb_id INTEGER NOT NULL, + confidence VARCHAR NOT NULL, + source VARCHAR NOT NULL, + rationale VARCHAR, + last_verified_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, tvdb_id) + ) + """, + "copy_sql": """ + INSERT INTO providerseriesmapping_v2 ( + provider, + slug, + tvdb_id, + confidence, + source, + 
rationale, + last_verified_at + ) + SELECT + provider, + slug, + tvdb_id, + confidence, + source, + rationale, + last_verified_at + FROM ( + SELECT + provider, + slug, + tvdb_id, + confidence, + source, + rationale, + last_verified_at, + ROW_NUMBER() OVER ( + PARTITION BY provider, slug, tvdb_id + ORDER BY indexed_generation DESC, last_verified_at DESC + ) AS rn + FROM providerseriesmapping + ) + WHERE rn = 1 + """, + "indexes": [ + "CREATE INDEX ix_providerseriesmapping_confidence ON providerseriesmapping (confidence)", + "CREATE INDEX ix_providerseriesmapping_source ON providerseriesmapping (source)", + "CREATE INDEX ix_providerseriesmapping_last_verified_at ON providerseriesmapping (last_verified_at)", + ], + }, + "providerepisodemapping": { + "create_sql": """ + CREATE TABLE providerepisodemapping_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + provider_season INTEGER NOT NULL, + provider_episode INTEGER NOT NULL, + tvdb_id INTEGER NOT NULL, + canonical_season INTEGER NOT NULL, + canonical_episode INTEGER NOT NULL, + confidence VARCHAR NOT NULL, + source VARCHAR NOT NULL, + rationale VARCHAR, + last_verified_at DATETIME NOT NULL, + PRIMARY KEY ( + provider, + slug, + provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode + ) + ) + """, + "copy_sql": """ + INSERT INTO providerepisodemapping_v2 ( + provider, + slug, + provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode, + confidence, + source, + rationale, + last_verified_at + ) + SELECT + provider, + slug, + provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode, + confidence, + source, + rationale, + last_verified_at + FROM ( + SELECT + provider, + slug, + provider_season, + provider_episode, + tvdb_id, + canonical_season, + canonical_episode, + confidence, + source, + rationale, + last_verified_at, + ROW_NUMBER() OVER ( + PARTITION BY + provider, + slug, + provider_season, + provider_episode, + 
tvdb_id, + canonical_season, + canonical_episode + ORDER BY indexed_generation DESC, last_verified_at DESC + ) AS rn + FROM providerepisodemapping + ) + WHERE rn = 1 + """, + "indexes": [ + "CREATE INDEX ix_providerepisodemapping_confidence ON providerepisodemapping (confidence)", + "CREATE INDEX ix_providerepisodemapping_source ON providerepisodemapping (source)", + "CREATE INDEX ix_providerepisodemapping_last_verified_at ON providerepisodemapping (last_verified_at)", + ], + }, + "providermoviemapping": { + "create_sql": """ + CREATE TABLE providermoviemapping_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + tmdb_id INTEGER NOT NULL, + confidence VARCHAR NOT NULL, + source VARCHAR NOT NULL, + rationale VARCHAR, + last_verified_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, tmdb_id) + ) + """, + "copy_sql": """ + INSERT INTO providermoviemapping_v2 ( + provider, + slug, + tmdb_id, + confidence, + source, + rationale, + last_verified_at + ) + SELECT + provider, + slug, + tmdb_id, + confidence, + source, + rationale, + last_verified_at + FROM ( + SELECT + provider, + slug, + tmdb_id, + confidence, + source, + rationale, + last_verified_at, + ROW_NUMBER() OVER ( + PARTITION BY provider, slug, tmdb_id + ORDER BY indexed_generation DESC, last_verified_at DESC + ) AS rn + FROM providermoviemapping + ) + WHERE rn = 1 + """, + "indexes": [ + "CREATE INDEX ix_providermoviemapping_confidence ON providermoviemapping (confidence)", + "CREATE INDEX ix_providermoviemapping_source ON providermoviemapping (source)", + "CREATE INDEX ix_providermoviemapping_last_verified_at ON providermoviemapping (last_verified_at)", + ], + }, + } + + for table_name, spec in downgrade_specs.items(): + if not inspector.has_table(table_name): + continue + columns = {column["name"] for column in inspector.get_columns(table_name)} + if "indexed_generation" not in columns: + continue + temp_table = f"{table_name}_v2" + op.execute(sa.text(f"DROP TABLE IF EXISTS {temp_table}")) + 
def _has_column(inspector, table_name: str, column_name: str) -> bool:
    """Return True when ``table_name`` already contains ``column_name``."""
    return column_name in {
        column["name"] for column in inspector.get_columns(table_name)
    }


def _add_missing_columns(inspector, table_name: str, column_specs) -> None:
    """Add each listed column (and its index) that ``table_name`` lacks.

    ``column_specs`` is an iterable of ``(name, column_type, server_default,
    indexed)`` tuples. A ``server_default`` of ``None`` produces a nullable
    column; any other value is stringified into the server default and the
    column is created NOT NULL. When ``indexed`` is true a non-unique index
    named ``ix_<table>_<column>`` is created together with the column.

    Nothing happens when the table does not exist, and columns that are
    already present are skipped, so the migration can be re-run safely.
    """
    if not inspector.has_table(table_name):
        return
    with op.batch_alter_table(table_name) as batch_op:
        for name, column_type, server_default, indexed in column_specs:
            if _has_column(inspector, table_name, name):
                continue
            if server_default is None:
                column = sa.Column(name, column_type, nullable=True)
            else:
                # Existing rows need a value, hence the server-side default.
                column = sa.Column(
                    name,
                    column_type,
                    nullable=False,
                    server_default=str(server_default),
                )
            batch_op.add_column(column)
            if indexed:
                batch_op.create_index(
                    f"ix_{table_name}_{name}",
                    [name],
                    unique=False,
                )


def upgrade() -> None:
    """Add the staged-refresh columns to the provider index tables."""
    inspector = sa.inspect(op.get_bind())

    # Every new providerindexstatus column is indexed.
    _add_missing_columns(
        inspector,
        "providerindexstatus",
        [
            ("active_stage", sa.String(), None, True),
            ("title_index_status", sa.String(), "pending", True),
            ("title_index_ready_at", sa.DateTime(), None, True),
            ("title_index_next_retry_after", sa.DateTime(), None, True),
            ("detail_enrichment_status", sa.String(), "pending", True),
            ("detail_ready_at", sa.DateTime(), None, True),
            ("detail_next_retry_after", sa.DateTime(), None, True),
            ("canonical_enrichment_status", sa.String(), "pending", True),
            ("canonical_ready_at", sa.DateTime(), None, True),
            ("canonical_next_retry_after", sa.DateTime(), None, True),
        ],
    )

    # Per-title state: status and timestamp columns are indexed, while the
    # failure counters and error summaries are not.
    _add_missing_columns(
        inspector,
        "providertitleindexstate",
        [
            ("detail_status", sa.String(), "pending", True),
            ("detail_last_attempted_at", sa.DateTime(), None, True),
            ("detail_last_success_at", sa.DateTime(), None, True),
            ("detail_next_retry_after", sa.DateTime(), None, True),
            ("detail_failure_count", sa.Integer(), 0, False),
            ("detail_last_error_summary", sa.String(), None, False),
            ("canonical_status", sa.String(), "pending", True),
            ("canonical_last_attempted_at", sa.DateTime(), None, True),
            ("canonical_last_success_at", sa.DateTime(), None, True),
            ("canonical_next_retry_after", sa.DateTime(), None, True),
            ("canonical_failure_count", sa.Integer(), 0, False),
            ("canonical_last_error_summary", sa.String(), None, False),
        ],
    )
"ix_providerindexstatus_canonical_enrichment_status", + "ix_providerindexstatus_detail_next_retry_after", + "ix_providerindexstatus_detail_ready_at", + "ix_providerindexstatus_detail_enrichment_status", + "ix_providerindexstatus_title_index_next_retry_after", + "ix_providerindexstatus_title_index_ready_at", + "ix_providerindexstatus_title_index_status", + "ix_providerindexstatus_active_stage", + ] + providerindexstatus_columns = [ + "canonical_next_retry_after", + "canonical_ready_at", + "canonical_enrichment_status", + "detail_next_retry_after", + "detail_ready_at", + "detail_enrichment_status", + "title_index_next_retry_after", + "title_index_ready_at", + "title_index_status", + "active_stage", + ] + if inspector.has_table("providerindexstatus"): + existing_columns = { + column["name"] for column in inspector.get_columns("providerindexstatus") + } + existing_indexes = { + index["name"] for index in inspector.get_indexes("providerindexstatus") + } + with op.batch_alter_table("providerindexstatus") as batch_op: + for index_name in providerindexstatus_indexes: + if index_name in existing_indexes: + batch_op.drop_index(index_name) + for column_name in providerindexstatus_columns: + if column_name in existing_columns: + batch_op.drop_column(column_name) + + staged_columns = [ + "canonical_last_error_summary", + "canonical_failure_count", + "canonical_next_retry_after", + "canonical_last_success_at", + "canonical_last_attempted_at", + "canonical_status", + "detail_last_error_summary", + "detail_failure_count", + "detail_next_retry_after", + "detail_last_success_at", + "detail_last_attempted_at", + "detail_status", + ] + if inspector.has_table("providertitleindexstate"): + existing_columns = { + column["name"] + for column in inspector.get_columns("providertitleindexstate") + } + existing_indexes = { + index["name"] for index in inspector.get_indexes("providertitleindexstate") + } + with op.batch_alter_table("providertitleindexstate") as batch_op: + for column_name in 
def _rebuild_table(
    *,
    table_name: str,
    temp_table: str,
    create_sql: str,
    copy_sql: str,
    index_sql: list[str],
    require_generation_in_pk: bool,
) -> None:
    """Recreate ``table_name`` through ``temp_table`` to change its primary key.

    The rebuild is skipped when the table does not exist, or when the presence
    of ``indexed_generation`` in its primary key already matches
    ``require_generation_in_pk``. Otherwise the temp table is (re)created,
    filled by ``copy_sql``, swapped in under the original name, and the
    statements in ``index_sql`` are executed last.
    """
    inspector = sa.inspect(op.get_bind())
    if not inspector.has_table(table_name):
        return

    pk_info = inspector.get_pk_constraint(table_name) or {}
    key_columns = pk_info.get("constrained_columns") or []
    generation_in_pk = "indexed_generation" in key_columns
    if generation_in_pk is require_generation_in_pk:
        # Already in the requested shape; nothing to rebuild.
        return

    # Clear any leftover temp table from an interrupted run, then build and
    # populate the replacement.
    for statement in (
        f"DROP TABLE IF EXISTS {temp_table}",
        create_sql,
        copy_sql,
    ):
        op.execute(sa.text(statement))

    op.drop_table(table_name)
    op.rename_table(temp_table, table_name)

    # Indexes are created only after the rename, against the final table name.
    for index_statement in index_sql:
        op.execute(sa.text(index_statement))
+ relative_path VARCHAR NOT NULL, + last_indexed_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, indexed_generation) + ) + """, + "copy_sql": """ + INSERT INTO providercatalogtitle_v2 ( + provider, + slug, + indexed_generation, + title, + normalized_title, + media_type_hint, + relative_path, + last_indexed_at + ) + SELECT + provider, + slug, + indexed_generation, + title, + normalized_title, + media_type_hint, + relative_path, + last_indexed_at + FROM providercatalogtitle + """, + "index_sql": [ + "CREATE INDEX ix_providercatalogtitle_title ON providercatalogtitle (title)", + "CREATE INDEX ix_providercatalogtitle_normalized_title ON providercatalogtitle (normalized_title)", + "CREATE INDEX ix_providercatalogtitle_media_type_hint ON providercatalogtitle (media_type_hint)", + "CREATE INDEX ix_providercatalogtitle_indexed_generation ON providercatalogtitle (indexed_generation)", + "CREATE INDEX ix_providercatalogtitle_last_indexed_at ON providercatalogtitle (last_indexed_at)", + ], + }, + "providercatalogalias": { + "temp_table": "providercatalogalias_v2", + "create_sql": """ + CREATE TABLE providercatalogalias_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + alias VARCHAR NOT NULL, + indexed_generation VARCHAR NOT NULL, + normalized_alias VARCHAR NOT NULL, + last_indexed_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, alias, indexed_generation) + ) + """, + "copy_sql": """ + INSERT INTO providercatalogalias_v2 ( + provider, + slug, + alias, + indexed_generation, + normalized_alias, + last_indexed_at + ) + SELECT + provider, + slug, + alias, + indexed_generation, + normalized_alias, + last_indexed_at + FROM providercatalogalias + """, + "index_sql": [ + "CREATE INDEX ix_providercatalogalias_normalized_alias ON providercatalogalias (normalized_alias)", + "CREATE INDEX ix_providercatalogalias_indexed_generation ON providercatalogalias (indexed_generation)", + "CREATE INDEX ix_providercatalogalias_last_indexed_at ON providercatalogalias 
(last_indexed_at)", + ], + }, + "providercatalogepisode": { + "temp_table": "providercatalogepisode_v2", + "create_sql": """ + CREATE TABLE providercatalogepisode_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + season INTEGER NOT NULL, + episode INTEGER NOT NULL, + indexed_generation VARCHAR NOT NULL, + title_primary VARCHAR, + title_secondary VARCHAR, + relative_path VARCHAR NOT NULL, + media_type_hint VARCHAR NOT NULL, + last_indexed_at DATETIME NOT NULL, + PRIMARY KEY (provider, slug, season, episode, indexed_generation) + ) + """, + "copy_sql": """ + INSERT INTO providercatalogepisode_v2 ( + provider, + slug, + season, + episode, + indexed_generation, + title_primary, + title_secondary, + relative_path, + media_type_hint, + last_indexed_at + ) + SELECT + provider, + slug, + season, + episode, + indexed_generation, + title_primary, + title_secondary, + relative_path, + media_type_hint, + last_indexed_at + FROM providercatalogepisode + """, + "index_sql": [ + "CREATE INDEX ix_providercatalogepisode_media_type_hint ON providercatalogepisode (media_type_hint)", + "CREATE INDEX ix_providercatalogepisode_indexed_generation ON providercatalogepisode (indexed_generation)", + "CREATE INDEX ix_providercatalogepisode_last_indexed_at ON providercatalogepisode (last_indexed_at)", + ], + }, + "providerepisodelanguage": { + "temp_table": "providerepisodelanguage_v2", + "create_sql": """ + CREATE TABLE providerepisodelanguage_v2 ( + provider VARCHAR NOT NULL, + slug VARCHAR NOT NULL, + season INTEGER NOT NULL, + episode INTEGER NOT NULL, + language VARCHAR NOT NULL, + indexed_generation VARCHAR NOT NULL, + normalized_language VARCHAR NOT NULL, + host_hints JSON, + last_indexed_at DATETIME NOT NULL, + PRIMARY KEY ( + provider, + slug, + season, + episode, + language, + indexed_generation + ) + ) + """, + "copy_sql": """ + INSERT INTO providerepisodelanguage_v2 ( + provider, + slug, + season, + episode, + language, + indexed_generation, + normalized_language, + 
def _legacy_table_spec(
    table_name: str,
    columns: list[tuple[str, str]],
    key_columns: list[str],
    indexed_columns: list[str],
) -> dict:
    """Build the ``_rebuild_table`` arguments for one pre-generation table.

    ``columns`` lists ``(name, ddl)`` pairs in the order they appear in the
    rebuilt table, ``key_columns`` is the primary key without
    ``indexed_generation``, and ``indexed_columns`` are the single-column
    indexes to recreate as ``ix_<table>_<column>``.

    Because the narrower key can no longer hold several generations of one
    row, the copy statement keeps a single row per key: the one ranked first
    by ``indexed_generation DESC, last_indexed_at DESC``.
    """
    temp_table = f"{table_name}_v2"
    column_names = ", ".join(name for name, _ in columns)
    column_defs = ", ".join(f"{name} {ddl}" for name, ddl in columns)
    key_list = ", ".join(key_columns)

    create_sql = (
        f"CREATE TABLE {temp_table} "
        f"({column_defs}, PRIMARY KEY ({key_list}))"
    )
    copy_sql = (
        f"INSERT INTO {temp_table} ({column_names}) "
        f"SELECT {column_names} FROM ("
        f"SELECT {column_names}, ROW_NUMBER() OVER ("
        f"PARTITION BY {key_list} "
        "ORDER BY indexed_generation DESC, last_indexed_at DESC"
        f") AS rn FROM {table_name}"
        ") WHERE rn = 1"
    )
    index_sql = [
        f"CREATE INDEX ix_{table_name}_{column} ON {table_name} ({column})"
        for column in indexed_columns
    ]
    return {
        "temp_table": temp_table,
        "create_sql": create_sql,
        "copy_sql": copy_sql,
        "index_sql": index_sql,
    }


def downgrade() -> None:
    """Rebuild the catalog tables without ``indexed_generation`` in the key."""
    text = "VARCHAR NOT NULL"
    number = "INTEGER NOT NULL"
    stamp = "DATETIME NOT NULL"

    specs = {
        "providercatalogtitle": _legacy_table_spec(
            "providercatalogtitle",
            [
                ("provider", text),
                ("slug", text),
                ("title", text),
                ("normalized_title", text),
                ("media_type_hint", text),
                ("relative_path", text),
                ("indexed_generation", text),
                ("last_indexed_at", stamp),
            ],
            ["provider", "slug"],
            [
                "title",
                "normalized_title",
                "media_type_hint",
                "indexed_generation",
                "last_indexed_at",
            ],
        ),
        "providercatalogalias": _legacy_table_spec(
            "providercatalogalias",
            [
                ("provider", text),
                ("slug", text),
                ("alias", text),
                ("normalized_alias", text),
                ("indexed_generation", text),
                ("last_indexed_at", stamp),
            ],
            ["provider", "slug", "alias"],
            ["normalized_alias", "indexed_generation", "last_indexed_at"],
        ),
        "providercatalogepisode": _legacy_table_spec(
            "providercatalogepisode",
            [
                ("provider", text),
                ("slug", text),
                ("season", number),
                ("episode", number),
                ("title_primary", "VARCHAR"),
                ("title_secondary", "VARCHAR"),
                ("relative_path", text),
                ("media_type_hint", text),
                ("indexed_generation", text),
                ("last_indexed_at", stamp),
            ],
            ["provider", "slug", "season", "episode"],
            ["media_type_hint", "indexed_generation", "last_indexed_at"],
        ),
        "providerepisodelanguage": _legacy_table_spec(
            "providerepisodelanguage",
            [
                ("provider", text),
                ("slug", text),
                ("season", number),
                ("episode", number),
                ("language", text),
                ("normalized_language", text),
                ("host_hints", "JSON"),
                ("indexed_generation", text),
                ("last_indexed_at", stamp),
            ],
            ["provider", "slug", "season", "episode", "language"],
            ["normalized_language", "indexed_generation", "last_indexed_at"],
        ),
    }

    for table_name, spec in specs.items():
        _rebuild_table(
            table_name=table_name,
            require_generation_in_pk=False,
            **spec,
        )
def _clienttask_columns():
    """Return the ``clienttask`` column names, or ``None`` if the table is absent.

    Checking ``has_table`` first avoids the reflection error that
    ``get_columns`` raises for a missing table, matching how the earlier
    provider-index migrations skip tables that do not exist.
    """
    inspector = sa.inspect(op.get_bind())
    if not inspector.has_table("clienttask"):
        return None
    return {column["name"] for column in inspector.get_columns("clienttask")}


def upgrade() -> None:
    """Add the nullable ``provider`` and ``mode`` columns to ``clienttask``."""
    existing_columns = _clienttask_columns()
    if existing_columns is None:
        return
    with op.batch_alter_table("clienttask") as batch_op:
        for name in ("provider", "mode"):
            # Skip columns that are already present so a re-run is harmless.
            if name not in existing_columns:
                batch_op.add_column(sa.Column(name, sa.String(), nullable=True))


def downgrade() -> None:
    """Drop ``mode`` and ``provider`` from ``clienttask`` when present."""
    existing_columns = _clienttask_columns()
    if existing_columns is None:
        return
    with op.batch_alter_table("clienttask") as batch_op:
        # Reverse of the order used in upgrade().
        for name in ("mode", "provider"):
            if name in existing_columns:
                batch_op.drop_column(name)
def utcnow() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)


def as_aware_utc(dt: Optional[datetime]) -> datetime:
    """Coerce ``dt`` to a timezone-aware UTC datetime.

    ``None`` yields the current UTC time, a naive value is labelled as UTC
    without shifting its clock fields, and an aware value is converted to UTC.
    """
    moment = utcnow() if dt is None else dt
    if moment.tzinfo is None:
        # Naive input is taken to already be expressed in UTC.
        return moment.replace(tzinfo=timezone.utc)
    return moment.astimezone(timezone.utc)
title_index_ready_at: Optional[datetime] = Field(default=None, index=True) + title_index_next_retry_after: Optional[datetime] = Field(default=None, index=True) + detail_enrichment_status: str = Field(default="pending", index=True) + detail_ready_at: Optional[datetime] = Field(default=None, index=True) + detail_next_retry_after: Optional[datetime] = Field(default=None, index=True) + canonical_enrichment_status: str = Field(default="pending", index=True) + canonical_ready_at: Optional[datetime] = Field(default=None, index=True) + canonical_next_retry_after: Optional[datetime] = Field( + default=None, + index=True, + ) + failure_count: int = 0 + last_error_summary: Optional[str] = None + cursor_title_slug: Optional[str] = None + updated_at: datetime = Field(default_factory=utcnow, index=True) + + +class ProviderTitleIndexState(ModelBase, table=True): + provider: str = Field(primary_key=True) + slug: str = Field(primary_key=True) + last_attempted_at: Optional[datetime] = Field(default=None, index=True) + last_success_at: Optional[datetime] = Field(default=None, index=True) + failure_count: int = 0 + last_error_summary: Optional[str] = None + detail_status: str = Field(default="pending", index=True) + detail_last_attempted_at: Optional[datetime] = Field(default=None, index=True) + detail_last_success_at: Optional[datetime] = Field(default=None, index=True) + detail_next_retry_after: Optional[datetime] = Field(default=None, index=True) + detail_failure_count: int = 0 + detail_last_error_summary: Optional[str] = None + canonical_status: str = Field(default="pending", index=True) + canonical_last_attempted_at: Optional[datetime] = Field(default=None, index=True) + canonical_last_success_at: Optional[datetime] = Field(default=None, index=True) + canonical_next_retry_after: Optional[datetime] = Field(default=None, index=True) + canonical_failure_count: int = 0 + canonical_last_error_summary: Optional[str] = None + updated_at: datetime = Field(default_factory=utcnow, 
class ProviderCatalogTitle(ModelBase, table=True):
    """A provider title as captured by one index generation."""

    # Composite primary key (provider, slug, indexed_generation): the same
    # title can be stored once per generation.
    provider: str = Field(primary_key=True)
    slug: str = Field(primary_key=True)
    indexed_generation: str = Field(primary_key=True, index=True)
    title: str = Field(index=True)
    # Search-friendly form of ``title``; replace_provider_catalog_title fills
    # it with normalize_catalog_text(title).
    normalized_title: str = Field(index=True)
    media_type_hint: str = Field(default="series", index=True)
    relative_path: str
    last_indexed_at: datetime = Field(default_factory=utcnow, index=True)


class ProviderCatalogAlias(ModelBase, table=True):
    """An alternative name for a provider title within one index generation."""

    # Composite primary key (provider, slug, alias, indexed_generation).
    provider: str = Field(primary_key=True)
    slug: str = Field(primary_key=True)
    alias: str = Field(primary_key=True)
    indexed_generation: str = Field(primary_key=True, index=True)
    # NOTE(review): presumably normalize_catalog_text(alias) - confirm in the
    # alias writer.
    normalized_alias: str = Field(index=True)
    last_indexed_at: datetime = Field(default_factory=utcnow, index=True)


class ProviderCatalogEpisode(ModelBase, table=True):
    """One episode of a provider title within one index generation."""

    # Composite primary key (provider, slug, season, episode,
    # indexed_generation).
    provider: str = Field(primary_key=True)
    slug: str = Field(primary_key=True)
    season: int = Field(primary_key=True)
    episode: int = Field(primary_key=True)
    indexed_generation: str = Field(primary_key=True, index=True)
    # Both episode titles are optional.
    title_primary: Optional[str] = None
    title_secondary: Optional[str] = None
    relative_path: str
    media_type_hint: str = Field(default="episode", index=True)
    last_indexed_at: datetime = Field(default_factory=utcnow, index=True)


class ProviderEpisodeLanguage(ModelBase, table=True):
    """A language recorded for one provider episode in one index generation."""

    # Composite primary key (provider, slug, season, episode, language,
    # indexed_generation).
    provider: str = Field(primary_key=True)
    slug: str = Field(primary_key=True)
    season: int = Field(primary_key=True)
    episode: int = Field(primary_key=True)
    language: str = Field(primary_key=True)
    indexed_generation: str = Field(primary_key=True, index=True)
    normalized_language: str = Field(index=True)
    # Optional list of strings persisted in a JSON column.
    host_hints: Optional[list[str]] = Field(sa_column=Column(JSON), default=None)
    last_indexed_at: datetime = Field(default_factory=utcnow, index=True)
Field(index=True) + tmdb_id: Optional[int] = Field(default=None, index=True) + imdb_id: Optional[str] = Field(default=None, index=True) + tvmaze_id: Optional[int] = Field(default=None, index=True) + anilist_id: Optional[int] = Field(default=None, index=True) + mal_id: Optional[int] = Field(default=None, index=True) + last_synced_at: datetime = Field(default_factory=utcnow, index=True) + + +class CanonicalSeriesAlias(ModelBase, table=True): + tvdb_id: int = Field(primary_key=True) + alias: str = Field(primary_key=True) + normalized_alias: str = Field(index=True) + + +class CanonicalEpisode(ModelBase, table=True): + tvdb_id: int = Field(primary_key=True) + season: int = Field(primary_key=True) + episode: int = Field(primary_key=True) + title: str = Field(index=True) + normalized_title: str = Field(index=True) + last_synced_at: datetime = Field(default_factory=utcnow, index=True) + + +class CanonicalMovie(ModelBase, table=True): + tmdb_id: int = Field(primary_key=True) + title: str = Field(index=True) + normalized_title: str = Field(index=True) + release_year: int = Field(index=True) + imdb_id: Optional[str] = Field(default=None, index=True) + tvdb_id: Optional[int] = Field(default=None, index=True) + last_synced_at: datetime = Field(default_factory=utcnow, index=True) + + +class ProviderSeriesMapping(ModelBase, table=True): + provider: str = Field(primary_key=True) + slug: str = Field(primary_key=True) + tvdb_id: int = Field(primary_key=True) + indexed_generation: str = Field(primary_key=True, index=True) + confidence: str = Field(default="unresolved", index=True) + source: str = Field(default="title_match", index=True) + rationale: Optional[str] = None + last_verified_at: datetime = Field(default_factory=utcnow, index=True) + + +class ProviderEpisodeMapping(ModelBase, table=True): + provider: str = Field(primary_key=True) + slug: str = Field(primary_key=True) + provider_season: int = Field(primary_key=True) + provider_episode: int = Field(primary_key=True) + 
@event.listens_for(engine, "connect")
def _configure_sqlite_connection(dbapi_connection, _connection_record) -> None:
    """Apply the SQLite PRAGMAs every new engine connection should run with.

    Sets WAL journaling, ``synchronous=NORMAL`` and a 30000 ms busy timeout.
    When the DB-API connection exposes an ``autocommit`` attribute it is
    switched to ``True`` while the PRAGMAs run and then put back to its
    previous value - presumably because the journal mode cannot be changed
    while a transaction is open.

    The cursor is closed and the autocommit setting restored even when a
    PRAGMA raises, so a failed setup does not leave the connection in
    autocommit mode.

    NOTE(review): the engine passes ``autocommit=False`` in ``connect_args``;
    ``sqlite3.connect`` only accepts that keyword on Python 3.12+ - confirm
    the minimum supported interpreter.
    """
    previous_autocommit = getattr(dbapi_connection, "autocommit", None)
    if previous_autocommit is not None:
        dbapi_connection.autocommit = True
    try:
        cursor = dbapi_connection.cursor()
        try:
            cursor.execute("PRAGMA journal_mode=WAL")
            cursor.execute("PRAGMA synchronous=NORMAL")
            cursor.execute("PRAGMA busy_timeout=30000")
        finally:
            cursor.close()
    finally:
        if previous_autocommit is not None:
            dbapi_connection.autocommit = previous_autocommit
def normalize_catalog_text(value: str) -> str:
    """Reduce ``value`` to lowercase ASCII letters and digits joined by single spaces.

    The text is NFKD-decomposed and combining marks are dropped, so accented
    Latin letters fall back to their base letter. Every run of characters
    outside ``a-z``, ``A-Z`` and ``0-9`` then becomes one space, and the
    result is trimmed and lowercased. A falsy ``value`` gives ``""``.
    """
    decomposed = unicodedata.normalize("NFKD", value or "")
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    # Fold typographic apostrophes and backticks to a plain apostrophe before
    # the separator pass below.
    apostrophes = str.maketrans({"’": "'", "`": "'"})
    folded = "".join(base_chars).translate(apostrophes)
    spaced = re.sub(r"[^a-zA-Z0-9]+", " ", folded)
    return spaced.strip().lower()
rec.active_stage = active_stage + if current_generation is not _UNSET: + rec.current_generation = current_generation + if latest_success_generation is not _UNSET: + rec.latest_success_generation = latest_success_generation + if latest_started_at is not _UNSET: + rec.latest_started_at = latest_started_at + if latest_completed_at is not _UNSET: + rec.latest_completed_at = latest_completed_at + if latest_success_at is not _UNSET: + rec.latest_success_at = latest_success_at + if next_refresh_after is not _UNSET: + rec.next_refresh_after = next_refresh_after + if bootstrap_completed is not None: + rec.bootstrap_completed = bootstrap_completed + if title_index_status is not None: + rec.title_index_status = title_index_status + if title_index_ready_at is not _UNSET: + rec.title_index_ready_at = title_index_ready_at + if title_index_next_retry_after is not _UNSET: + rec.title_index_next_retry_after = title_index_next_retry_after + if detail_enrichment_status is not None: + rec.detail_enrichment_status = detail_enrichment_status + if detail_ready_at is not _UNSET: + rec.detail_ready_at = detail_ready_at + if detail_next_retry_after is not _UNSET: + rec.detail_next_retry_after = detail_next_retry_after + if canonical_enrichment_status is not None: + rec.canonical_enrichment_status = canonical_enrichment_status + if canonical_ready_at is not _UNSET: + rec.canonical_ready_at = canonical_ready_at + if canonical_next_retry_after is not _UNSET: + rec.canonical_next_retry_after = canonical_next_retry_after + if failure_count is not None: + rec.failure_count = failure_count + if last_error_summary is not _UNSET: + rec.last_error_summary = last_error_summary + if cursor_title_slug is not _UNSET: + rec.cursor_title_slug = cursor_title_slug + elif status == "ready": + rec.cursor_title_slug = None + rec.updated_at = utcnow() + session.add(rec) + if commit: + session.commit() + session.refresh(rec) + return rec + + +def get_provider_index_status( + session: Session, + *, + provider: 
str, +) -> Optional[ProviderIndexStatus]: + return session.get(ProviderIndexStatus, provider) + + +def list_provider_index_statuses(session: Session) -> List[ProviderIndexStatus]: + return list(session.exec(select(ProviderIndexStatus)).all()) + + +def upsert_provider_title_index_state( + session: Session, + *, + provider: str, + slug: str, + attempted_at: Optional[datetime] = None, + succeeded_at: Optional[datetime] = None, + failure_count: Optional[int] = None, + last_error_summary: Optional[str] = None, + detail_status: Optional[str] = None, + detail_attempted_at: Optional[datetime] = None, + detail_succeeded_at: Optional[datetime] = None, + detail_next_retry_after: Optional[datetime] | object = _UNSET, + detail_failure_count: Optional[int] = None, + detail_last_error_summary: Optional[str] | object = _UNSET, + canonical_status: Optional[str] = None, + canonical_attempted_at: Optional[datetime] = None, + canonical_succeeded_at: Optional[datetime] = None, + canonical_next_retry_after: Optional[datetime] | object = _UNSET, + canonical_failure_count: Optional[int] = None, + canonical_last_error_summary: Optional[str] | object = _UNSET, + commit: bool = True, +) -> ProviderTitleIndexState: + rec = session.get(ProviderTitleIndexState, (provider, slug)) + if rec is None: + rec = ProviderTitleIndexState(provider=provider, slug=slug) + if attempted_at is not None: + rec.last_attempted_at = attempted_at + if succeeded_at is not None: + rec.last_success_at = succeeded_at + if failure_count is not None: + rec.failure_count = failure_count + if last_error_summary is not None: + rec.last_error_summary = last_error_summary + if detail_status is not None: + rec.detail_status = detail_status + if detail_attempted_at is not None: + rec.detail_last_attempted_at = detail_attempted_at + if detail_succeeded_at is not None: + rec.detail_last_success_at = detail_succeeded_at + if detail_next_retry_after is not _UNSET: + rec.detail_next_retry_after = detail_next_retry_after + if 
detail_failure_count is not None: + rec.detail_failure_count = detail_failure_count + if detail_last_error_summary is not _UNSET: + rec.detail_last_error_summary = detail_last_error_summary + if canonical_status is not None: + rec.canonical_status = canonical_status + if canonical_attempted_at is not None: + rec.canonical_last_attempted_at = canonical_attempted_at + if canonical_succeeded_at is not None: + rec.canonical_last_success_at = canonical_succeeded_at + if canonical_next_retry_after is not _UNSET: + rec.canonical_next_retry_after = canonical_next_retry_after + if canonical_failure_count is not None: + rec.canonical_failure_count = canonical_failure_count + if canonical_last_error_summary is not _UNSET: + rec.canonical_last_error_summary = canonical_last_error_summary + rec.updated_at = utcnow() + session.add(rec) + if commit: + session.commit() + session.refresh(rec) + return rec + + +def replace_provider_catalog_title( + session: Session, + *, + provider: str, + slug: str, + title: str, + media_type_hint: str, + relative_path: str, + indexed_generation: str, +) -> ProviderCatalogTitle: + rec = session.get(ProviderCatalogTitle, (provider, slug, indexed_generation)) + if rec is None: + rec = ProviderCatalogTitle( + provider=provider, + slug=slug, + indexed_generation=indexed_generation, + title=title, + normalized_title=normalize_catalog_text(title), + media_type_hint=media_type_hint, + relative_path=relative_path, + last_indexed_at=utcnow(), + ) + else: + rec.title = title + rec.normalized_title = normalize_catalog_text(title) + rec.media_type_hint = media_type_hint + rec.relative_path = relative_path + rec.last_indexed_at = utcnow() + session.add(rec) + return rec + + +def replace_provider_catalog_aliases( + session: Session, + *, + provider: str, + slug: str, + aliases: List[str], + indexed_generation: str, +) -> None: + session.exec( + ProviderCatalogAlias.__table__.delete().where( + (ProviderCatalogAlias.provider == provider) + & 
(ProviderCatalogAlias.slug == slug) + & (ProviderCatalogAlias.indexed_generation == indexed_generation) + ) + ) + seen: set[str] = set() + for alias in aliases: + alias_clean = (alias or "").strip() + if not alias_clean or alias_clean in seen: + continue + seen.add(alias_clean) + session.add( + ProviderCatalogAlias( + provider=provider, + slug=slug, + alias=alias_clean, + indexed_generation=indexed_generation, + normalized_alias=normalize_catalog_text(alias_clean), + last_indexed_at=utcnow(), + ) + ) + + +def replace_provider_catalog_episodes( + session: Session, + *, + provider: str, + slug: str, + episodes: List[dict[str, Any]], + indexed_generation: str, +) -> None: + session.exec( + ProviderEpisodeLanguage.__table__.delete().where( + (ProviderEpisodeLanguage.provider == provider) + & (ProviderEpisodeLanguage.slug == slug) + & (ProviderEpisodeLanguage.indexed_generation == indexed_generation) + ) + ) + session.exec( + ProviderCatalogEpisode.__table__.delete().where( + (ProviderCatalogEpisode.provider == provider) + & (ProviderCatalogEpisode.slug == slug) + & (ProviderCatalogEpisode.indexed_generation == indexed_generation) + ) + ) + for item in episodes: + season_number = int(item["season"]) + episode_number = int(item["episode"]) + session.add( + ProviderCatalogEpisode( + provider=provider, + slug=slug, + season=season_number, + episode=episode_number, + indexed_generation=indexed_generation, + title_primary=item.get("title_primary"), + title_secondary=item.get("title_secondary"), + relative_path=item["relative_path"], + media_type_hint=item.get("media_type_hint", "episode"), + last_indexed_at=utcnow(), + ) + ) + deduped_languages: dict[str, dict[str, Any]] = {} + for language_payload in item.get("languages", []): + language = str(language_payload.get("language") or "").strip() + if not language: + continue + normalized_language = normalize_catalog_text(language) + key = normalized_language or language + bucket = deduped_languages.setdefault( + key, + { + 
"language": language, + "normalized_language": normalized_language, + "host_hints": set(), + }, + ) + bucket["host_hints"].update( + str(host_hint).strip() + for host_hint in language_payload.get("host_hints") or [] + if str(host_hint).strip() + ) + for payload in deduped_languages.values(): + session.add( + ProviderEpisodeLanguage( + provider=provider, + slug=slug, + season=season_number, + episode=episode_number, + language=str(payload["language"]), + indexed_generation=indexed_generation, + normalized_language=str(payload["normalized_language"]), + host_hints=sorted(payload["host_hints"]), + last_indexed_at=utcnow(), + ) + ) + + +def prune_provider_generation( + session: Session, + *, + provider: str, + keep_generation: str, +) -> None: + session.exec( + ProviderCatalogAlias.__table__.delete().where( + (ProviderCatalogAlias.provider == provider) + & (ProviderCatalogAlias.indexed_generation != keep_generation) + ) + ) + session.exec( + ProviderEpisodeLanguage.__table__.delete().where( + (ProviderEpisodeLanguage.provider == provider) + & (ProviderEpisodeLanguage.indexed_generation != keep_generation) + ) + ) + session.exec( + ProviderCatalogEpisode.__table__.delete().where( + (ProviderCatalogEpisode.provider == provider) + & (ProviderCatalogEpisode.indexed_generation != keep_generation) + ) + ) + session.exec( + ProviderCatalogTitle.__table__.delete().where( + (ProviderCatalogTitle.provider == provider) + & (ProviderCatalogTitle.indexed_generation != keep_generation) + ) + ) + session.exec( + ProviderSeriesMapping.__table__.delete().where( + (ProviderSeriesMapping.provider == provider) + & (ProviderSeriesMapping.indexed_generation != keep_generation) + ) + ) + session.exec( + ProviderEpisodeMapping.__table__.delete().where( + (ProviderEpisodeMapping.provider == provider) + & (ProviderEpisodeMapping.indexed_generation != keep_generation) + ) + ) + session.exec( + ProviderMovieMapping.__table__.delete().where( + (ProviderMovieMapping.provider == provider) + & 
(ProviderMovieMapping.indexed_generation != keep_generation) + ) + ) + + +def delete_provider_generation( + session: Session, + *, + provider: str, + generation: str, +) -> None: + session.exec( + ProviderCatalogAlias.__table__.delete().where( + (ProviderCatalogAlias.provider == provider) + & (ProviderCatalogAlias.indexed_generation == generation) + ) + ) + session.exec( + ProviderEpisodeLanguage.__table__.delete().where( + (ProviderEpisodeLanguage.provider == provider) + & (ProviderEpisodeLanguage.indexed_generation == generation) + ) + ) + session.exec( + ProviderCatalogEpisode.__table__.delete().where( + (ProviderCatalogEpisode.provider == provider) + & (ProviderCatalogEpisode.indexed_generation == generation) + ) + ) + session.exec( + ProviderCatalogTitle.__table__.delete().where( + (ProviderCatalogTitle.provider == provider) + & (ProviderCatalogTitle.indexed_generation == generation) + ) + ) + session.exec( + ProviderSeriesMapping.__table__.delete().where( + (ProviderSeriesMapping.provider == provider) + & (ProviderSeriesMapping.indexed_generation == generation) + ) + ) + session.exec( + ProviderEpisodeMapping.__table__.delete().where( + (ProviderEpisodeMapping.provider == provider) + & (ProviderEpisodeMapping.indexed_generation == generation) + ) + ) + session.exec( + ProviderMovieMapping.__table__.delete().where( + (ProviderMovieMapping.provider == provider) + & (ProviderMovieMapping.indexed_generation == generation) + ) + ) + + +def _visible_generation_map( + session: Session, + *, + providers: List[str], +) -> dict[str, str]: + rows = session.exec( + select(ProviderIndexStatus).where(ProviderIndexStatus.provider.in_(providers)) + ).all() + return { + row.provider: row.latest_success_generation + for row in rows + if row.latest_success_generation + } + + +def replace_provider_series_mappings( + session: Session, + *, + provider: str, + slug: str, + mappings: List[dict[str, Any]], + indexed_generation: str, +) -> None: + session.exec( + 
ProviderSeriesMapping.__table__.delete().where( + (ProviderSeriesMapping.provider == provider) + & (ProviderSeriesMapping.slug == slug) + & (ProviderSeriesMapping.indexed_generation == indexed_generation) + ) + ) + for mapping in mappings: + session.add( + ProviderSeriesMapping( + provider=provider, + slug=slug, + tvdb_id=int(mapping["tvdb_id"]), + indexed_generation=indexed_generation, + confidence=str(mapping.get("confidence", "unresolved")), + source=str(mapping.get("source", "title_match")), + rationale=mapping.get("rationale"), + last_verified_at=utcnow(), + ) + ) + + +def replace_provider_episode_mappings( + session: Session, + *, + provider: str, + slug: str, + mappings: List[dict[str, Any]], + indexed_generation: str, +) -> None: + session.exec( + ProviderEpisodeMapping.__table__.delete().where( + (ProviderEpisodeMapping.provider == provider) + & (ProviderEpisodeMapping.slug == slug) + & (ProviderEpisodeMapping.indexed_generation == indexed_generation) + ) + ) + for mapping in mappings: + session.add( + ProviderEpisodeMapping( + provider=provider, + slug=slug, + provider_season=int(mapping["provider_season"]), + provider_episode=int(mapping["provider_episode"]), + tvdb_id=int(mapping["tvdb_id"]), + canonical_season=int(mapping["canonical_season"]), + canonical_episode=int(mapping["canonical_episode"]), + indexed_generation=indexed_generation, + confidence=str(mapping.get("confidence", "unresolved")), + source=str(mapping.get("source", "numbering")), + rationale=mapping.get("rationale"), + last_verified_at=utcnow(), + ) + ) + + +def replace_provider_movie_mappings( + session: Session, + *, + provider: str, + slug: str, + mappings: List[dict[str, Any]], + indexed_generation: str, +) -> None: + session.exec( + ProviderMovieMapping.__table__.delete().where( + (ProviderMovieMapping.provider == provider) + & (ProviderMovieMapping.slug == slug) + & (ProviderMovieMapping.indexed_generation == indexed_generation) + ) + ) + for mapping in mappings: + session.add( + 
ProviderMovieMapping( + provider=provider, + slug=slug, + tmdb_id=int(mapping["tmdb_id"]), + indexed_generation=indexed_generation, + confidence=str(mapping.get("confidence", "unresolved")), + source=str(mapping.get("source", "title_year")), + rationale=mapping.get("rationale"), + last_verified_at=utcnow(), + ) + ) + + +def upsert_canonical_series( + session: Session, + *, + tvdb_id: int, + title: str, + tmdb_id: Optional[int] = None, + imdb_id: Optional[str] = None, + tvmaze_id: Optional[int] = None, + anilist_id: Optional[int] = None, + mal_id: Optional[int] = None, + aliases: Optional[List[str]] = None, +) -> CanonicalSeries: + rec = session.get(CanonicalSeries, tvdb_id) + if rec is None: + rec = CanonicalSeries(tvdb_id=tvdb_id, title=title, normalized_title="") + rec.title = title + rec.normalized_title = normalize_catalog_text(title) + rec.tmdb_id = tmdb_id + rec.imdb_id = imdb_id + rec.tvmaze_id = tvmaze_id + rec.anilist_id = anilist_id + rec.mal_id = mal_id + rec.last_synced_at = utcnow() + session.add(rec) + if aliases is not None: + session.exec( + CanonicalSeriesAlias.__table__.delete().where( + CanonicalSeriesAlias.tvdb_id == tvdb_id + ) + ) + for alias in aliases: + alias_clean = (alias or "").strip() + if not alias_clean: + continue + session.add( + CanonicalSeriesAlias( + tvdb_id=tvdb_id, + alias=alias_clean, + normalized_alias=normalize_catalog_text(alias_clean), + ) + ) + return rec + + +def replace_canonical_episodes( + session: Session, + *, + tvdb_id: int, + episodes: List[dict[str, Any]], +) -> None: + session.exec( + CanonicalEpisode.__table__.delete().where(CanonicalEpisode.tvdb_id == tvdb_id) + ) + deduped: dict[tuple[int, int], str] = {} + for episode in episodes: + title = str(episode.get("title") or "").strip() + if not title: + continue + season = int(episode["season"]) + number = int(episode["episode"]) + key = (season, number) + if key not in deduped: + deduped[key] = title + for (season, number), title in sorted(deduped.items()): + 
session.add( + CanonicalEpisode( + tvdb_id=tvdb_id, + season=season, + episode=number, + title=title, + normalized_title=normalize_catalog_text(title), + last_synced_at=utcnow(), + ) + ) + + +def is_catalog_bootstrap_ready( + session: Session, + *, + providers: List[str], +) -> bool: + if not providers: + return True + statuses = { + row.provider: row + for row in session.exec( + select(ProviderIndexStatus).where( + ProviderIndexStatus.provider.in_(providers) + ) + ).all() + } + return all( + status is not None + and (status.title_index_status == "ready" or status.bootstrap_completed) + and bool(status.latest_success_generation) + for status in (statuses.get(provider) for provider in providers) + ) + + +def is_provider_stage_ready(status: ProviderIndexStatus | None, *, stage: str) -> bool: + if status is None: + return False + if stage == "title_index": + return ( + status.title_index_status == "ready" or status.bootstrap_completed + ) and bool(status.latest_success_generation) + if stage == "detail_enrichment": + return status.detail_enrichment_status == "ready" + if stage == "canonical_enrichment": + return status.canonical_enrichment_status == "ready" + raise ValueError(f"Unknown provider stage: {stage}") + + +def is_provider_fully_ready(status: ProviderIndexStatus | None) -> bool: + return ( + is_provider_stage_ready(status, stage="title_index") + and is_provider_stage_ready(status, stage="detail_enrichment") + and is_provider_stage_ready(status, stage="canonical_enrichment") + ) + + +def catalog_title_count(session: Session, *, provider: Optional[str] = None) -> int: + stmt = select(ProviderCatalogTitle) + if provider: + stmt = stmt.where(ProviderCatalogTitle.provider == provider) + return len(session.exec(stmt).all()) + + +def resolve_indexed_title( + session: Session, + *, + provider: str, + slug: str, +) -> Optional[str]: + status = session.get(ProviderIndexStatus, provider) + if status is None or not status.latest_success_generation: + return None + row = 
session.get( + ProviderCatalogTitle, + (provider, slug, status.latest_success_generation), + ) + return row.title if row else None + + +def search_indexed_provider_titles( + session: Session, + *, + query: str, + providers: List[str], + media_type_hint: Optional[str] = None, + limit: int = 20, +) -> List[ProviderCatalogTitle]: + q_norm = normalize_catalog_text(query) + if not q_norm: + return [] + tokens = [token for token in q_norm.split(" ") if token] + if not tokens: + return [] + visible_generations = _visible_generation_map(session, providers=providers) + if not visible_generations: + return [] + visible_generation_pairs = list(visible_generations.items()) + stmt = select(ProviderCatalogTitle).where( + (ProviderCatalogTitle.provider.in_(providers)) + & ( + tuple_( + ProviderCatalogTitle.provider, + ProviderCatalogTitle.indexed_generation, + ).in_(visible_generation_pairs) + ) + ) + if media_type_hint is not None: + stmt = stmt.where(ProviderCatalogTitle.media_type_hint == media_type_hint) + rows = session.exec(stmt).all() + + alias_rows = session.exec( + select(ProviderCatalogAlias).where( + (ProviderCatalogAlias.provider.in_(providers)) + & ( + tuple_( + ProviderCatalogAlias.provider, + ProviderCatalogAlias.indexed_generation, + ).in_(visible_generation_pairs) + ) + ) + ).all() + aliases_by_key: dict[tuple[str, str, str], list[str]] = {} + for alias in alias_rows: + aliases_by_key.setdefault( + (alias.provider, alias.slug, alias.indexed_generation), [] + ).append(alias.normalized_alias) + + def _score(row: ProviderCatalogTitle) -> tuple[int, int]: + names = [row.normalized_title] + names.extend( + aliases_by_key.get((row.provider, row.slug, row.indexed_generation), []) + ) + best = 0 + exact = 0 + for name in names: + name_tokens = set(token for token in name.split(" ") if token) + overlap = len(name_tokens & set(tokens)) + if name == q_norm: + exact = 1 + if overlap > best: + best = overlap + return (exact, best) + + scored_rows = [(row, _score(row)) for row 
in rows] + ranked = sorted(scored_rows, key=lambda item: item[1], reverse=True) + filtered = [row for row, score in ranked if score[1] > 0 or score[0] > 0] + return filtered[: max(1, limit)] + + +def list_indexed_titles_for_provider( + session: Session, + *, + provider: str, +) -> List[ProviderCatalogTitle]: + status = session.get(ProviderIndexStatus, provider) + if status is None or not status.latest_success_generation: + return [] + return list( + session.exec( + select(ProviderCatalogTitle).where( + (ProviderCatalogTitle.provider == provider) + & ( + ProviderCatalogTitle.indexed_generation + == status.latest_success_generation + ) + ) + ).all() + ) + + +def get_indexed_episode_languages( + session: Session, + *, + provider: str, + slug: str, + season: int, + episode: int, +) -> List[ProviderEpisodeLanguage]: + status = session.get(ProviderIndexStatus, provider) + if status is None or not status.latest_success_generation: + return [] + return list( + session.exec( + select(ProviderEpisodeLanguage).where( + (ProviderEpisodeLanguage.provider == provider) + & (ProviderEpisodeLanguage.slug == slug) + & (ProviderEpisodeLanguage.season == season) + & (ProviderEpisodeLanguage.episode == episode) + & ( + ProviderEpisodeLanguage.indexed_generation + == status.latest_success_generation + ) + ) + ).all() + ) + + +def list_indexed_provider_episodes( + session: Session, + *, + provider: str, + slug: str, +) -> List[ProviderCatalogEpisode]: + status = session.get(ProviderIndexStatus, provider) + if status is None or not status.latest_success_generation: + return [] + return list( + session.exec( + select(ProviderCatalogEpisode).where( + (ProviderCatalogEpisode.provider == provider) + & (ProviderCatalogEpisode.slug == slug) + & ( + ProviderCatalogEpisode.indexed_generation + == status.latest_success_generation + ) + ) + ).all() + ) + + +def list_indexed_episode_numbers_for_season( + session: Session, + *, + provider: str, + slug: str, + season: int, +) -> List[int]: + status = 
session.get(ProviderIndexStatus, provider) + if status is None or not status.latest_success_generation: + return [] + episodes = [ + int(row.episode) + for row in session.exec( + select(ProviderCatalogEpisode).where( + (ProviderCatalogEpisode.provider == provider) + & (ProviderCatalogEpisode.slug == slug) + & (ProviderCatalogEpisode.season == season) + & ( + ProviderCatalogEpisode.indexed_generation + == status.latest_success_generation + ) + ) + ).all() + ] + return sorted(set(episodes)) + + +def find_canonical_series_by_ids_or_title( + session: Session, + *, + tvdb_id: Optional[int] = None, + tmdb_id: Optional[int] = None, + imdb_id: Optional[str] = None, + query: Optional[str] = None, +) -> Optional[CanonicalSeries]: + if tvdb_id: + row = session.get(CanonicalSeries, tvdb_id) + if row is not None: + return row + if tmdb_id: + row = session.exec( + select(CanonicalSeries).where(CanonicalSeries.tmdb_id == tmdb_id) + ).first() + if row is not None: + return row + if imdb_id: + row = session.exec( + select(CanonicalSeries).where(CanonicalSeries.imdb_id == imdb_id) + ).first() + if row is not None: + return row + q_norm = normalize_catalog_text(query or "") + if not q_norm: + return None + row = session.exec( + select(CanonicalSeries).where(CanonicalSeries.normalized_title == q_norm) + ).first() + if row is not None: + return row + alias = session.exec( + select(CanonicalSeriesAlias).where( + CanonicalSeriesAlias.normalized_alias == q_norm + ) + ).first() + if alias is not None: + return session.get(CanonicalSeries, alias.tvdb_id) + return None + + +def find_provider_episode_mappings_for_canonical_episode( + session: Session, + *, + tvdb_id: int, + canonical_season: int, + canonical_episode: int, + providers: List[str], +) -> List[ProviderEpisodeMapping]: + visible_generations = _visible_generation_map(session, providers=providers) + if not visible_generations: + return [] + return list( + session.exec( + select(ProviderEpisodeMapping).where( + 
(ProviderEpisodeMapping.tvdb_id == tvdb_id) + & (ProviderEpisodeMapping.canonical_season == canonical_season) + & (ProviderEpisodeMapping.canonical_episode == canonical_episode) + & (ProviderEpisodeMapping.provider.in_(providers)) + & ( + tuple_( + ProviderEpisodeMapping.provider, + ProviderEpisodeMapping.indexed_generation, + ).in_(list(visible_generations.items())) + ) + & ProviderEpisodeMapping.confidence.in_( + ["confirmed", "high_confidence", "low_confidence"] + ) + ) + ).all() + ) + + +def find_provider_episode_mappings_for_canonical_season( + session: Session, + *, + tvdb_id: int, + canonical_season: int, + providers: List[str], +) -> List[ProviderEpisodeMapping]: + visible_generations = _visible_generation_map(session, providers=providers) + if not visible_generations: + return [] + return list( + session.exec( + select(ProviderEpisodeMapping).where( + (ProviderEpisodeMapping.tvdb_id == tvdb_id) + & (ProviderEpisodeMapping.canonical_season == canonical_season) + & (ProviderEpisodeMapping.provider.in_(providers)) + & ( + tuple_( + ProviderEpisodeMapping.provider, + ProviderEpisodeMapping.indexed_generation, + ).in_(list(visible_generations.items())) + ) + & ProviderEpisodeMapping.confidence.in_( + ["confirmed", "high_confidence", "low_confidence"] + ) + ) + ).all() + ) + + +def find_provider_episode_mapping( + session: Session, + *, + provider: str, + slug: str, + provider_season: int, + provider_episode: int, +) -> Optional[ProviderEpisodeMapping]: + status = session.get(ProviderIndexStatus, provider) + if status is None or not status.latest_success_generation: + return None + return session.exec( + select(ProviderEpisodeMapping).where( + (ProviderEpisodeMapping.provider == provider) + & (ProviderEpisodeMapping.slug == slug) + & (ProviderEpisodeMapping.provider_season == provider_season) + & (ProviderEpisodeMapping.provider_episode == provider_episode) + & ( + ProviderEpisodeMapping.indexed_generation + == status.latest_success_generation + ) + & 
ProviderEpisodeMapping.confidence.in_( + ["confirmed", "high_confidence", "low_confidence"] + ) + ) + ).first() + + # --- STRM URL Mapping CRUD def get_strm_mapping( session: Session, @@ -699,6 +1878,8 @@ def upsert_client_task( job_id: Optional[str], state: str = "queued", site: str = "aniworld.to", + provider: Optional[str] = None, + mode: Optional[str] = None, ) -> ClientTask: """ Insert or update a ClientTask record identified by its hash. @@ -722,6 +1903,8 @@ def upsert_client_task( episode=episode, language=language, site=site, + provider=provider, + mode=mode, save_path=save_path, category=category, job_id=job_id, @@ -736,6 +1919,10 @@ def upsert_client_task( rec.episode = episode rec.language = language rec.site = site + if provider is not None: + rec.provider = provider + if mode is not None: + rec.mode = mode rec.save_path = save_path rec.category = category rec.job_id = job_id diff --git a/apps/api/app/providers/megakino/sitemap.py b/apps/api/app/providers/megakino/sitemap.py index d113365f..0db6b15d 100644 --- a/apps/api/app/providers/megakino/sitemap.py +++ b/apps/api/app/providers/megakino/sitemap.py @@ -10,7 +10,9 @@ from loguru import logger -from app.utils.http_client import get as http_get +MEGAKINO_SITEMAP_USER_AGENT = ( + "Mozilla/5.0 (AniBridge Megakino Indexer; +https://github.com/Zzackllack/AniBridge)" +) @dataclass(frozen=True) @@ -199,11 +201,20 @@ def _fetch_sitemap(url: str, timeout: float = 20.0) -> str: HTTPError: If the HTTP response status indicates a failure. 
""" logger.debug("Megakino sitemap fetch: {}", url) - resp = http_get(url, timeout=timeout) + resp = requests.get( + url, + timeout=timeout, + allow_redirects=True, + headers={ + "User-Agent": MEGAKINO_SITEMAP_USER_AGENT, + "Accept-Encoding": "identity", + }, + ) resp.raise_for_status() logger.debug( - "Megakino sitemap response: status={} bytes={}", + "Megakino sitemap response: status={} final_url={} bytes={}", resp.status_code, + resp.url, len(resp.text or ""), ) return resp.text diff --git a/apps/api/app/utils/domain_resolver.py b/apps/api/app/utils/domain_resolver.py index a0f9d314..4bc486df 100644 --- a/apps/api/app/utils/domain_resolver.py +++ b/apps/api/app/utils/domain_resolver.py @@ -8,14 +8,19 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from loguru import logger +import requests from requests.exceptions import RequestException -from app.utils.http_client import get as http_get from app.utils.logger import config as configure_logger configure_logger() MEGAKINO_DEFAULT_DOMAIN = "megakino1.to" +MEGAKINO_REDIRECT_SEEDS = [ + "megakino1.to", + "megakino.to", + "megakino.live", +] MEGAKINO_DOMAIN_CANDIDATES = [ "megakino1.to", "megakino.live", @@ -28,6 +33,10 @@ "megakino.to", ] MEGAKINO_MIRRORS_PATH = "/mirrors.txt" +MEGAKINO_GITHUB_DOMAIN_HINT_URL = ( + "https://raw.githubusercontent.com/" + "Yezun-hikari/new-domain-check/main/monitors/megakino/domain.txt" +) USER_AGENT = "Mozilla/5.0 (AniBridge; +https://github.com/Zzackllack/AniBridge)" MEGAKINO_RESOLVER_TIMEOUT_SECONDS = 6 MEGAKINO_RESOLVER_MAX_WORKERS = 12 @@ -96,11 +105,13 @@ def _resolver_http_get( def _run() -> None: try: - result["resp"] = http_get( + request_headers = {"User-Agent": USER_AGENT, "Accept-Encoding": "identity"} + request_headers.update(headers) + result["resp"] = requests.get( url, timeout=timeout, allow_redirects=allow_redirects, - headers=headers, + headers=request_headers, ) except Exception as exc: err["exc"] = exc @@ -115,6 +126,36 @@ def _run() -> None: 
return result["resp"] +def _fetch_github_domain_hint( + timeout: float | int = MEGAKINO_RESOLVER_TIMEOUT_SECONDS, +) -> Optional[str]: + """Return the current Megakino host from the external monitor repository.""" + try: + resp = _resolver_http_get( + MEGAKINO_GITHUB_DOMAIN_HINT_URL, + timeout=timeout, + allow_redirects=True, + headers={"Accept": "text/plain"}, + ) + except RequestException as exc: + logger.debug("Megakino GitHub domain hint fetch failed: {}", exc) + return None + + if resp.status_code >= 400: + logger.debug( + "Megakino GitHub domain hint returned {}", + resp.status_code, + ) + return None + + domain = _normalize_domain(resp.text or "") + if not domain: + logger.debug("Megakino GitHub domain hint was empty or invalid.") + return None + logger.info("Megakino GitHub domain hint resolved to {}", domain) + return domain + + def _build_base_url(value: str) -> str: """ Create a normalized base URL from an input domain or URL. @@ -410,63 +451,70 @@ def fetch_megakino_domain( Returns: The resolved domain without a URL scheme (for example, "example.com"), or `None` if no candidate could be validated. 
""" - logger.info("Resolving megakino domain via sitemap checks.") + logger.info("Resolving megakino domain via redirect-aware sitemap checks.") seen: set[str] = set() - ordered_candidates: list[str] = [] - mirror_timeout = min(timeout, 8) - mirror_domains = fetch_megakino_mirror_domains( - timeout=mirror_timeout, include_sitemap_fallback=False - ) - if mirror_domains: - ordered_candidates.extend(mirror_domains) - ordered_candidates.extend(MEGAKINO_DOMAIN_CANDIDATES) - normalized_candidates: list[str] = [] - for candidate in ordered_candidates: - domain = _normalize_domain(candidate) - if not domain or domain in seen: - continue - seen.add(domain) - normalized_candidates.append(domain) - - if not normalized_candidates: - logger.warning("Megakino domain resolution failed; no candidate succeeded.") - return None - - max_workers = _resolver_max_workers(len(normalized_candidates)) - logger.info( - "Probing {} megakino candidates (workers={}, timeout={}s).", - len(normalized_candidates), - max_workers, - timeout, - ) - - def _probe_candidate(idx: int, domain: str) -> tuple[int, Optional[str]]: - base_url = _build_base_url(domain) - try: - return (idx, _probe_megakino_sitemap(base_url, timeout=timeout)) - except Exception as exc: - logger.warning("Megakino candidate check failed for {}: {}", base_url, exc) - return (idx, None) - - probe_results: dict[int, str] = {} - with ThreadPoolExecutor( - max_workers=min(max_workers, len(normalized_candidates)) - ) as ex: - futures = [ - ex.submit(_probe_candidate, idx, domain) - for idx, domain in enumerate(normalized_candidates) - ] - for fut in as_completed(futures): - idx, resolved = fut.result() + raw_candidates: list[str] = [] + env_candidate = os.getenv("MEGAKINO_BASE_URL", "").strip() + if env_candidate: + raw_candidates.append(env_candidate) + raw_candidates.extend(MEGAKINO_REDIRECT_SEEDS) + + def _iter_candidates(candidates: list[str]) -> list[str]: + normalized: list[str] = [] + for candidate in candidates: + domain = 
_normalize_domain(candidate) + if not domain or domain in seen: + continue + seen.add(domain) + normalized.append(domain) + return normalized + + normalized_candidates = _iter_candidates(raw_candidates) + if normalized_candidates: + logger.info( + "Megakino domain resolution candidates: {}", + ", ".join(normalized_candidates), + ) + for domain in normalized_candidates: + base_url = _build_base_url(domain) + try: + resolved = _probe_megakino_sitemap(base_url, timeout=timeout) + except Exception as exc: + logger.warning( + "Megakino candidate check failed for {}: {}", base_url, exc + ) + continue if resolved: - probe_results[idx] = resolved - - for idx, domain in enumerate(normalized_candidates): - resolved = probe_results.get(idx) - if resolved: - logger.success("Megakino domain resolved: {}", resolved) - return resolved - logger.warning("Megakino candidate failed validation: {}", domain) + logger.success("Megakino domain resolved: {}", resolved) + return resolved + logger.warning("Megakino candidate failed validation: {}", domain) + + hint_domain = _fetch_github_domain_hint(timeout=min(timeout, 8)) + if hint_domain: + hinted_candidates = _iter_candidates([hint_domain]) + if hinted_candidates: + logger.info( + "Megakino domain resolution fallback candidate: {}", + ", ".join(hinted_candidates), + ) + for domain in hinted_candidates: + base_url = _build_base_url(domain) + try: + resolved = _probe_megakino_sitemap(base_url, timeout=timeout) + except Exception as exc: + logger.warning( + "Megakino hinted candidate check failed for {}: {}", + base_url, + exc, + ) + continue + if resolved: + logger.success("Megakino domain resolved: {}", resolved) + return resolved + logger.warning( + "Megakino hinted candidate failed validation: {}", domain + ) + logger.warning("Megakino domain resolution failed; no candidate succeeded.") return None diff --git a/apps/api/app/utils/logger.py b/apps/api/app/utils/logger.py index b1687ddc..14884d58 100644 --- a/apps/api/app/utils/logger.py 
+++ b/apps/api/app/utils/logger.py @@ -64,6 +64,9 @@ def emit(self, record: logging.LogRecord) -> None: handlers=[intercept_handler], level=stdlib_level, force=True ) logging.captureWarnings(True) + # TODO: intercept this: + # INFO [alembic.runtime.migration] Will assume non-transactional DDL. + # INFO [alembic.runtime.migration] Context impl SQLiteImpl. logging.lastResort = None for name in ( "uvicorn", @@ -95,7 +98,14 @@ def ensure_log_path(base_dir: Optional[Path] = None) -> Path: if env: return Path(env) - base = Path.cwd() / "data" if base_dir is None else base_dir + if base_dir is None: + data_dir_env = os.environ.get("DATA_DIR", "").strip() + if data_dir_env: + base = Path(data_dir_env).expanduser() + else: + base = Path.cwd() / "data" + else: + base = base_dir ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") run_id = uuid.uuid4().hex[:8] log_path = base / f"terminal-{ts}-{run_id}.log" diff --git a/apps/api/app/utils/terminal.py b/apps/api/app/utils/terminal.py index 8443ada9..1e751a78 100644 --- a/apps/api/app/utils/terminal.py +++ b/apps/api/app/utils/terminal.py @@ -41,8 +41,16 @@ class ProgressReporter: - Non-interactive: print an info line every PROGRESS_STEP_PERCENT. """ - def __init__(self, label: str) -> None: + def __init__( + self, + label: str, + *, + unit: str = "B", + unit_scale: bool = True, + ) -> None: self.label = label + self.unit = unit + self.unit_scale = unit_scale self._bar = None self._last_step_pct = -1 # last printed step percentage (integer) self._interactive = is_interactive_terminal() @@ -64,8 +72,8 @@ def update(self, snap: ProgressSnapshot) -> None: self._bar = tqdm( total=int(total), desc=self.label, - unit="B", - unit_scale=True, + unit=self.unit, + unit_scale=self.unit_scale, leave=True, file=bar_file, ascii=False, # force unicode blocks (█▉▊▌ etc.) 
@@ -84,7 +92,20 @@ def update(self, snap: ProgressSnapshot) -> None: self._bar.n = downloaded postfix = {} if snap.speed is not None: - postfix["Speed"] = f"{float(snap.speed) / (1024 * 1024):.2f} MB/s" + unit = self.unit + unit_power = { + "B": 0, + "KB": 1, + "KiB": 1, + "MB": 2, + "MiB": 2, + "GB": 3, + "GiB": 3, + "TB": 4, + "TiB": 4, + }.get(unit, 0) + scaled_speed = float(snap.speed) / (1024**unit_power) + postfix["Speed"] = f"{scaled_speed:.2f} {unit}/s" if snap.eta is not None: postfix["ETA"] = f"{int(snap.eta)}s" if postfix: @@ -100,21 +121,30 @@ def update(self, snap: ProgressSnapshot) -> None: step = max(1, int(PROGRESS_STEP_PERCENT)) if pct == 100 or pct // step > self._last_step_pct // step: self._last_step_pct = pct + speed_unit = "MB/s" if self.unit == "B" else f"{self.unit}/s" speed = ( - f"{float(snap.speed) / (1024 * 1024):.2f} MB/s" + f"{float(snap.speed) / (1024 * 1024):.2f} {speed_unit}" + if self.unit == "B" and snap.speed is not None + else f"{float(snap.speed):.2f} {speed_unit}" if snap.speed is not None else "-" ) eta = f"{int(snap.eta)}s" if snap.eta is not None else "-" + progress_text = ( + f"{downloaded}/{total} bytes" + if self.unit == "B" + else f"{downloaded}/{total} {self.unit}" + ) logger.info( - f"{self.label}: {pct}% ({downloaded}/{total} bytes) speed={speed} eta={eta}" + f"{self.label}: {pct}% ({progress_text}) speed={speed} eta={eta}" ) else: # Total unknown: avoid spamming; print on large increments threshold = 8 * 1024 * 1024 # 8 MiB if downloaded // threshold > self._last_step_pct: self._last_step_pct = downloaded // threshold - logger.info(f"{self.label}: downloaded {downloaded} bytes...") + suffix = "bytes" if self.unit == "B" else self.unit + logger.info(f"{self.label}: downloaded {downloaded} {suffix}...") def close(self) -> None: if self._bar is not None: diff --git a/apps/api/app/utils/title_resolver.py b/apps/api/app/utils/title_resolver.py index 04a424c1..02f1ea80 100644 --- a/apps/api/app/utils/title_resolver.py +++ 
b/apps/api/app/utils/title_resolver.py @@ -11,9 +11,20 @@ import requests.exceptions from bs4 import BeautifulSoup # type: ignore from loguru import logger +from sqlalchemy.exc import OperationalError +from sqlmodel import Session +from sqlmodel import select from app.utils.logger import config as configure_logger from app.utils.http_client import get as http_get # type: ignore +from app.catalog import get_catalog_readiness_error +from app.db import ( + ProviderCatalogAlias, + engine, + list_indexed_titles_for_provider, + resolve_indexed_title, + search_indexed_provider_titles, +) from app.config import ( CATALOG_SITES_LIST, @@ -490,13 +501,40 @@ def resolve_series_title( if not slug: logger.warning("No slug provided to resolve_series_title.") return None + try: + with Session(engine) as session: + title = resolve_indexed_title(session, provider=site, slug=slug) + except OperationalError as exc: + logger.warning( + "Indexed title lookup unavailable for slug '{}' on {}: {}", + slug, + site, + exc, + ) + else: + if title: + logger.info(f"Resolved title for slug '{slug}' on {site}: {title}") + return title + # Attempt to resolve from the in-memory/indexed alphabet sources first index = load_or_refresh_index(site) title = index.get(slug) if title: logger.info(f"Resolved title for slug '{slug}' on {site}: {title}") - else: - logger.warning(f"No title found for slug: {slug} on {site}") - return title + return title + + # No title found in DB or in-memory index. If the provider catalog is + # already considered ready/bootstrapped, treat this as a definitive miss. + if get_catalog_readiness_error() is None: + logger.warning( + "No indexed title found for slug '{}' on {} after catalog bootstrap.", + slug, + site, + ) + return None + + # Catalog is still bootstrapping and no in-memory title found. 
+ logger.warning(f"No title found for slug: {slug} on {site}") + return None def load_or_refresh_alternatives(site: str = "aniworld.to") -> Dict[str, List[str]]: @@ -508,6 +546,22 @@ def load_or_refresh_alternatives(site: str = "aniworld.to") -> Dict[str, List[st Dict[str, List[str]]: Mapping from slug to list of alternative titles (primary title first). """ global _cached_alts + readiness_error = get_catalog_readiness_error() + if readiness_error is None: + try: + with Session(engine) as session: + rows = list_indexed_titles_for_provider(session, provider=site) + except OperationalError as exc: + logger.warning( + "Indexed alternatives lookup unavailable for {}: {}", + site, + exc, + ) + else: + if rows: + # The indexed request path no longer needs a full alternatives dump. + # Keep a minimal compatibility shape for older helper call sites. + return {row.slug: [row.title] for row in rows} now = time.time() site_cfg = _get_site_cfg(site) or CATALOG_SITE_CONFIGS.get("aniworld.to", {}) refresh_hours = float(site_cfg.get("titles_refresh_hours", 24.0)) @@ -604,6 +658,24 @@ def _score_title_candidate( ) +def _score_indexed_db_candidate(session: Session, *, query: str, candidate) -> float: + query_tokens = _match_tokens(query) + query_norm = _normalize_alnum(query) + best_score = _score_title_candidate(query_tokens, query_norm, candidate.title) + alias_rows = session.exec( + select(ProviderCatalogAlias).where( + (ProviderCatalogAlias.provider == candidate.provider) + & (ProviderCatalogAlias.slug == candidate.slug) + & (ProviderCatalogAlias.indexed_generation == candidate.indexed_generation) + ) + ).all() + for alias in alias_rows: + alias_score = _score_title_candidate(query_tokens, query_norm, alias.alias) + if alias_score > best_score: + best_score = alias_score + return best_score + + def _build_sto_search_terms(query: str) -> List[str]: """ Builds ordered search variants for S.to from a raw query. 
@@ -692,6 +764,10 @@ def slug_from_query(q: str, site: Optional[str] = None) -> Optional[Tuple[str, s """ if not q: return None + # Prefer searching the in-memory alphabet indexes first (fast, deterministic + # for unit tests and for recent file-based index loads). If no indexed + # match is found, fall back to the DB-backed provider index search when + # the catalog reports readiness. def _search_sites(sites: List[str]) -> Optional[Tuple[str, str]]: """ @@ -755,14 +831,71 @@ def _search_sites(sites: List[str]) -> Optional[Tuple[str, str]]: return ("s.to", api_slug) return None + # 1) If a specific site was requested, only consult that site's in-memory + # index (and then its DB-backed index) — do NOT fall back to other sites. if site: - return _search_sites([site]) + result = _search_sites([site]) + if result: + return result + try: + with Session(engine) as session: + rows = search_indexed_provider_titles( + session, + query=q, + providers=[site], + limit=5, + ) + for candidate in rows: + cand_score = _score_indexed_db_candidate( + session, query=q, candidate=candidate + ) + if cand_score >= _MIN_TITLE_MATCH_SCORE: + return (candidate.provider, candidate.slug) + except OperationalError as exc: + logger.debug( + "Skipping indexed DB lookup for {} because catalog tables are unavailable: {}", + site, + exc, + ) + return None + # 2) No specific site requested: try index-based lookup across primary sites primary_sites = [s for s in CATALOG_SITES_LIST if s != "megakino"] result = _search_sites(primary_sites) if result: return result + # 3) If no index match, and the catalog is ready, try the DB-backed search + readiness_error = get_catalog_readiness_error() + if readiness_error is None: + providers = list(CATALOG_SITES_LIST) + preferred = [provider for provider in providers if provider != "megakino"] + fallback = [provider for provider in providers if provider == "megakino"] + try: + with Session(engine) as session: + for batch in (preferred, fallback): + if not batch: 
+ continue + rows = search_indexed_provider_titles( + session, + query=q, + providers=batch, + limit=1, + ) + if rows: + candidate = rows[0] + cand_score = _score_indexed_db_candidate( + session, query=q, candidate=candidate + ) + if cand_score >= _MIN_TITLE_MATCH_SCORE: + return (candidate.provider, candidate.slug) + except OperationalError as exc: + logger.debug( + "Skipping indexed DB lookup because catalog tables are unavailable: {}", + exc, + ) + + # 3) Megakino-specific direct slug/fallback handling if "megakino" in CATALOG_SITES_LIST or "megakino" in _PROVIDER_CACHE: raw = (q or "").strip() direct_slug = _extract_slug(raw, "megakino") diff --git a/apps/api/tests/conftest.py b/apps/api/tests/conftest.py index 39235f52..8886999a 100644 --- a/apps/api/tests/conftest.py +++ b/apps/api/tests/conftest.py @@ -91,6 +91,11 @@ def client(tmp_path, monkeypatch): "app.config", "app.db", "app.db.models", + "app.catalog", + "app.catalog.exceptions", + "app.catalog.indexer", + "app.catalog.metadata", + "app.catalog.providers", "app.core.strm_proxy", "app.core.strm_proxy.auth", "app.core.strm_proxy.cache", @@ -123,7 +128,12 @@ def client(tmp_path, monkeypatch): create_db_and_tables() - monkeypatch.setattr(qb_torrents, "schedule_download", lambda req: "job-1") + def _schedule_download(req, *, autostart=True): + del req, autostart + return "job-1" + + monkeypatch.setattr(qb_torrents, "schedule_download", _schedule_download) + monkeypatch.setattr(qb_torrents, "start_scheduled_job", lambda job_id, req: None) monkeypatch.setattr(qb_torrents, "cancel_job", lambda job_id: None) with TestClient(app) as c: diff --git a/apps/api/tests/integration/api/qbittorrent/test_more.py b/apps/api/tests/integration/api/qbittorrent/test_more.py index 81e113eb..fae5fb1f 100644 --- a/apps/api/tests/integration/api/qbittorrent/test_more.py +++ b/apps/api/tests/integration/api/qbittorrent/test_more.py @@ -88,3 +88,27 @@ def test_sync_maindata_state_mapping(client): assert ts["h1"]["state"] == 
"uploading" assert ts["h2"]["state"] == "error" assert ts["h3"]["state"] == "pausedDL" + + +def test_sync_maindata_preserves_queued_state(client): + from sqlmodel import Session + from app.db import engine, create_job, upsert_client_task + + with Session(engine) as s: + job = create_job(s) + upsert_client_task( + s, + hash="queued-hash", + name="Queued", + slug="slug", + season=1, + episode=4, + language="German Dub", + save_path=None, + category=None, + job_id=job.id, + state="queued", + ) + + data = client.get("/api/v2/sync/maindata").json() + assert data["torrents"]["queued-hash"]["state"] == "queuedDL" diff --git a/apps/api/tests/integration/api/qbittorrent/test_torrents.py b/apps/api/tests/integration/api/qbittorrent/test_torrents.py index 4ce10341..0c616cc6 100644 --- a/apps/api/tests/integration/api/qbittorrent/test_torrents.py +++ b/apps/api/tests/integration/api/qbittorrent/test_torrents.py @@ -71,3 +71,140 @@ def test_torrents_add_aw_and_sto_prefixes(client): info = client.get("/api/v2/torrents/info") items = info.json() assert len(items) == 2 + + +def test_torrents_add_starts_worker_after_task_write(client, monkeypatch): + from app.utils.magnet import build_magnet + import app.api.qbittorrent.torrents as qb_torrents + + calls: list[tuple[str, str]] = [] + + def _schedule_download(req, *, autostart=True): + del req + calls.append(("schedule", "autostart" if autostart else "deferred")) + return "job-1" + + monkeypatch.setattr( + qb_torrents, + "schedule_download", + _schedule_download, + ) + monkeypatch.setattr( + qb_torrents, + "start_scheduled_job", + lambda job_id, req: calls.append(("start", job_id)), + ) + + magnet = build_magnet( + title="Title", + slug="slug", + season=1, + episode=1, + language="German Dub", + ) + response = client.post("/api/v2/torrents/add", data={"urls": magnet}) + + assert response.status_code == 200 + assert calls == [("schedule", "deferred"), ("start", "job-1")] + + +def 
test_torrents_info_preserves_queued_state_for_paused_add(client): + from app.utils.magnet import build_magnet + + magnet = build_magnet( + title="Queued Title", + slug="queued-title", + season=1, + episode=1, + language="German Dub", + ) + + response = client.post( + "/api/v2/torrents/add", data={"urls": magnet, "paused": "true"} + ) + + assert response.status_code == 200 + + info = client.get("/api/v2/torrents/info").json() + assert info[0]["state"] == "queuedDL" + + +def test_torrents_resume_starts_paused_job_with_original_request(client, monkeypatch): + from app.utils.magnet import build_magnet + import app.api.qbittorrent.torrents as qb_torrents + from app.db import create_job, engine + from sqlmodel import Session + + started: list[tuple[str, dict]] = [] + + def create_queued_job(req, *, autostart=True): + del autostart + with Session(engine) as session: + return create_job(session, source_site=req["site"]).id + + monkeypatch.setattr(qb_torrents, "schedule_download", create_queued_job) + monkeypatch.setattr( + qb_torrents, + "start_scheduled_job", + lambda job_id, req: started.append((job_id, req)), + ) + magnet = build_magnet( + title="Queued STRM", + slug="queued-strm", + season=2, + episode=3, + language="German Dub", + provider="VOE", + mode="strm", + ) + + add_response = client.post( + "/api/v2/torrents/add", + data={"urls": magnet, "paused": "true"}, + ) + torrent_hash = client.get("/api/v2/torrents/info").json()[0]["hash"] + resume_response = client.post( + "/api/v2/torrents/resume", + data={"hashes": torrent_hash}, + ) + + assert add_response.status_code == 200 + assert resume_response.status_code == 200 + assert len(started) == 1 + assert started[0][1] == { + "slug": "queued-strm", + "season": 2, + "episode": 3, + "language": "German Dub", + "site": "aniworld.to", + "title_hint": "Queued STRM", + "provider": "VOE", + "mode": "strm", + } + info = client.get("/api/v2/torrents/info").json() + assert info[0]["state"] == "downloading" + + +def 
test_torrents_add_returns_500_when_start_fails(client, monkeypatch): + from app.utils.magnet import build_magnet + import app.api.qbittorrent.torrents as qb_torrents + + monkeypatch.setattr( + qb_torrents, + "start_scheduled_job", + lambda job_id, req: (_ for _ in ()).throw(RuntimeError("boom")), + ) + + magnet = build_magnet( + title="Title", + slug="slug", + season=1, + episode=1, + language="German Dub", + ) + response = client.post("/api/v2/torrents/add", data={"urls": magnet}) + + assert response.status_code == 500 + + info = client.get("/api/v2/torrents/info").json() + assert info[0]["state"] == "error" diff --git a/apps/api/tests/integration/api/test_health.py b/apps/api/tests/integration/api/test_health.py index b8a9dd0f..ed38224f 100644 --- a/apps/api/tests/integration/api/test_health.py +++ b/apps/api/tests/integration/api/test_health.py @@ -2,3 +2,12 @@ def test_health_endpoint(client): r = client.get("/health") assert r.status_code == 200 assert r.json().get("status") == "ok" + assert "catalog" in r.json() + + +def test_catalog_health_endpoint(client): + r = client.get("/health/catalog") + assert r.status_code == 200 + payload = r.json() + assert "bootstrap_ready" in payload + assert "providers" in payload diff --git a/apps/api/tests/integration/api/torznab/test_api.py b/apps/api/tests/integration/api/torznab/test_api.py index 98eb7f24..69d559b0 100644 --- a/apps/api/tests/integration/api/torznab/test_api.py +++ b/apps/api/tests/integration/api/torznab/test_api.py @@ -1,5 +1,136 @@ +from __future__ import annotations + import xml.etree.ElementTree as ET +from sqlmodel import Session + + +def _seed_ready_tv_catalog( + *, + canonical_title: str, + query_aliases: list[str], + provider_title: str | None = None, + slug: str = "slug", + tvdb_id: int = 12345, + episode_mappings: list[tuple[int, int, int, int]] | None = None, +) -> None: + from app.db import ( + engine, + replace_canonical_episodes, + replace_provider_catalog_aliases, + 
replace_provider_catalog_episodes, + replace_provider_catalog_title, + replace_provider_episode_mappings, + replace_provider_series_mappings, + upsert_canonical_series, + upsert_provider_index_status, + ) + + generation = f"gen-{slug}" + mapped_episodes = episode_mappings or [(1, 1, 1, 1)] + provider_title = provider_title or canonical_title + + with Session(engine) as session: + for provider in ("aniworld.to", "s.to", "megakino"): + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=24.0, + status="ready", + current_generation=generation, + latest_success_generation=generation, + bootstrap_completed=True, + ) + replace_provider_catalog_title( + session, + provider="aniworld.to", + slug=slug, + title=provider_title, + media_type_hint="series", + relative_path=f"/anime/stream/{slug}", + indexed_generation=generation, + ) + replace_provider_catalog_aliases( + session, + provider="aniworld.to", + slug=slug, + aliases=[provider_title, *query_aliases], + indexed_generation=generation, + ) + replace_provider_catalog_episodes( + session, + provider="aniworld.to", + slug=slug, + indexed_generation=generation, + episodes=[ + { + "season": provider_season, + "episode": provider_episode, + "relative_path": f"/anime/stream/{slug}/staffel-{provider_season}/episode-{provider_episode}", + "title_primary": f"Episode {canonical_episode}", + "title_secondary": None, + "media_type_hint": "episode", + "languages": [ + {"language": "German Sub", "host_hints": ["VOE"]}, + ], + } + for provider_season, provider_episode, _canonical_season, canonical_episode in mapped_episodes + ], + ) + upsert_canonical_series( + session, + tvdb_id=tvdb_id, + title=canonical_title, + imdb_id=f"tt{tvdb_id:07d}", + aliases=query_aliases, + ) + replace_canonical_episodes( + session, + tvdb_id=tvdb_id, + episodes=[ + { + "season": canonical_season, + "episode": canonical_episode, + "title": f"Episode {canonical_episode}", + } + for _provider_season, _provider_episode, 
canonical_season, canonical_episode in mapped_episodes + ], + ) + replace_provider_series_mappings( + session, + provider="aniworld.to", + slug=slug, + indexed_generation=generation, + mappings=[ + { + "tvdb_id": tvdb_id, + "confidence": "confirmed", + "source": "title", + "rationale": "test", + } + ], + ) + replace_provider_episode_mappings( + session, + provider="aniworld.to", + slug=slug, + indexed_generation=generation, + mappings=[ + { + "provider_season": provider_season, + "provider_episode": provider_episode, + "tvdb_id": tvdb_id, + "canonical_season": canonical_season, + "canonical_episode": canonical_episode, + "confidence": "confirmed", + "source": "direct_numbering", + "rationale": "test", + } + for provider_season, provider_episode, canonical_season, canonical_episode in mapped_episodes + ], + ) + session.commit() + def test_caps(client): resp = client.get("/torznab/api", params={"t": "caps"}) @@ -7,52 +138,112 @@ def test_caps(client): ET.fromstring(resp.text) -def test_search(client): +def test_search(client, monkeypatch): + import app.api.torznab.api as torznab_api + + monkeypatch.setattr( + torznab_api, + "_ready_provider_title_indexes", + lambda session, *, providers: providers, + ) resp = client.get("/torznab/api", params={"t": "search", "q": "test"}) assert resp.status_code == 200 root = ET.fromstring(resp.text) assert root.find("channel") is not None -def test_tvsearch_happy_path(client, monkeypatch): - import app.api.torznab as tn +def test_movie_search_uses_megakino_readiness_only(client, monkeypatch): + import app.api.torznab.api as torznab_api + from app.db import engine, upsert_provider_index_status - class Rec: - available = True - is_fresh = True - height = 1080 - vcodec = "h264" - provider = "prov" + called: list[list[str]] = [] + + def fake_indexed_preview_results(**kwargs): + called.append(kwargs["providers"]) + + with Session(engine) as session: + upsert_provider_index_status( + session, + provider="megakino", + 
refresh_interval_hours=24.0, + status="ready", + latest_success_generation="gen-movie", + current_generation="gen-movie", + bootstrap_completed=True, + title_index_status="ready", + ) + upsert_provider_index_status( + session, + provider="aniworld.to", + refresh_interval_hours=24.0, + status="pending", + title_index_status="pending", + ) - # Return (site, slug) tuple for new multi-site API - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "slug") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": Rec(), + torznab_api, + "_indexed_preview_results", + fake_indexed_preview_results, ) + + resp = client.get("/torznab/api", params={"t": "movie", "q": "movie"}) + + assert resp.status_code == 200 + assert called == [["megakino"]] + + +def test_generic_search_ignores_unqueried_megakino_readiness(client, monkeypatch): + import app.api.torznab.api as torznab_api + from app.db import engine, upsert_provider_index_status + + called: list[list[str]] = [] + + def fake_indexed_preview_results(**kwargs): + called.append(kwargs["providers"]) + return 1 + + with Session(engine) as session: + for provider in ("aniworld.to", "s.to"): + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=24.0, + status="ready", + latest_success_generation=f"gen-{provider}", + current_generation=f"gen-{provider}", + bootstrap_completed=True, + title_index_status="ready", + ) + upsert_provider_index_status( + session, + provider="megakino", + refresh_interval_hours=24.0, + status="pending", + title_index_status="pending", + ) + monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, season, episode, 
height, vcodec, language, site="aniworld.to": ( - "Title" - ), + torznab_api, + "_indexed_preview_results", + fake_indexed_preview_results, ) + + resp = client.get("/torznab/api", params={"t": "search", "q": "anime"}) + + assert resp.status_code == 200 + assert len(called) == 1 + assert set(called[0]) == {"aniworld.to", "s.to"} + + +def test_tvsearch_happy_path(client, monkeypatch): + import app.api.torznab as tn + + _seed_ready_tv_catalog(canonical_title="Series", query_aliases=["foo"]) monkeypatch.setattr( tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - "magnet:?xt=urn:btih:test&dn=Title&aw_slug=slug&aw_s=1&aw_e=1&aw_lang=German+Sub&aw_site=aniworld.to" + "probe_episode_quality", + lambda **_kwargs: (_ for _ in ()).throw( + AssertionError("unexpected live probe") ), ) @@ -86,63 +277,23 @@ def test_tvsearch_uses_id_resolved_query_when_q_missing(client, monkeypatch): import app.api.torznab as tn import app.api.torznab.api as torznab_api_mod - class Rec: - available = True - is_fresh = True - height = 1080 - vcodec = "h264" - provider = "prov" - - seen = {"query": None} - - def _slug_from_query(query, site=None): - """ - Record the provided query in the shared `seen` mapping and return a fixed (site, slug) pair. - - Parameters: - query (str): The query string to record. - site (str | None): Optional site hint (unused by this stub). - - Returns: - tuple: A two-element tuple (site, slug) where `site` is `"aniworld.to"` and `slug` is `"slug"`. - - Side effects: - Mutates the `seen` mapping by setting `seen["query"] = query`. 
- """ - seen["query"] = query - return ("aniworld.to", "slug") - + _seed_ready_tv_catalog( + canonical_title="The Rookie", + query_aliases=[], + provider_title="The Rookie", + slug="the-rookie", + tvdb_id=350665, + ) monkeypatch.setattr( torznab_api_mod, "_resolve_tvsearch_query_from_ids", lambda **_kwargs: "The Rookie", ) - monkeypatch.setattr(tn, "_slug_from_query", _slug_from_query) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) monkeypatch.setattr( tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": Rec(), - ) - monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, season, episode, height, vcodec, language, site="aniworld.to": ( - "Title" - ), - ) - monkeypatch.setattr( - tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - "magnet:?xt=urn:btih:test&dn=Title&aw_slug=slug&aw_s=1&aw_e=1&aw_lang=German+Sub&aw_site=aniworld.to" + "probe_episode_quality", + lambda **_kwargs: (_ for _ in ()).throw( + AssertionError("unexpected live probe") ), ) @@ -153,61 +304,25 @@ def _slug_from_query(query, site=None): assert resp.status_code == 200 root = ET.fromstring(resp.text) assert root.find("./channel/item") is not None - assert seen["query"] == "The Rookie" def test_tvsearch_season_search_emits_multiple_episodes(client, monkeypatch) -> None: - """Emit one item per discovered season episode in season-search mode.""" import app.api.torznab as tn - import app.api.torznab.api as torznab_api_mod - - class Rec: - available = True - is_fresh = True - height = 1080 - vcodec = "h264" - provider = "prov" + import app.api.torznab.api as torznab_api - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "slug") - ) - 
monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr( - torznab_api_mod, "_metadata_episode_numbers_for_season", lambda **_kwargs: [] - ) - monkeypatch.setattr(torznab_api_mod, "STRM_FILES_MODE", "no") - monkeypatch.setattr( - tn, - "list_cached_episode_numbers_for_season", - lambda session, slug, season, site="aniworld.to": [1, 2, 3], - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": Rec(), - ) - monkeypatch.setattr(tn, "upsert_availability", lambda *args, **kwargs: None) - monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, season, episode, height, vcodec, language, site="aniworld.to": ( - f"Title S{int(season):02d}E{int(episode):02d}" - ), + _seed_ready_tv_catalog( + canonical_title="Series", + query_aliases=["foo"], + episode_mappings=[(1, 1, 1, 1), (1, 2, 1, 2), (1, 3, 1, 3)], ) monkeypatch.setattr( tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - f"magnet:?xt=urn:btih:test&dn=Title&aw_slug={slug}&aw_s={season}&aw_e={episode}&aw_lang=German+Sub&aw_site={site}" + "probe_episode_quality", + lambda **_kwargs: (_ for _ in ()).throw( + AssertionError("unexpected live probe") ), ) + monkeypatch.setattr(torznab_api, "STRM_FILES_MODE", "no") resp = client.get( "/torznab/api", @@ -233,96 +348,22 @@ class Rec: def test_tvsearch_season_search_fallback_stops_on_consecutive_misses( client, monkeypatch ) -> None: - """Stop strict fallback probing after configured consecutive misses.""" import app.api.torznab as tn - import app.api.torznab.api as torznab_api_mod + import app.api.torznab.api as torznab_api - class Rec: - def __init__(self, height=1080, vcodec="h264", provider="VOE"): - self.available = 
True - self.is_fresh = True - self.height = height - self.vcodec = vcodec - self.provider = provider - - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "slug") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr( - torznab_api_mod, "_metadata_episode_numbers_for_season", lambda **_kwargs: [] - ) - monkeypatch.setattr(torznab_api_mod, "TORZNAB_SEASON_SEARCH_MODE", "strict") - monkeypatch.setattr(torznab_api_mod, "STRM_FILES_MODE", "no") - monkeypatch.setattr( - tn, - "list_cached_episode_numbers_for_season", - lambda session, slug, season, site="aniworld.to": [], - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - - cached: dict[tuple[int, int, str], Rec] = {} - - def _get_availability(session, slug, season, episode, language, site="aniworld.to"): - _ = (session, slug, site) - return cached.get((season, episode, language)) - - monkeypatch.setattr(tn, "get_availability", _get_availability) - - probe_calls: list[int] = [] - - def _probe_quality(slug, season, episode, language, site="aniworld.to", **_kwargs): - _ = (slug, language, site) - probe_calls.append(episode) - if episode in (1, 2): - return (True, 1080, "h264", "VOE", {}) - return (False, None, None, None, None) - - monkeypatch.setattr(tn, "probe_episode_quality", _probe_quality) - - def _upsert_availability( - session, - slug, - season, - episode, - language, - available, - height=None, - vcodec=None, - provider=None, - extra=None, - site="aniworld.to", - ): - _ = (session, slug, extra, site) - if available: - cached[(season, episode, language)] = Rec(height, vcodec, provider) - - monkeypatch.setattr(tn, "upsert_availability", _upsert_availability) - monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, season, episode, height, vcodec, language, site="aniworld.to": ( - f"Title 
S{int(season):02d}E{int(episode):02d}" - ), + _seed_ready_tv_catalog( + canonical_title="Series", + query_aliases=["foo"], + episode_mappings=[(1, 1, 1, 1), (1, 2, 1, 2)], ) monkeypatch.setattr( tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - f"magnet:?xt=urn:btih:test&dn=Title&aw_slug={slug}&aw_s={season}&aw_e={episode}&aw_lang=German+Sub&aw_site={site}" + "probe_episode_quality", + lambda **_kwargs: (_ for _ in ()).throw( + AssertionError("unexpected live probe") ), ) - - monkeypatch.setattr(torznab_api_mod, "TORZNAB_SEASON_SEARCH_MAX_EPISODES", 10) - monkeypatch.setattr( - torznab_api_mod, "TORZNAB_SEASON_SEARCH_MAX_CONSECUTIVE_MISSES", 2 - ) + monkeypatch.setattr(torznab_api, "STRM_FILES_MODE", "no") resp = client.get( "/torznab/api", @@ -332,63 +373,25 @@ def _upsert_availability( root = ET.fromstring(resp.text) items = root.findall("./channel/item") assert len(items) == 2 - assert probe_calls == [1, 2, 3, 4] def test_tvsearch_ep_zero_is_treated_as_season_search(client, monkeypatch) -> None: - """Treat ep=0 as season-search and emit multiple season episode items.""" import app.api.torznab as tn - import app.api.torznab.api as torznab_api_mod - - class Rec: - available = True - is_fresh = True - height = 1080 - vcodec = "h264" - provider = "prov" + import app.api.torznab.api as torznab_api - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "slug") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr(torznab_api_mod, "TORZNAB_SEASON_SEARCH_MODE", "fast") - monkeypatch.setattr(torznab_api_mod, "STRM_FILES_MODE", "no") - monkeypatch.setattr( - torznab_api_mod, - "_metadata_episode_numbers_for_season", - lambda **_kwargs: [1, 2], - ) - monkeypatch.setattr( - tn, - "list_cached_episode_numbers_for_season", - lambda session, slug, season, site="aniworld.to": [], - ) - monkeypatch.setattr( - tn, 
- "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": Rec(), - ) - monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, season, episode, height, vcodec, language, site="aniworld.to": ( - f"Title S{int(season):02d}E{int(episode):02d}" - ), + _seed_ready_tv_catalog( + canonical_title="Series", + query_aliases=["foo"], + episode_mappings=[(1, 1, 1, 1), (1, 2, 1, 2)], ) monkeypatch.setattr( tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - f"magnet:?xt=urn:btih:test&dn=Title&aw_slug={slug}&aw_s={season}&aw_e={episode}&aw_lang=German+Sub&aw_site={site}" + "probe_episode_quality", + lambda **_kwargs: (_ for _ in ()).throw( + AssertionError("unexpected live probe") ), ) + monkeypatch.setattr(torznab_api, "STRM_FILES_MODE", "no") resp = client.get( "/torznab/api", @@ -411,67 +414,22 @@ class Rec: def test_tvsearch_fast_season_mode_avoids_live_probe(client, monkeypatch) -> None: - """Avoid live quality probing when fast season-search mode is enabled.""" import app.api.torznab as tn - import app.api.torznab.api as torznab_api_mod - - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "slug") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr(torznab_api_mod, "TORZNAB_SEASON_SEARCH_MODE", "fast") - monkeypatch.setattr(torznab_api_mod, "STRM_FILES_MODE", "no") - monkeypatch.setattr( - torznab_api_mod, - "_metadata_episode_numbers_for_season", - lambda **_kwargs: [1, 2], - ) - monkeypatch.setattr( - tn, - "list_cached_episode_numbers_for_season", - lambda session, slug, season, site="aniworld.to": [], - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, 
episode, site="aniworld.to": [], - ) - monkeypatch.setattr( - torznab_api_mod, - "_discover_episode_languages_for_fast_season_mode", - lambda **_kwargs: ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": None, - ) - - probe_calls: list[tuple[int, str]] = [] + import app.api.torznab.api as torznab_api - def _probe_quality(slug, season, episode, language, site="aniworld.to", **_kwargs): - _ = (slug, site) - probe_calls.append((episode, language)) - return (True, 1080, "h264", "VOE", {}) - - monkeypatch.setattr(tn, "probe_episode_quality", _probe_quality) - - monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, season, episode, height, vcodec, language, site="aniworld.to": ( - f"Title S{int(season):02d}E{int(episode):02d}" - ), + _seed_ready_tv_catalog( + canonical_title="Series", + query_aliases=["foo"], + episode_mappings=[(1, 1, 1, 1), (1, 2, 1, 2)], ) monkeypatch.setattr( tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - f"magnet:?xt=urn:btih:test&dn=Title&aw_slug={slug}&aw_s={season}&aw_e={episode}&aw_lang={language.replace(' ', '+')}&aw_site={site}" + "probe_episode_quality", + lambda **_kwargs: (_ for _ in ()).throw( + AssertionError("unexpected live probe") ), ) + monkeypatch.setattr(torznab_api, "STRM_FILES_MODE", "no") resp = client.get( "/torznab/api", @@ -481,61 +439,17 @@ def _probe_quality(slug, season, episode, language, site="aniworld.to", **_kwarg root = ET.fromstring(resp.text) items = root.findall("./channel/item") assert len(items) == 2 - assert probe_calls == [] def test_tvsearch_season_search_limit_is_hard_item_cap(client, monkeypatch) -> None: - """Cap season-search output by item limit across episode/language variants.""" - import app.api.torznab as tn - import app.api.torznab.api as torznab_api_mod - - class Rec: - available = True - is_fresh = True - height = 1080 - 
vcodec = "h264" - provider = "prov" + import app.api.torznab.api as torznab_api - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "slug") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Series" - ) - monkeypatch.setattr( - torznab_api_mod, "_metadata_episode_numbers_for_season", lambda **_kwargs: [] - ) - monkeypatch.setattr( - tn, - "list_cached_episode_numbers_for_season", - lambda session, slug, season, site="aniworld.to": [1, 2], - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": Rec(), - ) - monkeypatch.setattr(tn, "upsert_availability", lambda *args, **kwargs: None) - monkeypatch.setattr( - tn, - "build_release_name", - lambda series_title, season, episode, height, vcodec, language, site="aniworld.to": ( - f"Title S{int(season):02d}E{int(episode):02d}" - ), - ) - monkeypatch.setattr( - tn, - "build_magnet", - lambda title, slug, season, episode, language, provider, site="aniworld.to", **_kwargs: ( - f"magnet:?xt=urn:btih:test&dn=Title&aw_slug={slug}&aw_s={season}&aw_e={episode}&aw_lang=German+Sub&aw_site={site}" - ), + monkeypatch.setattr(torznab_api, "STRM_FILES_MODE", "both") + _seed_ready_tv_catalog( + canonical_title="Series", + query_aliases=["foo"], + episode_mappings=[(1, 1, 1, 1), (1, 2, 1, 2), (1, 3, 1, 3)], ) - monkeypatch.setattr(torznab_api_mod, "STRM_FILES_MODE", "both") resp = client.get( "/torznab/api", diff --git a/apps/api/tests/integration/api/torznab/test_indexed_catalog.py b/apps/api/tests/integration/api/torznab/test_indexed_catalog.py new file mode 100644 index 00000000..31f70dfa --- /dev/null +++ b/apps/api/tests/integration/api/torznab/test_indexed_catalog.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +from sqlmodel import 
Session + + +def _seed_ready_catalog() -> None: + from app.db import ( + engine, + replace_canonical_episodes, + replace_provider_catalog_aliases, + replace_provider_catalog_episodes, + replace_provider_catalog_title, + replace_provider_episode_mappings, + replace_provider_series_mappings, + upsert_canonical_series, + upsert_provider_index_status, + ) + + generation = "gen-1" + with Session(engine) as session: + for provider in ("aniworld.to", "s.to", "megakino"): + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=24.0, + status="ready", + current_generation=generation, + latest_success_generation=generation, + bootstrap_completed=True, + title_index_status="ready", + ) + replace_provider_catalog_title( + session, + provider="aniworld.to", + slug="kaguya-sama", + title="Kaguya-sama", + media_type_hint="series", + relative_path="/anime/stream/kaguya-sama", + indexed_generation=generation, + ) + replace_provider_catalog_aliases( + session, + provider="aniworld.to", + slug="kaguya-sama", + aliases=["Kaguya-sama", "Kaguya"], + indexed_generation=generation, + ) + replace_provider_catalog_episodes( + session, + provider="aniworld.to", + slug="kaguya-sama", + indexed_generation=generation, + episodes=[ + { + "season": 1, + "episode": 1, + "relative_path": "/anime/stream/kaguya-sama/staffel-1/episode-1", + "title_primary": "I Want To Be Invited To A Movie", + "title_secondary": None, + "media_type_hint": "episode", + "languages": [ + {"language": "German Sub", "host_hints": ["VOE"]}, + ], + } + ], + ) + upsert_canonical_series( + session, + tvdb_id=12345, + title="Kaguya-sama", + imdb_id="tt0000001", + aliases=["Kaguya"], + ) + replace_canonical_episodes( + session, + tvdb_id=12345, + episodes=[ + {"season": 1, "episode": 1, "title": "I Want To Be Invited To A Movie"} + ], + ) + replace_provider_series_mappings( + session, + provider="aniworld.to", + slug="kaguya-sama", + indexed_generation=generation, + mappings=[ + { + "tvdb_id": 
12345, + "confidence": "confirmed", + "source": "title", + "rationale": "test", + } + ], + ) + replace_provider_episode_mappings( + session, + provider="aniworld.to", + slug="kaguya-sama", + indexed_generation=generation, + mappings=[ + { + "provider_season": 1, + "provider_episode": 1, + "tvdb_id": 12345, + "canonical_season": 1, + "canonical_episode": 1, + "confidence": "confirmed", + "source": "direct_numbering", + "rationale": "test", + } + ], + ) + session.commit() + + +def test_search_returns_503_when_catalog_bootstrap_pending(client) -> None: + from app.db import engine, upsert_provider_index_status + + with Session(engine) as session: + upsert_provider_index_status( + session, + provider="aniworld.to", + refresh_interval_hours=24.0, + status="pending", + bootstrap_completed=False, + ) + + response = client.get("/torznab/api", params={"t": "search", "q": "Kaguya"}) + + assert response.status_code == 503 + assert "bootstrap" in response.json()["detail"].lower() + + +def test_search_test_result_bypasses_catalog_bootstrap(client, monkeypatch) -> None: + import app.api.torznab.api as torznab_api + + monkeypatch.setattr(torznab_api, "TORZNAB_RETURN_TEST_RESULT", True) + + response = client.get("/torznab/api", params={"t": "search"}) + + assert response.status_code == 200 + assert "" in response.text + + +def test_search_uses_indexed_catalog_without_live_probe(client, monkeypatch) -> None: + _seed_ready_catalog() + monkeypatch.setattr( + "app.utils.title_resolver.load_or_refresh_index", + lambda site="aniworld.to": (_ for _ in ()).throw( + AssertionError("unexpected live index refresh") + ), + ) + + response = client.get("/torznab/api", params={"t": "search", "q": "Kaguya"}) + + assert response.status_code == 200 + assert "Kaguya.sama.S01E01" in response.text + assert "aw_slug=kaguya-sama" in response.text + + +def test_tvsearch_uses_indexed_canonical_mapping(client, monkeypatch) -> None: + _seed_ready_catalog() + monkeypatch.setattr( + 
"app.utils.title_resolver.load_or_refresh_index", + lambda site="aniworld.to": (_ for _ in ()).throw( + AssertionError("unexpected live index refresh") + ), + ) + + response = client.get( + "/torznab/api", + params={"t": "tvsearch", "q": "Kaguya", "season": 1, "ep": 1}, + ) + + assert response.status_code == 200 + assert "Kaguya.sama.S01E01" in response.text + assert "aw_s=1" in response.text + assert "aw_e=1" in response.text diff --git a/apps/api/tests/integration/api/torznab/test_specials_mapping.py b/apps/api/tests/integration/api/torznab/test_specials_mapping.py index 6670452f..aee75a86 100644 --- a/apps/api/tests/integration/api/torznab/test_specials_mapping.py +++ b/apps/api/tests/integration/api/torznab/test_specials_mapping.py @@ -1,112 +1,127 @@ -import xml.etree.ElementTree as ET - - -def _fake_release_name( - series_title: str, - season: int, - episode: int, - height: int, - vcodec: str, - language: str, - site: str = "aniworld.to", -) -> str: - """ - Constructs a fake release name for a series episode using the provided season and episode numbers. - - Parameters: - series_title (str): Ignored; present for signature compatibility. - season (int): Season number; used and zero-padded to two digits in the result. - episode (int): Episode number; used and zero-padded to two digits in the result. - height (int): Ignored; present for signature compatibility. - vcodec (str): Ignored; present for signature compatibility. - language (str): Ignored; present for signature compatibility. - site (str): Ignored; present for signature compatibility. - - Returns: - str: A release-name string in the form "Kaguya.SXXEYY.1080p.WEB.H264.GER.SUB-ANIWORLD" - where XX is the zero-padded season and YY is the zero-padded episode. 
- """ - _ = (series_title, height, vcodec, language, site) - return ( - f"Kaguya.S{int(season):02d}E{int(episode):02d}.1080p.WEB.H264.GER.SUB-ANIWORLD" - ) - - -def _fake_magnet( - title: str, - slug: str, - season: int, - episode: int, - language: str, - provider: str | None, - site: str = "aniworld.to", - **_kwargs, -) -> str: - """ - Builds a fake magnet URI that encodes the provided slug, season, episode, and site as aw_* query parameters. - - Parameters: - slug (str): Series slug to include as the `aw_slug` query parameter. - season (int): Season index to include as the `aw_s` query parameter. - episode (int): Episode index to include as the `aw_e` query parameter. - site (str): Site identifier to include as the `aw_site` query parameter (default "aniworld.to"). - - Returns: - str: A magnet URI string containing `aw_slug`, `aw_s`, `aw_e`, and `aw_site` set to the provided values. - """ - _ = (title, language, provider) - return ( - f"magnet:?xt=urn:btih:test&dn=Kaguya" - f"&aw_slug={slug}&aw_s={season}&aw_e={episode}&aw_site={site}" - ) - - -def test_search_uses_special_mapping_alias_in_title(client, monkeypatch): - """ - Verify that a torznab search result applies a special episode mapping alias to the item title and enclosure parameters. +from __future__ import annotations - Performs a search request and asserts a 200 response, that an item exists whose title contains "S00E05", and that the enclosure URL includes "aw_s=0" and "aw_e=4". 
- """ - import app.api.torznab as tn - import app.api.torznab.api as torznab_api - from app.providers.aniworld.specials import SpecialEpisodeMapping +import xml.etree.ElementTree as ET - monkeypatch.setattr(torznab_api, "ANIBRIDGE_TEST_MODE", False) - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "kaguya") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Kaguya-sama" - ) - monkeypatch.setattr( - torznab_api, - "resolve_special_mapping_from_query", - lambda **_kwargs: SpecialEpisodeMapping( - source_season=0, - source_episode=4, - alias_season=0, - alias_episode=5, - metadata_title="special title", - metadata_tvdb_id=12345, - ), - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "probe_episode_quality", - lambda **_kwargs: (True, 1080, "h264", "VOE", {}), - ) - monkeypatch.setattr(tn, "upsert_availability", lambda *args, **kwargs: None) - monkeypatch.setattr(tn, "build_release_name", _fake_release_name) - monkeypatch.setattr(tn, "build_magnet", _fake_magnet) +from sqlmodel import Session + + +def _seed_special_mapping_catalog(*, languages: list[str]) -> None: + from app.db import ( + engine, + replace_canonical_episodes, + replace_provider_catalog_aliases, + replace_provider_catalog_episodes, + replace_provider_catalog_title, + replace_provider_episode_mappings, + replace_provider_series_mappings, + upsert_canonical_series, + upsert_provider_index_status, + ) + + generation = "gen-special" + with Session(engine) as session: + for provider in ("aniworld.to", "s.to", "megakino"): + upsert_provider_index_status( + session, + provider=provider, + refresh_interval_hours=24.0, + status="ready", + current_generation=generation, + latest_success_generation=generation, + bootstrap_completed=True, + title_index_status="ready", + ) + replace_provider_catalog_title( + 
session, + provider="aniworld.to", + slug="kaguya", + title="Kaguya-sama", + media_type_hint="series", + relative_path="/anime/stream/kaguya", + indexed_generation=generation, + ) + replace_provider_catalog_aliases( + session, + provider="aniworld.to", + slug="kaguya", + aliases=["Kaguya-sama", "Kaguya"], + indexed_generation=generation, + ) + replace_provider_catalog_episodes( + session, + provider="aniworld.to", + slug="kaguya", + indexed_generation=generation, + episodes=[ + { + "season": 0, + "episode": 4, + "relative_path": "/anime/stream/kaguya/filme/film-4", + "title_primary": "special title", + "title_secondary": None, + "media_type_hint": "episode", + "languages": [ + {"language": language, "host_hints": ["VOE"]} + for language in languages + ], + } + ], + ) + upsert_canonical_series( + session, + tvdb_id=12345, + title="Kaguya-sama", + imdb_id="tt0000001", + aliases=["Kaguya"], + ) + replace_canonical_episodes( + session, + tvdb_id=12345, + episodes=[ + {"season": 0, "episode": 5, "title": "special title"}, + ], + ) + replace_provider_series_mappings( + session, + provider="aniworld.to", + slug="kaguya", + indexed_generation=generation, + mappings=[ + { + "tvdb_id": 12345, + "confidence": "confirmed", + "source": "title", + "rationale": "test", + } + ], + ) + replace_provider_episode_mappings( + session, + provider="aniworld.to", + slug="kaguya", + indexed_generation=generation, + mappings=[ + { + "provider_season": 0, + "provider_episode": 4, + "tvdb_id": 12345, + "canonical_season": 0, + "canonical_episode": 5, + "confidence": "confirmed", + "source": "special_alias", + "rationale": "test", + } + ], + ) + session.commit() + + +def test_search_uses_special_mapping_alias_in_title(client): + _seed_special_mapping_catalog(languages=["German Sub"]) resp = client.get( "/torznab/api", - params={"t": "search", "q": "Kaguya special title", "cat": "5070"}, + params={"t": "search", "q": "Kaguya", "cat": "5070"}, ) assert resp.status_code == 200 root = 
ET.fromstring(resp.text) @@ -123,73 +138,8 @@ def test_search_uses_special_mapping_alias_in_title(client, monkeypatch): def test_tvsearch_falls_back_to_special_mapping_when_requested_episode_missing( client, - monkeypatch, -): - """ - Verifies that a tvsearch request falls back to a SpecialEpisodeMapping when the requested episode is unavailable and that the resulting item reflects the alias episode and mapped source. - - Asserts the response contains an item whose title includes "S00E05" (the alias episode) and whose enclosure URL includes the mapped source parameters `aw_s=0` and `aw_e=4`. - """ - import app.api.torznab as tn - import app.api.torznab.api as torznab_api - from app.providers.aniworld.specials import SpecialEpisodeMapping - - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "kaguya") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Kaguya-sama" - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": None, - ) - - def _probe_quality(slug, season, episode, language, site="aniworld.to", **_kwargs): - """ - Determine if the specified episode is available as a 1080p H.264 release and provide its metadata. - - Parameters: - slug (str): Series identifier (ignored by this probe). - season (int): Season number to probe. - episode (int): Episode number to probe. - language (str): Language tag (ignored by this probe). - site (str): Site identifier (ignored by this probe). - - Returns: - available (bool): `True` if the probe found a matching 1080p H.264 release, `False` otherwise. - height (int or None): Video height in pixels when available (1080), otherwise `None`. - vcodec (str or None): Video codec string when available ("h264"), otherwise `None`. 
- release_language (str or None): Release language tag when available ("VOE"), otherwise `None`. - extra (dict or None): Additional metadata (empty dict when available), otherwise `None`. - """ - _ = (slug, language, site) - if season == 0 and episode == 4: - return (True, 1080, "h264", "VOE", {}) - return (False, None, None, None, None) - - monkeypatch.setattr(tn, "probe_episode_quality", _probe_quality) - monkeypatch.setattr(tn, "upsert_availability", lambda *args, **kwargs: None) - monkeypatch.setattr(tn, "build_release_name", _fake_release_name) - monkeypatch.setattr(tn, "build_magnet", _fake_magnet) - monkeypatch.setattr( - torznab_api, - "resolve_special_mapping_from_episode_request", - lambda **_kwargs: SpecialEpisodeMapping( - source_season=0, - source_episode=4, - alias_season=0, - alias_episode=5, - metadata_title="special title", - metadata_tvdb_id=12345, - ), - ) +) -> None: + _seed_special_mapping_catalog(languages=["German Sub"]) resp = client.get( "/torznab/api", @@ -209,105 +159,39 @@ def _probe_quality(slug, season, episode, language, site="aniworld.to", **_kwarg def test_tvsearch_reuses_resolved_special_mapping_across_languages( - client, - monkeypatch, -): - """Verify a resolved special mapping is reused for subsequent languages.""" - import app.api.torznab as tn + client, monkeypatch +) -> None: import app.api.torznab.api as torznab_api - from app.providers.aniworld.specials import SpecialEpisodeMapping - - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "kaguya") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Kaguya-sama" - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": [ - "German Sub", - "English Sub", - ], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": None, - ) - monkeypatch.setattr(tn, 
"upsert_availability", lambda *args, **kwargs: None) - monkeypatch.setattr(tn, "build_release_name", _fake_release_name) - monkeypatch.setattr(tn, "build_magnet", _fake_magnet) - monkeypatch.setattr( - torznab_api, - "resolve_special_mapping_from_episode_request", - lambda **_kwargs: SpecialEpisodeMapping( - source_season=0, - source_episode=4, - alias_season=0, - alias_episode=5, - metadata_title="special title", - metadata_tvdb_id=12345, - ), - ) - probe_calls: list[tuple[int, int, str]] = [] - - def _probe_quality(slug, season, episode, language, site="aniworld.to", **_kwargs): - _ = (slug, site) - probe_calls.append((season, episode, language)) - if season == 0 and episode == 4: - return (True, 1080, "h264", "VOE", {}) - return (False, None, None, None, None) - - monkeypatch.setattr(tn, "probe_episode_quality", _probe_quality) + _seed_special_mapping_catalog(languages=["German Sub", "English Sub"]) + monkeypatch.setattr(torznab_api, "STRM_FILES_MODE", "no") resp = client.get( "/torznab/api", params={"t": "tvsearch", "q": "Kaguya", "season": 0, "ep": 5, "cat": "5070"}, ) assert resp.status_code == 200 - requested_calls = [c for c in probe_calls if c[0] == 0 and c[1] == 5] - assert len(requested_calls) == 1 - mapped_calls = [c for c in probe_calls if c[0] == 0 and c[1] == 4] - assert len(mapped_calls) == 2 - - -def test_tvsearch_guid_alias_suffix_only_when_alias_differs( - client, - monkeypatch, -): - import app.api.torznab as tn - - monkeypatch.setattr( - tn, "_slug_from_query", lambda q, site=None: ("aniworld.to", "kaguya") - ) - monkeypatch.setattr( - tn, "resolve_series_title", lambda slug, site="aniworld.to": "Kaguya-sama" - ) - monkeypatch.setattr( - tn, - "list_available_languages_cached", - lambda session, slug, season, episode, site="aniworld.to": ["German Sub"], - ) - monkeypatch.setattr( - tn, - "get_availability", - lambda session, slug, season, episode, language, site="aniworld.to": None, - ) - monkeypatch.setattr( - tn, - "probe_episode_quality", - 
lambda **_kwargs: (True, 1080, "h264", "VOE", {}), - ) - monkeypatch.setattr(tn, "upsert_availability", lambda *args, **kwargs: None) - monkeypatch.setattr(tn, "build_release_name", _fake_release_name) - monkeypatch.setattr(tn, "build_magnet", _fake_magnet) + root = ET.fromstring(resp.text) + items = root.findall("./channel/item") + assert len(items) == 2 + urls = [ + ( + item.find("enclosure").get("url") + if item.find("enclosure") is not None + else "" + ) + for item in items + ] + assert sum("aw_lang=German+Sub" in url for url in urls) == 1 + assert sum("aw_lang=English+Sub" in url for url in urls) == 1 + + +def test_tvsearch_guid_alias_suffix_only_when_alias_differs(client) -> None: + _seed_special_mapping_catalog(languages=["German Sub"]) resp = client.get( "/torznab/api", - params={"t": "tvsearch", "q": "Kaguya", "season": 1, "ep": 1, "cat": "5070"}, + params={"t": "tvsearch", "q": "Kaguya", "season": 0, "ep": 5, "cat": "5070"}, ) assert resp.status_code == 200 root = ET.fromstring(resp.text) diff --git a/apps/api/tests/unit/app/test_config.py b/apps/api/tests/unit/app/test_config.py index e97b0624..651c5c0a 100644 --- a/apps/api/tests/unit/app/test_config.py +++ b/apps/api/tests/unit/app/test_config.py @@ -99,7 +99,9 @@ def test_provider_redirect_settings(monkeypatch): import sys monkeypatch.setenv("PROVIDER_REDIRECT_TIMEOUT_SECONDS", "15") + monkeypatch.setenv("PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", "9.5") monkeypatch.setenv("PROVIDER_REDIRECT_RETRIES", "4") + monkeypatch.setenv("JOB_PROGRESS_FLUSH_SECONDS", "0.25") monkeypatch.setenv("PROVIDER_CHALLENGE_BACKOFF_SECONDS", "120") if "app.config" in sys.modules: @@ -110,7 +112,9 @@ def test_provider_redirect_settings(monkeypatch): cfg = importlib.reload(cfg) assert cfg.PROVIDER_REDIRECT_TIMEOUT_SECONDS == 15 + assert cfg.PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS == 9.5 assert cfg.PROVIDER_REDIRECT_RETRIES == 4 + assert cfg.JOB_PROGRESS_FLUSH_SECONDS == 0.25 assert cfg.PROVIDER_CHALLENGE_BACKOFF_SECONDS == 120 
monkeypatch.setenv("DOWNLOAD_RATE_LIMIT_BYTES_PER_SEC", "not-a-number") @@ -121,3 +125,40 @@ def test_provider_redirect_settings(monkeypatch): cfg = importlib.import_module("app.config") cfg = importlib.reload(cfg) assert cfg.DOWNLOAD_RATE_LIMIT_BYTES_PER_SEC == 0 + + +def test_runtime_home_defaults_to_data_dir_when_home_is_nonexistent( + monkeypatch, tmp_path +): + import importlib + import app + import sys + + monkeypatch.setenv("DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("DOWNLOAD_DIR", str(tmp_path / "downloads")) + monkeypatch.setenv("HOME", "/nonexistent") + + if "app.config" in sys.modules: + del sys.modules["app.config"] + if hasattr(app, "config"): + delattr(app, "config") + cfg = importlib.import_module("app.config") + cfg = importlib.reload(cfg) + + assert cfg.RUNTIME_HOME == (cfg.DATA_DIR / "home").resolve() + assert cfg.RUNTIME_HOME.exists() + assert cfg.RUNTIME_HOME == cfg.RUNTIME_HOME.resolve() + assert cfg.os.environ["HOME"] == str(cfg.RUNTIME_HOME) + + +def test_ensure_log_path_prefers_data_dir_env(monkeypatch, tmp_path): + from app.utils.logger import ensure_log_path + + monkeypatch.delenv("ANIBRIDGE_LOG_PATH", raising=False) + monkeypatch.setenv("DATA_DIR", str(tmp_path / "mounted-data")) + + log_path = ensure_log_path() + + assert log_path.parent == (tmp_path / "mounted-data").resolve() + assert log_path.parent.exists() + assert log_path.name.startswith("terminal-") diff --git a/apps/api/tests/unit/catalog/test_indexer.py b/apps/api/tests/unit/catalog/test_indexer.py new file mode 100644 index 00000000..e24ddfbf --- /dev/null +++ b/apps/api/tests/unit/catalog/test_indexer.py @@ -0,0 +1,681 @@ +from datetime import datetime, timedelta, timezone +from threading import Event, Thread +from types import SimpleNamespace + + +def test_catalog_scheduler_runs_immediately(monkeypatch): + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + calls: list[str] = [] + + def fake_run_due_once() -> None: + 
calls.append("called") + indexer._stop_event.set() + + monkeypatch.setattr(indexer, "run_due_once", fake_run_due_once) + + indexer._run_loop() + + assert calls == ["called"] + + +def test_catalog_scheduler_start_clears_previous_stop(monkeypatch): + import app.catalog.indexer as indexer_module + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + indexer._stop_event.set() + + monkeypatch.setattr(indexer_module, "ANIBRIDGE_TEST_MODE", False) + monkeypatch.setattr(indexer, "_ensure_status_rows", lambda: None) + monkeypatch.setattr(indexer, "_log_bootstrap_state", lambda: None) + monkeypatch.setattr(indexer, "_run_loop", lambda: None) + + indexer.start() + + assert not indexer._stop_event.is_set() + + +def test_catalog_progress_tracks_crawl_and_persist_counts(): + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + indexer._set_progress( + "aniworld.to", + phase="crawling_titles", + total_titles=10, + reset_log_steps=True, + ) + indexer._advance_crawl_progress( + "aniworld.to", + current_slug="slug-1", + queue_depth=3, + ) + indexer._advance_failed_progress( + "aniworld.to", + current_slug="slug-2", + queue_depth=2, + ) + indexer._advance_persist_progress( + "aniworld.to", + current_slug="slug-1", + count=1, + queue_depth=1, + ) + + assert indexer._get_total_titles("aniworld.to") == 10 + assert indexer._get_persisted_titles("aniworld.to") == 1 + assert indexer._get_failed_titles("aniworld.to") == 1 + assert indexer._get_writer_lag("aniworld.to") == 0 + + +def test_stale_generation_detection_handles_running_and_published_states(): + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + + assert ( + indexer._stale_generation( + SimpleNamespace( + status="running", + current_generation="gen-running", + latest_success_generation="gen-old", + ) + ) + == "gen-running" + ) + assert ( + indexer._stale_generation( + SimpleNamespace( + 
status="failed", + current_generation="gen-staging", + latest_success_generation="gen-old", + ) + ) + == "gen-staging" + ) + assert ( + indexer._stale_generation( + SimpleNamespace( + status="ready", + current_generation="gen-live", + latest_success_generation="gen-live", + ) + ) + is None + ) + + +def test_catalog_recovers_interrupted_running_state(monkeypatch): + import app.catalog.indexer as indexer_module + from app.catalog.indexer import ProviderCatalogIndexer + + updates: list[dict[str, object]] = [] + cleaned: list[tuple[str, str]] = [] + warnings: list[str] = [] + statuses = { + "aniworld.to": SimpleNamespace( + provider="aniworld.to", + status="running", + bootstrap_completed=False, + current_generation="staging-123", + latest_started_at=None, + latest_success_generation=None, + next_refresh_after=None, + failure_count=2, + cursor_title_slug="one-piece", + ), + "s.to": None, + "megakino": SimpleNamespace( + provider="megakino", + status="ready", + bootstrap_completed=True, + current_generation="abc123", + latest_started_at=None, + latest_success_generation="abc123", + next_refresh_after=None, + failure_count=0, + cursor_title_slug=None, + ), + } + + class FakeSession: + def __enter__(self): + return object() + + def __exit__(self, exc_type, exc, tb): + return False + + def fake_warning(message: str, *args) -> None: + warnings.append(message.format(*args)) + + def fake_get_provider_index_status(_session, provider: str): + return statuses[provider] + + def fake_upsert_provider_index_status(_session, **kwargs): + updates.append(kwargs) + return None + + def fake_delete_provider_generation(_session, *, provider: str, generation: str): + cleaned.append((provider, generation)) + + monkeypatch.setattr(indexer_module, "Session", lambda _engine: FakeSession()) + monkeypatch.setattr( + indexer_module, "get_provider_index_status", fake_get_provider_index_status + ) + monkeypatch.setattr( + indexer_module, "delete_provider_generation", fake_delete_provider_generation 
+ ) + monkeypatch.setattr( + indexer_module, + "upsert_provider_index_status", + fake_upsert_provider_index_status, + ) + monkeypatch.setattr(indexer_module.logger, "warning", fake_warning) + + ProviderCatalogIndexer()._ensure_status_rows() + + assert any( + "found interrupted staging generation for aniworld.to" in item + for item in warnings + ) + assert any("Initial bootstrap required" in item for item in warnings) + assert cleaned == [("aniworld.to", "staging-123")] + assert any( + update.get("provider") == "aniworld.to" and update.get("status") == "pending" + for update in updates + ) + assert any( + update.get("provider") == "s.to" and update.get("status") == "pending" + for update in updates + ) + + +def test_refresh_provider_starts_background_worker(monkeypatch): + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + called: list[str] = [] + + def fake_refresh(provider: str) -> None: + called.append(provider) + + monkeypatch.setattr(indexer, "_refresh_provider", fake_refresh) + + indexer.refresh_provider("aniworld.to") + indexer.stop() + + assert called == ["aniworld.to"] + + +def test_is_due_handles_naive_next_refresh_after(): + from app.catalog.indexer import ProviderCatalogIndexer + + status = SimpleNamespace( + status="ready", + latest_success_at=datetime.now(timezone.utc), + next_refresh_after=datetime(2000, 1, 1, 0, 0, 0), + ) + + assert ProviderCatalogIndexer()._is_due(status) is True + + +def test_failed_first_bootstrap_respects_future_retry_backoff(): + from datetime import timedelta + + from app.catalog.indexer import ProviderCatalogIndexer + from app.db import utcnow + + now = utcnow() + status = SimpleNamespace( + provider="aniworld.to", + status="failed", + latest_success_generation=None, + title_index_status="failed", + title_index_next_retry_after=now + timedelta(hours=2), + next_refresh_after=None, + ) + + assert ProviderCatalogIndexer()._is_due(status) is False + + +def 
test_pick_due_stage_prefers_detail_then_canonical(monkeypatch): + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + monkeypatch.setattr(indexer, "_detail_stage_has_due_work", lambda provider: True) + monkeypatch.setattr(indexer, "_canonical_stage_has_due_work", lambda provider: True) + + status = SimpleNamespace( + provider="aniworld.to", + status="partial", + latest_success_generation="gen-1", + title_index_status="ready", + detail_enrichment_status="pending", + detail_next_retry_after=None, + canonical_enrichment_status="pending", + canonical_next_retry_after=None, + next_refresh_after=None, + title_index_next_retry_after=None, + ) + + assert indexer._pick_due_stage(status) == "detail_enrichment" + + +def test_pick_due_stage_blocks_title_refresh_while_detail_retry_is_backed_off( + monkeypatch, +): + from datetime import timedelta + + from app.catalog.indexer import ProviderCatalogIndexer + from app.db import utcnow + + indexer = ProviderCatalogIndexer() + monkeypatch.setattr(indexer, "_detail_stage_has_due_work", lambda provider: True) + monkeypatch.setattr( + indexer, "_canonical_stage_has_due_work", lambda provider: False + ) + + status = SimpleNamespace( + provider="aniworld.to", + status="partial", + latest_success_generation="gen-1", + title_index_status="ready", + detail_enrichment_status="failed", + detail_next_retry_after=utcnow() + timedelta(hours=1), + canonical_enrichment_status="pending", + canonical_next_retry_after=None, + next_refresh_after=None, + title_index_next_retry_after=None, + ) + + assert indexer._pick_due_stage(status) is None + + +def test_title_index_failure_persists_even_when_writer_shutdown_raises(monkeypatch): + import app.catalog.indexer as indexer_module + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + recorded_errors: list[str] = [] + + monkeypatch.setattr( + indexer_module, + "load_provider_title_index", + lambda provider, 
observer=None: (_ for _ in ()).throw(RuntimeError("boom")), + ) + monkeypatch.setattr(indexer, "_refresh_interval_hours", lambda provider: 24.0) + monkeypatch.setattr( + indexer, + "_signal_title_index_writer_shutdown", + lambda **kwargs: (_ for _ in ()).throw(RuntimeError("queue full")), + ) + monkeypatch.setattr( + indexer, + "_ensure_title_index_writer_stopped", + lambda **kwargs: (_ for _ in ()).throw(RuntimeError("writer still alive")), + ) + monkeypatch.setattr(indexer, "_set_progress", lambda *args, **kwargs: None) + + def fake_writer_run(callback): + session = object() + return callback(session) + + monkeypatch.setattr(indexer._writer, "run", fake_writer_run) + monkeypatch.setattr( + indexer, + "_finish_title_index_failure", + lambda session, **kwargs: recorded_errors.append(kwargs["error"]), + ) + monkeypatch.setattr( + indexer_module, + "upsert_provider_index_status", + lambda session, **kwargs: None, + ) + + indexer._run_title_index_stage("aniworld.to") + + assert recorded_errors + assert "boom" in recorded_errors[0] + assert "queue full" in recorded_errors[0] + assert "writer still alive" in recorded_errors[0] + + +def test_progress_snapshot_exposes_staged_readiness(client): + from app.catalog.indexer import get_catalog_indexer + from app.db import engine, upsert_provider_index_status + from sqlmodel import Session + + with Session(engine) as session: + upsert_provider_index_status( + session, + provider="aniworld.to", + refresh_interval_hours=24.0, + status="partial", + latest_success_generation="gen-1", + current_generation="gen-1", + bootstrap_completed=True, + title_index_status="ready", + detail_enrichment_status="pending", + canonical_enrichment_status="failed", + ) + upsert_provider_index_status( + session, + provider="s.to", + refresh_interval_hours=24.0, + status="ready", + latest_success_generation="gen-1", + current_generation="gen-1", + bootstrap_completed=True, + title_index_status="ready", + detail_enrichment_status="ready", + 
canonical_enrichment_status="ready", + ) + upsert_provider_index_status( + session, + provider="megakino", + refresh_interval_hours=24.0, + status="ready", + latest_success_generation="gen-1", + current_generation="gen-1", + bootstrap_completed=True, + title_index_status="ready", + detail_enrichment_status="ready", + canonical_enrichment_status="ready", + ) + + snapshot = get_catalog_indexer().get_progress_snapshot() + by_provider = {item["provider"]: item for item in snapshot["providers"]} + + assert snapshot["bootstrap_ready"] is True + assert by_provider["aniworld.to"]["search_ready"] is True + assert by_provider["aniworld.to"]["full_ready"] is False + assert by_provider["aniworld.to"]["detail_enrichment_status"] == "pending" + assert by_provider["aniworld.to"]["canonical_enrichment_status"] == "failed" + + +def test_ensure_status_rows_backfills_legacy_ready_stage_fields(monkeypatch): + import app.catalog.indexer as indexer_module + from app.catalog.indexer import ProviderCatalogIndexer + from app.db import utcnow + + ready_at = utcnow() + legacy_status = SimpleNamespace( + bootstrap_completed=True, + latest_success_generation="gen-1", + title_index_status="pending", + latest_completed_at=ready_at, + latest_success_at=ready_at, + status="ready", + ) + recorded: list[dict[str, object]] = [] + + class FakeSession: + def __enter__(self): + return object() + + def __exit__(self, exc_type, exc, tb): + return False + + monkeypatch.setattr(indexer_module, "Session", lambda _engine: FakeSession()) + monkeypatch.setattr( + indexer_module, + "get_provider_index_status", + lambda _session, provider: legacy_status, + ) + + indexer = ProviderCatalogIndexer() + monkeypatch.setattr( + indexer._writer, + "run", + lambda callback: callback(object()), + ) + monkeypatch.setattr( + indexer_module, + "upsert_provider_index_status", + lambda _session, **kwargs: recorded.append(kwargs), + ) + + indexer._ensure_status_rows() + + assert any(item["title_index_status"] == "ready" for item 
in recorded) + assert any(item["detail_enrichment_status"] == "ready" for item in recorded) + assert any(item["canonical_enrichment_status"] == "ready" for item in recorded) + + +def test_write_coordinator_serializes_callbacks(monkeypatch): + import app.catalog.indexer as indexer_module + from app.catalog.indexer import CatalogIndexWriteCoordinator + + started = Event() + release = Event() + order: list[str] = [] + + class FakeSession: + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def commit(self): + return None + + monkeypatch.setattr(indexer_module, "Session", lambda _engine: FakeSession()) + + coordinator = CatalogIndexWriteCoordinator() + + def first() -> None: + coordinator.run( + lambda _session: (order.append("first"), started.set(), release.wait(1)) + ) + + def second() -> None: + started.wait(1) + coordinator.run(lambda _session: order.append("second")) + + t1 = Thread(target=first) + t2 = Thread(target=second) + t1.start() + t2.start() + started.wait(1) + assert order == ["first"] + release.set() + t1.join() + t2.join() + + assert order == ["first", "second"] + + +def test_detail_stage_persists_one_title_incrementally(client): + from app.catalog.indexer import ProviderCatalogIndexer + from app.catalog.providers import EpisodeLanguageRecord, EpisodeRecord, TitleRecord + from app.db import ( + ProviderCatalogEpisode, + ProviderTitleIndexState, + engine, + replace_provider_catalog_title, + upsert_provider_index_status, + upsert_provider_title_index_state, + ) + from sqlmodel import Session, select + + with Session(engine) as session: + upsert_provider_index_status( + session, + provider="aniworld.to", + refresh_interval_hours=24.0, + status="partial", + latest_success_generation="gen-1", + current_generation="gen-1", + bootstrap_completed=True, + title_index_status="ready", + detail_enrichment_status="pending", + canonical_enrichment_status="pending", + ) + replace_provider_catalog_title( + session, + 
provider="aniworld.to", + slug="demo", + title="Demo", + media_type_hint="series", + relative_path="/anime/stream/demo", + indexed_generation="gen-1", + ) + upsert_provider_title_index_state( + session, + provider="aniworld.to", + slug="demo", + detail_status="pending", + ) + session.commit() + + indexer = ProviderCatalogIndexer() + indexer._persist_stage_success( + provider="aniworld.to", + stage="detail_enrichment", + title_row=SimpleNamespace(slug="demo"), + payload=TitleRecord( + provider="aniworld.to", + slug="demo", + title="Demo", + aliases=["Demo"], + media_type_hint="series", + relative_path="/anime/stream/demo", + episodes=[ + EpisodeRecord( + season=1, + episode=1, + relative_path="/anime/stream/demo/staffel-1/episode-1", + title_primary="Episode 1", + title_secondary=None, + media_type_hint="episode", + languages=[ + EpisodeLanguageRecord( + language="German Dub", + host_hints=["VOE"], + ) + ], + ) + ], + ), + ) + + with Session(engine) as session: + episodes = session.exec(select(ProviderCatalogEpisode)).all() + state = session.get(ProviderTitleIndexState, ("aniworld.to", "demo")) + + assert len(episodes) == 1 + assert state is not None + assert state.detail_status == "ready" + + +def test_run_row_stage_does_not_mark_ready_when_only_future_retries_remain( + monkeypatch, +): + from app.catalog.indexer import ProviderCatalogIndexer + + indexer = ProviderCatalogIndexer() + events: list[str] = [] + + monkeypatch.setattr(indexer, "_refresh_interval_hours", lambda provider: 24.0) + monkeypatch.setattr(indexer, "_visible_generation", lambda provider: "gen-1") + monkeypatch.setattr(indexer, "_count_visible_titles", lambda provider: 1) + monkeypatch.setattr(indexer, "_load_due_stage_rows", lambda **kwargs: []) + monkeypatch.setattr( + indexer, + "_count_remaining_stage_rows", + lambda session, **kwargs: 1, + ) + monkeypatch.setattr(indexer, "_set_progress", lambda *args, **kwargs: None) + monkeypatch.setattr( + indexer._writer, + "run", + lambda callback: 
callback(object()), + ) + monkeypatch.setattr( + indexer, + "_mark_stage_running", + lambda **kwargs: events.append("running"), + ) + monkeypatch.setattr( + indexer, + "_mark_stage_pending", + lambda session, **kwargs: events.append("pending"), + ) + monkeypatch.setattr( + indexer, + "_mark_stage_ready", + lambda *args, **kwargs: events.append("ready"), + ) + + indexer._run_row_stage( + provider="aniworld.to", + stage="detail_enrichment", + concurrency=1, + ) + + assert events == ["running", "pending"] + + +def test_mark_stage_pending_persists_earliest_row_retry(client): + from app.catalog.indexer import ProviderCatalogIndexer + from app.db import ( + engine, + get_provider_index_status, + replace_provider_catalog_title, + upsert_provider_index_status, + upsert_provider_title_index_state, + utcnow, + ) + from sqlmodel import Session + + retry_later = utcnow() + timedelta(hours=2) + retry_earlier = utcnow() + timedelta(minutes=30) + + with Session(engine) as session: + upsert_provider_index_status( + session, + provider="aniworld.to", + refresh_interval_hours=24.0, + status="partial", + latest_success_generation="gen-1", + current_generation="gen-1", + bootstrap_completed=True, + title_index_status="ready", + detail_enrichment_status="running", + ) + for slug, retry_after in ( + ("later", retry_later), + ("earlier", retry_earlier), + ): + replace_provider_catalog_title( + session, + provider="aniworld.to", + slug=slug, + title=slug, + media_type_hint="series", + relative_path=f"/anime/stream/{slug}", + indexed_generation="gen-1", + ) + upsert_provider_title_index_state( + session, + provider="aniworld.to", + slug=slug, + detail_status="pending", + detail_next_retry_after=retry_after, + ) + + ProviderCatalogIndexer()._mark_stage_pending( + session, + provider="aniworld.to", + stage="detail_enrichment", + generation="gen-1", + refresh_interval_hours=24.0, + ) + status = get_provider_index_status(session, provider="aniworld.to") + + assert status is not None + assert 
status.detail_enrichment_status == "pending" + assert status.detail_next_retry_after == retry_earlier diff --git a/apps/api/tests/unit/catalog/test_metadata.py b/apps/api/tests/unit/catalog/test_metadata.py new file mode 100644 index 00000000..c82d10eb --- /dev/null +++ b/apps/api/tests/unit/catalog/test_metadata.py @@ -0,0 +1,41 @@ +from __future__ import annotations + + +def test_ttl_lru_cache_evicts_oldest_entry(): + from app.catalog.metadata import TtlLruCache + + cache = TtlLruCache[str, int](max_entries=2, ttl_seconds=3600) + cache.set("a", 1) + cache.set("b", 2) + cache.set("c", 3) + + assert cache.get("a") is None + assert cache.get("b") == 2 + assert cache.get("c") == 3 + assert cache.size() == 2 + + +def test_canonical_cache_stats_are_bounded(monkeypatch): + import app.catalog.metadata as metadata + + search_cache = metadata.TtlLruCache[str, list[dict[str, object]]]( + max_entries=2, + ttl_seconds=3600, + ) + show_cache = metadata.TtlLruCache[int, dict[str, object]]( + max_entries=1, + ttl_seconds=3600, + ) + monkeypatch.setattr(metadata, "_search_cache", search_cache) + monkeypatch.setattr(metadata, "_show_cache", show_cache) + + search_cache.set("foo", [{"id": 1}]) + search_cache.set("bar", [{"id": 2}]) + search_cache.set("baz", [{"id": 3}]) + show_cache.set(1, {"id": 1}) + show_cache.set(2, {"id": 2}) + + assert metadata.canonical_cache_stats() == { + "search_entries": 2, + "show_entries": 1, + } diff --git a/apps/api/tests/unit/catalog/test_providers.py b/apps/api/tests/unit/catalog/test_providers.py new file mode 100644 index 00000000..10b8f0d8 --- /dev/null +++ b/apps/api/tests/unit/catalog/test_providers.py @@ -0,0 +1,103 @@ +import time +from types import SimpleNamespace + +from app.catalog.providers import ( + _fallback_title_record, + _parse_aniworld_season_rows, + _parse_sto_season_rows, + _run_with_timeout, +) + + +def test_parse_aniworld_season_rows_uses_season_html_only(): + season = SimpleNamespace( + season_number=1, + are_movies=False, + 
_html=""" + + + + + + + + + Deutsch Titel - English Title + + + + + + + + + + + + + + + + + """, + ) + + episodes = _parse_aniworld_season_rows(season) + + assert len(episodes) == 1 + episode = episodes[0] + assert episode.season == 1 + assert episode.episode == 1 + assert episode.relative_path == "/anime/stream/demo/staffel-1/episode-1" + assert episode.title_primary == "Deutsch Titel" + assert episode.title_secondary == "English Title" + assert episode.media_type_hint == "episode" + assert [item.language for item in episode.languages] == [ + "German Dub", + "German Sub", + ] + assert episode.languages[0].host_hints == ["Filemoon", "VOE"] + + +def test_parse_sto_season_rows_extracts_episode_links_without_episode_pages(): + season = SimpleNamespace( + season_number=2, + _html=""" + Episode 1 + Episode 2 + Episode 2 duplicate + """, + ) + + episodes = _parse_sto_season_rows(season) + + assert [(item.season, item.episode, item.relative_path) for item in episodes] == [ + (2, 1, "/serie/demo/staffel-2/episode-1"), + (2, 2, "/serie/demo/staffel-2/episode-2"), + ] + assert all(item.languages == [] for item in episodes) + + +def test_run_with_timeout_raises_for_hung_title_crawl(): + def slow() -> object: + time.sleep(0.05) + return object() + + try: + _run_with_timeout(0.01, slow) + assert False, "expected timeout" + except TimeoutError as exc: + assert "title crawl exceeded" in str(exc) + + +def test_fallback_title_record_uses_provider_relative_path(): + record = _fallback_title_record( + provider_key="s.to", + slug="demo-show", + title="Demo Show", + aliases=["Demo Show"], + ) + + assert record.relative_path == "/serie/demo-show" + assert record.title == "Demo Show" + assert record.episodes == [] diff --git a/apps/api/tests/unit/core/downloader/test_provider_resolution.py b/apps/api/tests/unit/core/downloader/test_provider_resolution.py new file mode 100644 index 00000000..048c5a60 --- /dev/null +++ b/apps/api/tests/unit/core/downloader/test_provider_resolution.py @@ 
-0,0 +1,141 @@ +import time + +import pytest + + +def test_try_get_direct_times_out_and_returns_none(monkeypatch): + import app.core.downloader.provider_resolution as provider_resolution + + class SlowEpisode: + def get_direct_link(self, provider_name: str, language: str) -> str: + time.sleep(0.2) + return f"{provider_name}:{language}" + + monkeypatch.setattr( + provider_resolution, + "PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", + 0.05, + ) + + started_at = time.monotonic() + result = provider_resolution._try_get_direct( + SlowEpisode(), + "VOE", + "German Dub", + ) + + assert result is None + assert time.monotonic() - started_at < 0.15 + + +def test_try_get_direct_only_skips_same_provider_while_worker_is_running(monkeypatch): + import app.core.downloader.provider_resolution as provider_resolution + + class SlowEpisode: + def __init__(self) -> None: + self.calls: list[str] = [] + + def get_direct_link(self, provider_name: str, language: str) -> str: + self.calls.append(provider_name) + if provider_name == "VOE": + time.sleep(0.2) + return f"{provider_name}:{language}" + + monkeypatch.setattr( + provider_resolution, + "PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", + 0.05, + ) + + episode = SlowEpisode() + + assert provider_resolution._try_get_direct(episode, "VOE", "German Dub") is None + assert provider_resolution._try_get_direct(episode, "VOE", "German Dub") is None + assert ( + provider_resolution._try_get_direct(episode, "Doodstream", "German Dub") + == "Doodstream:German Dub" + ) + assert episode.calls == ["VOE", "Doodstream"] + + +def test_try_get_direct_handles_episode_without_weakref_support(monkeypatch): + import app.core.downloader.provider_resolution as provider_resolution + + class SlottedEpisode: + __slots__ = ("calls",) + + def __init__(self) -> None: + self.calls: list[str] = [] + + def get_direct_link(self, provider_name: str, language: str) -> str: + self.calls.append(provider_name) + if provider_name == "VOE": + time.sleep(0.2) + return 
f"{provider_name}:{language}" + + monkeypatch.setattr( + provider_resolution, + "PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", + 0.05, + ) + + episode = SlottedEpisode() + + assert provider_resolution._try_get_direct(episode, "VOE", "German Dub") is None + assert provider_resolution._try_get_direct(episode, "VOE", "German Dub") is None + assert ( + provider_resolution._try_get_direct(episode, "Doodstream", "German Dub") + == "Doodstream:German Dub" + ) + assert episode.calls == ["VOE", "Doodstream"] + + +def test_get_direct_url_with_fallback_continues_after_timeout(monkeypatch): + import app.core.downloader.provider_resolution as provider_resolution + + class Episode: + def get_direct_link(self, provider_name: str, language: str) -> str: + if provider_name == "VOE": + time.sleep(0.2) + return f"{provider_name}:{language}" + + monkeypatch.setattr( + provider_resolution, + "PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", + 0.05, + ) + monkeypatch.setattr(provider_resolution, "PROVIDER_ORDER", ["VOE", "Doodstream"]) + + assert provider_resolution.get_direct_url_with_fallback( + Episode(), + preferred=None, + language="German Dub", + ) == ("Doodstream:German Dub", "Doodstream") + + +def test_try_get_direct_raises_for_missing_language(monkeypatch): + import app.core.downloader.provider_resolution as provider_resolution + from app.core.downloader.errors import LanguageUnavailableError + + class MissingLanguageEpisode: + def get_direct_link(self, provider_name: str, language: str) -> str: + del provider_name, language + raise ValueError( + "No provider found for language 'German Dub'. 
" + "Available languages: ['English Sub', 'German Sub']" + ) + + monkeypatch.setattr( + provider_resolution, + "PROVIDER_DIRECT_LINK_TIMEOUT_SECONDS", + 0.05, + ) + + with pytest.raises(LanguageUnavailableError) as exc_info: + provider_resolution._try_get_direct( + MissingLanguageEpisode(), + "VOE", + "German Dub", + ) + + assert exc_info.value.available == ["English Sub", "German Sub"] diff --git a/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py b/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py index 263e9312..aef8ea81 100644 --- a/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py +++ b/apps/api/tests/unit/core/scheduler/test_strm_scheduler.py @@ -1,5 +1,7 @@ import errno import threading +import time +from concurrent.futures import Future from pathlib import Path from sqlmodel import Session @@ -212,3 +214,142 @@ def test_run_strm_creates_proxy_url(tmp_path, monkeypatch): assert mapping is not None assert mapping.resolved_url == "https://example.com/video.mp4" assert mapping.provider_used == "VOE" + + +def test_progress_updater_coalesces_bursty_db_writes(tmp_path, monkeypatch): + scheduler = _setup_scheduler(tmp_path, monkeypatch, strm_proxy_mode="direct") + monkeypatch.setattr(scheduler, "JOB_PROGRESS_FLUSH_SECONDS", 60.0) + + writes: list[dict[str, object]] = [] + + class FakeSession: + def __enter__(self): + return object() + + def __exit__(self, exc_type, exc, tb): + return False + + class FakeReporter: + def __init__(self, label: str): + self.label = label + + def update(self, snapshot): + del snapshot + + def close(self): + return None + + monkeypatch.setattr(scheduler, "Session", lambda _engine: FakeSession()) + monkeypatch.setattr(scheduler, "ProgressReporter", FakeReporter) + monkeypatch.setattr( + scheduler, + "update_job", + lambda _session, job_id, **fields: writes.append({"job_id": job_id, **fields}), + ) + + callback, writer = scheduler._progress_updater("job-1", threading.Event()) + callback( + { + "status": 
"downloading", + "downloaded_bytes": 1024, + "total_bytes": 10_000, + "speed": 1000, + } + ) + callback( + { + "status": "downloading", + "downloaded_bytes": 2048, + "total_bytes": 10_000, + "speed": 2000, + "eta": 5, + } + ) + writer.close(flush=True) + + assert len(writes) == 1 + assert writes[0]["job_id"] == "job-1" + assert writes[0]["downloaded_bytes"] == 2048 + assert writes[0]["total_bytes"] == 10_000 + assert writes[0]["speed"] == 2000.0 + assert writes[0]["eta"] == 5 + + +def test_progress_updater_flushes_without_final_close(tmp_path, monkeypatch): + scheduler = _setup_scheduler(tmp_path, monkeypatch, strm_proxy_mode="direct") + monkeypatch.setattr(scheduler, "JOB_PROGRESS_FLUSH_SECONDS", 0.01) + + writes: list[dict[str, object]] = [] + + class FakeSession: + def __enter__(self): + return object() + + def __exit__(self, exc_type, exc, tb): + return False + + class FakeReporter: + def __init__(self, label: str): + self.label = label + + def update(self, snapshot): + del snapshot + + def close(self): + return None + + monkeypatch.setattr(scheduler, "Session", lambda _engine: FakeSession()) + monkeypatch.setattr(scheduler, "ProgressReporter", FakeReporter) + monkeypatch.setattr( + scheduler, + "update_job", + lambda _session, job_id, **fields: writes.append({"job_id": job_id, **fields}), + ) + + callback, writer = scheduler._progress_updater("job-2", threading.Event()) + callback( + { + "status": "downloading", + "downloaded_bytes": 5000, + "total_bytes": 10_000, + "speed": 4000, + "eta": 1, + } + ) + writer.close(flush=False) + deadline = time.monotonic() + 0.5 + while time.monotonic() < deadline: + if writes: + break + time.sleep(0.01) + + assert writes == [] + + +def test_start_scheduled_job_cleans_up_fast_finishing_runner(tmp_path, monkeypatch): + scheduler = _setup_scheduler(tmp_path, monkeypatch, strm_proxy_mode="direct") + + class ImmediateExecutor: + def submit(self, runner, job_id, req, stop_event): + fut = Future() + runner(job_id, req, stop_event) 
+ fut.set_result(None) + return fut + + def fake_runner(job_id, req, stop_event): + del req, stop_event + with scheduler.RUNNING_LOCK: + assert job_id in scheduler.RUNNING + scheduler.RUNNING.pop(job_id, None) + + with scheduler.RUNNING_LOCK: + scheduler.RUNNING.clear() + + monkeypatch.setattr(scheduler, "init_executor", lambda: None) + monkeypatch.setattr(scheduler, "EXECUTOR", ImmediateExecutor()) + monkeypatch.setattr(scheduler, "_run_download", fake_runner) + + scheduler.start_scheduled_job("job-fast", {}) + + with scheduler.RUNNING_LOCK: + assert "job-fast" not in scheduler.RUNNING diff --git a/apps/api/tests/unit/db/test_migrations.py b/apps/api/tests/unit/db/test_migrations.py index c6a67ae3..e54d960a 100644 --- a/apps/api/tests/unit/db/test_migrations.py +++ b/apps/api/tests/unit/db/test_migrations.py @@ -1,10 +1,26 @@ from __future__ import annotations from pathlib import Path +from alembic.config import Config +from alembic.script import ScriptDirectory from sqlalchemy import inspect -HEAD_REVISION = "20260204_0003" +def _head_revision() -> str: + config = Config( + str( + Path(__file__).resolve().parents[3] + / "app" + / "db" + / "migrations" + / "alembic.ini" + ) + ) + config.set_main_option( + "script_location", + str(Path(__file__).resolve().parents[3] / "app" / "db" / "migrations"), + ) + return ScriptDirectory.from_config(config).get_current_head() def _load_db(tmp_path: Path, monkeypatch): @@ -62,7 +78,7 @@ def test_apply_migrations_fresh_db(tmp_path, monkeypatch): assert "clienttask" in tables assert "strmurlmapping" in tables assert "alembic_version" in tables - assert _get_version(models) == HEAD_REVISION + assert _get_version(models) == _head_revision() def test_apply_migrations_legacy_db(tmp_path, monkeypatch): @@ -74,7 +90,7 @@ def test_apply_migrations_legacy_db(tmp_path, monkeypatch): inspector = inspect(models.engine) tables = set(inspector.get_table_names()) assert "alembic_version" in tables - assert _get_version(models) == HEAD_REVISION 
+ assert _get_version(models) == _head_revision() def test_apply_migrations_empty_version_table(tmp_path, monkeypatch): @@ -97,4 +113,4 @@ def test_apply_migrations_empty_version_table(tmp_path, monkeypatch): assert "job" in tables assert "strmurlmapping" in tables assert "alembic_version" in tables - assert _get_version(models) == HEAD_REVISION + assert _get_version(models) == _head_revision() diff --git a/apps/api/tests/unit/db/test_models.py b/apps/api/tests/unit/db/test_models.py index 0b4a491e..7ab7bd24 100644 --- a/apps/api/tests/unit/db/test_models.py +++ b/apps/api/tests/unit/db/test_models.py @@ -136,7 +136,161 @@ def test_availability_and_clienttask_crud(client): category="anime", job_id="job-1", state="downloading", + provider="VOE", + mode="strm", ) - assert get_client_task(s, "abc") + upsert_client_task( + s, + hash="abc", + name="Updated Name", + slug="slug", + season=1, + episode=1, + language="German Dub", + save_path="/tmp", + category="anime", + job_id="job-1", + state="downloading", + ) + task = get_client_task(s, "abc") + assert task is not None + assert task.provider == "VOE" + assert task.mode == "strm" delete_client_task(s, "abc") assert get_client_task(s, "abc") is None + + +def test_replace_canonical_episodes_dedupes_duplicate_numbers(client): + from sqlmodel import Session, select + from app.db import ( + CanonicalEpisode, + engine, + replace_canonical_episodes, + upsert_canonical_series, + ) + + with Session(engine) as s: + upsert_canonical_series( + s, + tvdb_id=12345, + title="Demo Show", + aliases=["Demo Show"], + ) + replace_canonical_episodes( + s, + tvdb_id=12345, + episodes=[ + {"season": 1, "episode": 1, "title": "Pilot"}, + {"season": 1, "episode": 1, "title": "Pilot Duplicate"}, + {"season": 1, "episode": 2, "title": "Second"}, + ], + ) + s.commit() + + rows = s.exec( + select(CanonicalEpisode).where(CanonicalEpisode.tvdb_id == 12345) + ).all() + + assert len(rows) == 2 + assert {(row.season, row.episode) for row in rows} == 
{(1, 1), (1, 2)} + + +def test_upsert_canonical_series_keeps_aliases_when_omitted(client): + from sqlmodel import Session, select + from app.db import CanonicalSeriesAlias, engine, upsert_canonical_series + + with Session(engine) as s: + upsert_canonical_series( + s, + tvdb_id=999, + title="Demo Show", + aliases=["Demo Alias"], + ) + upsert_canonical_series( + s, + tvdb_id=999, + title="Demo Show Renamed", + aliases=None, + ) + s.commit() + + aliases = s.exec( + select(CanonicalSeriesAlias).where(CanonicalSeriesAlias.tvdb_id == 999) + ).all() + + assert [alias.alias for alias in aliases] == ["Demo Alias"] + + +def test_replace_provider_catalog_title_keeps_live_generation_intact(client): + from sqlmodel import Session, select + from app.db import ProviderCatalogTitle, engine, replace_provider_catalog_title + + with Session(engine) as s: + replace_provider_catalog_title( + s, + provider="aniworld.to", + slug="demo-show", + title="Demo Show", + media_type_hint="series", + relative_path="/anime/stream/demo-show", + indexed_generation="gen-live", + ) + replace_provider_catalog_title( + s, + provider="aniworld.to", + slug="demo-show", + title="Demo Show Updated", + media_type_hint="series", + relative_path="/anime/stream/demo-show-updated", + indexed_generation="gen-staged", + ) + s.commit() + + rows = s.exec( + select(ProviderCatalogTitle).where( + ProviderCatalogTitle.provider == "aniworld.to" + ) + ).all() + + assert {(row.slug, row.indexed_generation, row.title) for row in rows} == { + ("demo-show", "gen-live", "Demo Show"), + ("demo-show", "gen-staged", "Demo Show Updated"), + } + + +def test_replace_provider_catalog_episodes_dedupes_languages_and_host_hints(client): + from sqlmodel import Session, select + from app.db import ( + ProviderEpisodeLanguage, + engine, + replace_provider_catalog_episodes, + ) + + with Session(engine) as s: + replace_provider_catalog_episodes( + s, + provider="aniworld.to", + slug="demo-show", + indexed_generation="gen-1", + episodes=[ + 
{ + "season": 1, + "episode": 1, + "relative_path": "/anime/stream/demo-show/staffel-1/episode-1", + "title_primary": "Pilot", + "title_secondary": None, + "media_type_hint": "episode", + "languages": [ + {"language": "German Dub", "host_hints": ["VOE", "VOE"]}, + {"language": "German Dub", "host_hints": ["Filemoon"]}, + ], + } + ], + ) + s.commit() + + rows = s.exec(select(ProviderEpisodeLanguage)).all() + + assert len(rows) == 1 + assert rows[0].language == "German Dub" + assert rows[0].host_hints == ["Filemoon", "VOE"] diff --git a/apps/api/tests/unit/providers/megakino/test_domain_resolver.py b/apps/api/tests/unit/providers/megakino/test_domain_resolver.py index ec6094ed..4026f5d2 100644 --- a/apps/api/tests/unit/providers/megakino/test_domain_resolver.py +++ b/apps/api/tests/unit/providers/megakino/test_domain_resolver.py @@ -58,7 +58,7 @@ def fake_get(url, *, timeout=0, allow_redirects=True, headers=None): "MEGAKINO_DOMAIN_CANDIDATES", ["first.example", "second.example"], ) - monkeypatch.setattr(domain_resolver, "http_get", fake_get) + monkeypatch.setattr(domain_resolver.requests, "get", fake_get) domains = domain_resolver.fetch_megakino_mirror_domains(timeout=1) assert domains == ["megakino1.to", "megakino1.fit"] @@ -86,25 +86,23 @@ def fake_get(url, *, timeout=0, allow_redirects=True, headers=None): "MEGAKINO_DOMAIN_CANDIDATES", ["first.example"], ) - monkeypatch.setattr(domain_resolver, "http_get", fake_get) + monkeypatch.setattr(domain_resolver.requests, "get", fake_get) domains = domain_resolver.fetch_megakino_mirror_domains(timeout=1) assert domains == ["megakino1.org"] assert any("/sitemap.xml" in call for call in calls) -def test_fetch_megakino_domain_dedupes_and_prefers_candidate_order(monkeypatch): +def test_fetch_megakino_domain_prefers_seed_order(monkeypatch): probed: list[str] = [] monkeypatch.setattr( - domain_resolver, - "MEGAKINO_DOMAIN_CANDIDATES", - ["second.example", "third.example"], + domain_resolver, "MEGAKINO_REDIRECT_SEEDS", 
["first.example", "second.example"] ) monkeypatch.setattr( domain_resolver, - "fetch_megakino_mirror_domains", - lambda timeout=0, **kwargs: ["first.example", "second.example"], + "_fetch_github_domain_hint", + lambda timeout=0: "third.example", ) def fake_probe(base_url: str, timeout=0): @@ -118,19 +116,18 @@ def fake_probe(base_url: str, timeout=0): resolved = domain_resolver.fetch_megakino_domain(timeout=1) assert resolved == "second.example" + assert probed == ["first.example", "second.example"] assert probed.count("second.example") == 1 def test_fetch_megakino_domain_returns_none_when_all_candidates_fail(monkeypatch): monkeypatch.setattr( - domain_resolver, - "MEGAKINO_DOMAIN_CANDIDATES", - ["first.example", "second.example"], + domain_resolver, "MEGAKINO_REDIRECT_SEEDS", ["first.example", "second.example"] ) monkeypatch.setattr( domain_resolver, - "fetch_megakino_mirror_domains", - lambda timeout=0, **kwargs: [], + "_fetch_github_domain_hint", + lambda timeout=0: None, ) monkeypatch.setattr( domain_resolver, "_probe_megakino_sitemap", lambda *a, **k: None @@ -145,7 +142,7 @@ def fake_get(*args, **kwargs): time.sleep(1.0) raise RuntimeError("late failure") - monkeypatch.setattr(domain_resolver, "http_get", fake_get) + monkeypatch.setattr(domain_resolver.requests, "get", fake_get) started = time.monotonic() try: @@ -162,20 +159,25 @@ def fake_get(*args, **kwargs): assert elapsed < 0.5 -def test_fetch_megakino_domain_disables_mirror_sitemap_fallback(monkeypatch): - seen_kwargs: dict[str, object] = {} - - def fake_mirrors(timeout=0, **kwargs): - seen_kwargs.update(kwargs) - return [] +def test_fetch_megakino_domain_uses_github_hint_after_seed_failures(monkeypatch): + probed: list[str] = [] + monkeypatch.setattr(domain_resolver, "MEGAKINO_REDIRECT_SEEDS", ["first.example"]) monkeypatch.setattr( domain_resolver, - "MEGAKINO_DOMAIN_CANDIDATES", - [], + "_fetch_github_domain_hint", + lambda timeout=0: "hint.example", ) - monkeypatch.setattr(domain_resolver, 
"fetch_megakino_mirror_domains", fake_mirrors) + + def fake_probe(base_url: str, timeout=0): + domain = domain_resolver._normalize_domain(base_url) + probed.append(domain) + if domain == "hint.example": + return "hint.example" + return None + + monkeypatch.setattr(domain_resolver, "_probe_megakino_sitemap", fake_probe) resolved = domain_resolver.fetch_megakino_domain(timeout=1) - assert resolved is None - assert seen_kwargs.get("include_sitemap_fallback") is False + assert resolved == "hint.example" + assert probed == ["first.example", "hint.example"] diff --git a/apps/api/tests/unit/utils/title_resolver/test_file_index.py b/apps/api/tests/unit/utils/title_resolver/test_file_index.py index 804ca7c5..171a04ca 100644 --- a/apps/api/tests/unit/utils/title_resolver/test_file_index.py +++ b/apps/api/tests/unit/utils/title_resolver/test_file_index.py @@ -21,3 +21,44 @@ def test_load_index_from_file(tmp_path, monkeypatch): assert resolve_series_title("slug-one") == "Title One" assert resolve_series_title(None) is None + + +def test_resolve_series_title_falls_back_when_index_db_is_unavailable(monkeypatch): + import app.utils.title_resolver as tr + from sqlalchemy.exc import OperationalError + + monkeypatch.setattr( + tr, + "Session", + lambda _engine: (_ for _ in ()).throw( + OperationalError("stmt", {}, Exception("db down")) + ), + ) + monkeypatch.setattr( + tr, + "load_or_refresh_index", + lambda site="aniworld.to": {"slug-one": "Title One"}, + ) + + assert tr.resolve_series_title("slug-one") == "Title One" + + +def test_load_alternatives_falls_back_when_index_db_is_unavailable(monkeypatch): + import app.utils.title_resolver as tr + from sqlalchemy.exc import OperationalError + + monkeypatch.setattr(tr, "get_catalog_readiness_error", lambda: None) + monkeypatch.setattr( + tr, + "Session", + lambda _engine: (_ for _ in ()).throw( + OperationalError("stmt", {}, Exception("db down")) + ), + ) + monkeypatch.setattr( + tr, "_cached_alts", {"aniworld.to": {"slug-one": ["Title 
One"]}} + ) + monkeypatch.setattr(tr, "_get_site_cfg", lambda site: {}) + monkeypatch.setattr(tr, "_should_refresh", lambda *args, **kwargs: False) + + assert tr.load_or_refresh_alternatives() == {"slug-one": ["Title One"]} diff --git a/apps/api/tests/unit/utils/title_resolver/test_sto.py b/apps/api/tests/unit/utils/title_resolver/test_sto.py index 7bcf0f41..a7195c8e 100644 --- a/apps/api/tests/unit/utils/title_resolver/test_sto.py +++ b/apps/api/tests/unit/utils/title_resolver/test_sto.py @@ -1,6 +1,8 @@ from urllib.parse import parse_qs, urlparse +from types import SimpleNamespace import requests +from sqlmodel import Session import app.utils.title_resolver as tr @@ -93,7 +95,7 @@ def test_slug_from_query_prefers_precise_title_over_shared_token( monkeypatch.setattr(tr, "load_or_refresh_alternatives", lambda _site: {}) monkeypatch.setattr(tr, "_search_sto_slug", lambda _query: None) - assert tr.slug_from_query("Rookie Le flic de Los Angeles") == ( + assert tr.slug_from_query("Rookie Le flic de Los Angeles", site="s.to") == ( "s.to", "the-rookie", ) @@ -114,5 +116,80 @@ def test_slug_from_query_rejects_low_confidence_overlap(monkeypatch) -> None: ) monkeypatch.setattr(tr, "load_or_refresh_alternatives", lambda _site: {}) monkeypatch.setattr(tr, "_search_sto_slug", lambda _query: None) + monkeypatch.setattr(tr, "get_catalog_readiness_error", lambda: "not-ready") assert tr.slug_from_query("Rookie Le flic de Los Angeles") is None + + +def test_slug_from_query_accepts_db_alias_match(monkeypatch) -> None: + from app.db import ( + apply_migrations, + engine, + replace_provider_catalog_aliases, + replace_provider_catalog_title, + upsert_provider_index_status, + ) + + monkeypatch.setattr(tr, "CATALOG_SITES_LIST", ["s.to"]) + monkeypatch.setattr(tr, "load_or_refresh_index", lambda _site: {}) + monkeypatch.setattr(tr, "load_or_refresh_alternatives", lambda _site: {}) + monkeypatch.setattr(tr, "_search_sto_slug", lambda _query: None) + monkeypatch.setattr(tr, 
"get_catalog_readiness_error", lambda: None) + monkeypatch.setattr(tr, "engine", engine) + apply_migrations() + + with Session(engine) as session: + upsert_provider_index_status( + session, + provider="s.to", + refresh_interval_hours=24.0, + status="ready", + current_generation="gen-alias", + latest_success_generation="gen-alias", + bootstrap_completed=True, + ) + replace_provider_catalog_title( + session, + provider="s.to", + slug="the-rookie", + title="The Rookie", + media_type_hint="series", + relative_path="/serie/the-rookie", + indexed_generation="gen-alias", + ) + replace_provider_catalog_aliases( + session, + provider="s.to", + slug="the-rookie", + aliases=["Rookie Le flic de Los Angeles"], + indexed_generation="gen-alias", + ) + session.commit() + + assert tr.slug_from_query("Rookie Le flic de Los Angeles") == ( + "s.to", + "the-rookie", + ) + + +def test_site_scoped_db_lookup_checks_later_candidates(monkeypatch) -> None: + first = SimpleNamespace(provider="s.to", slug="weak-match") + second = SimpleNamespace(provider="s.to", slug="strong-match") + seen_limits: list[int] = [] + + def fake_search(session, *, query, providers, limit, **kwargs): + del session, query, providers, kwargs + seen_limits.append(limit) + return [first, second] + + def fake_score(session, *, query, candidate): + del session, query + return 0.1 if candidate is first else tr._MIN_TITLE_MATCH_SCORE + + monkeypatch.setattr(tr, "load_or_refresh_index", lambda _site: {}) + monkeypatch.setattr(tr, "load_or_refresh_alternatives", lambda _site: {}) + monkeypatch.setattr(tr, "search_indexed_provider_titles", fake_search) + monkeypatch.setattr(tr, "_score_indexed_db_candidate", fake_score) + + assert tr.slug_from_query("Strong Match", site="s.to") == ("s.to", "strong-match") + assert seen_limits == [5] diff --git a/docker/compose.dev.yaml b/docker/compose.dev.yaml index 4e296fc4..1249c654 100644 --- a/docker/compose.dev.yaml +++ b/docker/compose.dev.yaml @@ -15,6 +15,7 @@ services: - ../.env 
environment: - ANIBRIDGE_RELOAD=true + - ANIBRIDGE_PORT=8000 - DATA_DIR=/data - DOWNLOAD_DIR=/downloads volumes: diff --git a/docker/compose.yaml b/docker/compose.yaml index 7ccb113e..36385688 100644 --- a/docker/compose.yaml +++ b/docker/compose.yaml @@ -4,7 +4,7 @@ services: image: ghcr.io/zzackllack/anibridge:latest container_name: anibridge ports: - - "8000:8000" + - "8000:${ANIBRIDGE_PORT:-8000}" environment: # Container runtime / permissions - PUID=${PUID:-1000} @@ -30,7 +30,10 @@ services: - ../data:/data healthcheck: test: - ["CMD", "curl", "--fail", "--silent", "http://localhost:8000/health"] + [ + "CMD-SHELL", + "curl --fail --silent \"http://localhost:${ANIBRIDGE_PORT:-8000}/health\" >/dev/null", + ] interval: 30s timeout: 5s retries: 3 diff --git a/internal/agents/api.md b/internal/agents/api.md index 46f677d5..2b53298c 100644 --- a/internal/agents/api.md +++ b/internal/agents/api.md @@ -8,7 +8,9 @@ ## Health Endpoint (`/health`) - Method: GET -- Response: JSON with `status`, `database`, `scheduler`, `download_dir`, `version`, `runtime` +- Response: JSON with `status` and `catalog` +- `catalog` includes bootstrap readiness plus per-provider crawl/persist counters, + queue depth, writer lag, current slug, and staging-vs-live generation state. ## Torznab Namespace (`/torznab/api`) @@ -29,7 +31,9 @@ - Auth: `/auth/login`, `/auth/logout` set `SID` cookie `anibridge`. - Categories: `/torrents/categories` returns configured categories (default `AniBridge`). -- Torrents: `/torrents/add`, `/torrents/delete`, `/torrents/info` mimic qBittorrent responses. +- Torrents: `/torrents/add`, `/torrents/resume`, `/torrents/delete`, and + `/torrents/info` mimic qBittorrent responses. Paused adds retain the provider + and mode metadata required to start the original request when resumed. - Sync: `/sync/maindata` exposes job states for Sonarr integration. - Transfer: `/transfer/info`, `/transfer/speedLimitsMode`, etc., return safe defaults. 
- Deletion endpoint optionally removes files when `DELETE_FILES_ON_TORRENT_DELETE` is true. diff --git a/internal/agents/architecture.md b/internal/agents/architecture.md index 08358acc..39cbd9a9 100644 --- a/internal/agents/architecture.md +++ b/internal/agents/architecture.md @@ -45,6 +45,10 @@ ## Scheduler & Background Services - Thread pool size controlled by `MAX_CONCURRENCY` (default 3). +- Provider catalog indexing now runs as a staged streaming pipeline: + crawler workers emit completed titles into a bounded queue, one writer thread + persists batches into SQLite, and generation promotion only happens after the + full refresh succeeds. - Cleanup thread deletes downloads older than `DOWNLOADS_TTL_HOURS`. - Public IP monitor runs only when `PUBLIC_IP_CHECK_ENABLED=true`. - Lifespan ensures graceful shutdown of scheduler, DB engine, and background threads. @@ -53,5 +57,7 @@ - Loguru configuration lives in `apps/api/app/utils/logger.py`. - `TerminalLogger` duplicates stdout/stderr to `data/terminal-YYYY-MM-DD.log`. -- `/health` endpoint provides liveness/readiness details. +- `/health` and `/health/catalog` expose provider bootstrap state plus crawl, + persistence, queue-depth, and staging-generation progress for the catalog + indexer. - Update notifier logs when new GitHub releases are available. diff --git a/internal/agents/configuration.md b/internal/agents/configuration.md index 295d004f..1fad19e7 100644 --- a/internal/agents/configuration.md +++ b/internal/agents/configuration.md @@ -11,6 +11,8 @@ AniBridge centralizes configuration in `apps/api/app/config.py`. 
Values are deri `PROVIDER_REDIRECT_RETRIES`, `PROVIDER_CHALLENGE_BACKOFF_SECONDS`, `MAX_CONCURRENCY`, `DOWNLOAD_RATE_LIMIT_BYTES_PER_SEC`, `DOWNLOADS_TTL_HOURS`, `CLEANUP_SCAN_INTERVAL_MIN` +- Provider catalog index: `PROVIDER_INDEX_*` refresh cadence, queue bounds, + writer batching, and failure-threshold controls for staged catalog refreshes - STRM: `STRM_FILES_MODE`, `STRM_PROXY_*` - Networking policy: external VPN/VPN-sidecar routing only + `PUBLIC_IP_CHECK_*` - Video-host order default: `VOE,Filemoon,Streamtape,Vidmoly,Doodstream,LoadX,Luluvdo,Vidoza` via `PROVIDER_ORDER`, mapped at runtime to `VIDEO_HOST_ORDER` @@ -46,59 +48,74 @@ AniBridge centralizes configuration in `apps/api/app/config.py`. Values are deri 17. `MEGAKINO_TITLES_REFRESH_HOURS` — Megakino refresh interval. 18. `MEGAKINO_DOMAIN_CHECK_INTERVAL_MIN` — Megakino domain checker interval. 19. `CATALOG_SITES` — Enabled catalogue sites. -20. `SOURCE_TAG` — Release source tag (default `WEB`). -21. `RELEASE_GROUP` — Release group label (default `aniworld`). -22. `RELEASE_GROUP_ANIWORLD` — AniWorld release group override. -23. `RELEASE_GROUP_STO` — s.to release group override. -24. `PROVIDER_ORDER` — Comma-separated video-host priority input; mapped at runtime to `VIDEO_HOST_ORDER`. -25. `PROVIDER_REDIRECT_TIMEOUT_SECONDS` — Timeout for resolving catalogue redirect tokens into video-host URLs (default `12`). -26. `PROVIDER_REDIRECT_RETRIES` — Extra retry attempts for transient video-host redirect failures (default `2`). -27. `PROVIDER_CHALLENGE_BACKOFF_SECONDS` — Base cool-down for Turnstile challenge retries (default `300`). -28. `MAX_CONCURRENCY` — Thread pool size (default `3`). -29. `DOWNLOAD_RATE_LIMIT_BYTES_PER_SEC` — Per-download yt-dlp rate cap (`0` disables). -30. `INDEXER_NAME` — Torznab display name (default `AniBridge Torznab`). -31. `INDEXER_API_KEY` — Optional Torznab API key. -32. `TORZNAB_CAT_ANIME` — Category mapping (default `5070`). -33. 
`TORZNAB_CAT_MOVIE` — Movie category mapping (default `2000`). -34. `AVAILABILITY_TTL_HOURS` — Availability cache TTL (default `24`). -35. `TORZNAB_FAKE_SEEDERS` — Seeders in results (default `999`). -36. `TORZNAB_FAKE_LEECHERS` — Leechers in results (default `787`). -37. `TORZNAB_RETURN_TEST_RESULT` — Return test item (default `true`). -38. `TORZNAB_TEST_TITLE` — Test item title. -39. `TORZNAB_TEST_SLUG` — Test item slug. -40. `TORZNAB_TEST_SEASON` — Test season number. -41. `TORZNAB_TEST_EPISODE` — Test episode number. -42. `TORZNAB_TEST_LANGUAGE` — Test language label. -43. `TORZNAB_SEASON_SEARCH_MODE` — Season-search execution mode (`fast`/`strict`, default `fast`). -44. `TORZNAB_SEASON_SEARCH_MAX_EPISODES` — Season-search fallback probe ceiling (default `60`). -45. `TORZNAB_SEASON_SEARCH_MAX_CONSECUTIVE_MISSES` — Season-search fallback stop threshold (default `3`). -46. `DELETE_FILES_ON_TORRENT_DELETE` — Remove files on delete (default `true`). -47. `DOWNLOADS_TTL_HOURS` — TTL cleanup threshold (default `0`, disabled). -48. `CLEANUP_SCAN_INTERVAL_MIN` — Cleanup interval (default `30`). -49. `STRM_FILES_MODE` — STRM mode (`no`, `both`, `only`, default `no`). -50. `STRM_PROXY_MODE` — STRM proxy mode (`direct`, `proxy`, `redirect`, default `direct`). -51. `STRM_PUBLIC_BASE_URL` — Public base URL for STRM proxy URLs. -52. `STRM_PROXY_AUTH` — STRM proxy auth mode (`none`, `token`, `apikey`). -53. `STRM_PROXY_SECRET` — Shared secret for STRM proxy auth. -54. `STRM_PROXY_UPSTREAM_ALLOWLIST` — Comma-separated upstream host allowlist. -55. `STRM_PROXY_CACHE_TTL_SECONDS` — STRM URL cache TTL in seconds (default `0`). -56. `STRM_PROXY_TOKEN_TTL_SECONDS` — STRM proxy token TTL in seconds (default `900`). -57. `PROGRESS_FORCE_BAR` — Force progress bar (default `false`). -58. `PROGRESS_STEP_PERCENT` — Progress logging step (default `5`). -59. `ANIBRIDGE_UPDATE_CHECK` — Enable release polling (default `true`). -60. `ANIBRIDGE_GITHUB_TOKEN` — GitHub API token. -61. 
`ANIBRIDGE_GITHUB_OWNER` — GitHub owner (default `zzackllack`). -62. `ANIBRIDGE_GITHUB_REPO` — Repo name (default `AniBridge`). -63. `ANIBRIDGE_GHCR_IMAGE` — GHCR image slug (default `zzackllack/anibridge`). -64. `PUBLIC_IP_CHECK_ENABLED` — Enable periodic public IP logging (default `false`). -65. `PUBLIC_IP_CHECK_INTERVAL_MIN` — Public IP check interval minutes (default `30`). -66. `ANIBRIDGE_HOST` — Bind host. -67. `ANIBRIDGE_PORT` — Bind port. -68. `ANIBRIDGE_CORS_ORIGINS` — CORS origins. -69. `ANIBRIDGE_CORS_ALLOW_CREDENTIALS` — CORS credentials behavior. -70. `ANIBRIDGE_TEST_MODE` — Test-mode runtime toggle. -71. `PYTHONUNBUFFERED` — Set to `1` in Docker to keep logs flush. -72. `SONARR_*`, `PROWLARR_*` — Integration values documented in `docs/src/integrations/clients`. +20. `PROVIDER_INDEX_REFRESH_HOURS` — Default staged provider-index refresh cadence (default `24`). +21. `PROVIDER_INDEX_REFRESH_HOURS_ANIWORLD` — AniWorld provider-index cadence override. +22. `PROVIDER_INDEX_REFRESH_HOURS_STO` — s.to provider-index cadence override. +23. `PROVIDER_INDEX_REFRESH_HOURS_MEGAKINO` — megakino provider-index cadence override. +24. `PROVIDER_INDEX_SCHEDULER_POLL_SECONDS` — Scheduler poll interval for due refreshes (default `60`). +25. `PROVIDER_INDEX_GLOBAL_CONCURRENCY` — Max concurrent provider refreshes (default `1`). +26. `PROVIDER_INDEX_CONCURRENCY_ANIWORLD` — AniWorld title crawl worker count. +27. `PROVIDER_INDEX_CONCURRENCY_STO` — s.to title crawl worker count. +28. `PROVIDER_INDEX_CONCURRENCY_MEGAKINO` — megakino crawl worker count. +29. `PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS` — Soft title crawl timeout threshold (default `45`). +30. `PROVIDER_INDEX_QUEUE_SIZE` — Bounded title-result queue depth between crawlers and the SQLite writer (default `8`). +31. `PROVIDER_INDEX_WRITER_BATCH_SIZE` — SQLite writer commit batch size (default `32`). +32. `PROVIDER_INDEX_WRITER_FLUSH_SECONDS` — Max wait before the writer flushes a partial batch (default `1.0`). +33.
`PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT` — Refresh abort threshold for failed title crawls (default `20`). +34. `PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS` — Minimum interval between repeated queue-backpressure logs (default `15`). +35. `SOURCE_TAG` — Release source tag (default `WEB`). +36. `RELEASE_GROUP` — Release group label (default `aniworld`). +37. `RELEASE_GROUP_ANIWORLD` — AniWorld release group override. +38. `RELEASE_GROUP_STO` — s.to release group override. +39. `PROVIDER_ORDER` — Comma-separated video-host priority input; mapped at runtime to `VIDEO_HOST_ORDER`. +40. `PROVIDER_REDIRECT_TIMEOUT_SECONDS` — Timeout for resolving catalogue redirect tokens into video-host URLs (default `12`). +41. `PROVIDER_REDIRECT_RETRIES` — Extra retry attempts for transient video-host redirect failures (default `2`). +42. `PROVIDER_CHALLENGE_BACKOFF_SECONDS` — Base cool-down for Turnstile challenge retries (default `300`). +43. `MAX_CONCURRENCY` — Thread pool size (default `3`). +44. `DOWNLOAD_RATE_LIMIT_BYTES_PER_SEC` — Per-download yt-dlp rate cap (`0` disables). +45. `INDEXER_NAME` — Torznab display name (default `AniBridge Torznab`). +46. `INDEXER_API_KEY` — Optional Torznab API key. +47. `TORZNAB_CAT_ANIME` — Category mapping (default `5070`). +48. `TORZNAB_CAT_MOVIE` — Movie category mapping (default `2000`). +49. `AVAILABILITY_TTL_HOURS` — Availability cache TTL (default `24`). +50. `TORZNAB_FAKE_SEEDERS` — Seeders in results (default `999`). +51. `TORZNAB_FAKE_LEECHERS` — Leechers in results (default `787`). +52. `TORZNAB_RETURN_TEST_RESULT` — Return test item (default `true`). +53. `TORZNAB_TEST_TITLE` — Test item title. +54. `TORZNAB_TEST_SLUG` — Test item slug. +55. `TORZNAB_TEST_SEASON` — Test season number. +56. `TORZNAB_TEST_EPISODE` — Test episode number. +57. `TORZNAB_TEST_LANGUAGE` — Test language label. +58. `TORZNAB_SEASON_SEARCH_MODE` — Season-search execution mode (`fast`/`strict`, default `fast`). +59. 
`TORZNAB_SEASON_SEARCH_MAX_EPISODES` — Season-search fallback probe ceiling (default `60`). +60. `TORZNAB_SEASON_SEARCH_MAX_CONSECUTIVE_MISSES` — Season-search fallback stop threshold (default `3`). +61. `DELETE_FILES_ON_TORRENT_DELETE` — Remove files on delete (default `true`). +62. `DOWNLOADS_TTL_HOURS` — TTL cleanup threshold (default `0`, disabled). +63. `CLEANUP_SCAN_INTERVAL_MIN` — Cleanup interval (default `30`). +64. `STRM_FILES_MODE` — STRM mode (`no`, `both`, `only`, default `no`). +65. `STRM_PROXY_MODE` — STRM proxy mode (`direct`, `proxy`, `redirect`, default `direct`). +66. `STRM_PUBLIC_BASE_URL` — Public base URL for STRM proxy URLs. +67. `STRM_PROXY_AUTH` — STRM proxy auth mode (`none`, `token`, `apikey`). +68. `STRM_PROXY_SECRET` — Shared secret for STRM proxy auth. +69. `STRM_PROXY_UPSTREAM_ALLOWLIST` — Comma-separated upstream host allowlist. +70. `STRM_PROXY_CACHE_TTL_SECONDS` — STRM URL cache TTL in seconds (default `0`). +71. `STRM_PROXY_TOKEN_TTL_SECONDS` — STRM proxy token TTL in seconds (default `900`). +72. `PROGRESS_FORCE_BAR` — Force progress bar (default `false`). +73. `PROGRESS_STEP_PERCENT` — Progress logging step (default `5`). +74. `ANIBRIDGE_UPDATE_CHECK` — Enable release polling (default `true`). +75. `ANIBRIDGE_GITHUB_TOKEN` — GitHub API token. +76. `ANIBRIDGE_GITHUB_OWNER` — GitHub owner (default `zzackllack`). +77. `ANIBRIDGE_GITHUB_REPO` — Repo name (default `AniBridge`). +78. `ANIBRIDGE_GHCR_IMAGE` — GHCR image slug (default `zzackllack/anibridge`). +79. `PUBLIC_IP_CHECK_ENABLED` — Enable periodic public IP logging (default `false`). +80. `PUBLIC_IP_CHECK_INTERVAL_MIN` — Public IP check interval minutes (default `30`). +81. `ANIBRIDGE_HOST` — Bind host. +82. `ANIBRIDGE_PORT` — Bind port. +83. `ANIBRIDGE_CORS_ORIGINS` — CORS origins. +84. `ANIBRIDGE_CORS_ALLOW_CREDENTIALS` — CORS credentials behavior. +85. `ANIBRIDGE_TEST_MODE` — Test-mode runtime toggle. +86. `PYTHONUNBUFFERED` — Set to `1` in Docker so logs are flushed immediately. +87.
`SONARR_*`, `PROWLARR_*` — Integration values documented in `docs/src/integrations/clients`. ## Removed Legacy Proxy Variables diff --git a/internal/specs/017-provider-catalog-index/00-scheduled-provider-indexing.md b/internal/specs/017-provider-catalog-index/00-scheduled-provider-indexing.md new file mode 100644 index 00000000..78c235e3 --- /dev/null +++ b/internal/specs/017-provider-catalog-index/00-scheduled-provider-indexing.md @@ -0,0 +1,195 @@ +# 017 Provider Catalog Index - Scheduled Provider Indexing + +## Goal + +Move provider discovery work out of the request path and into a scheduled indexing pipeline that builds and refreshes a local SQLite-backed catalog. + +The implementation must remain modular and provider-scoped so that changing one provider does not create cross-provider regressions. + +## Problem Statement + +Current search and `tvsearch` behavior still performs meaningful live work on demand: + +- provider title lookup may need a live or warm index refresh +- season discovery can fall back to sequential live probing +- episode language discovery can require provider page fetches +- availability and quality cache entries are populated reactively per request + +This keeps request latency variable and makes cold or broad searches expensive. + +## Proposed Direction + +Instead of discovering provider data only when Sonarr or Radarr asks for it, AniBridge should periodically walk each supported provider catalog and persist a local index of: + +- titles available on that provider +- provider-specific slugs and URL suffixes +- provider-specific season and episode structure +- provider-side language availability per episode +- provider-side movie entries for providers that expose them +- lightweight provider or host hints when they are available without deep resolution + +The request path should query SQLite first and should not perform request-time targeted refresh of missing titles, seasons, or episodes. 
+ +## Version One Scope + +Version one should index the full provider catalog surface that AniBridge can serve: + +- all enabled providers +- all series on those providers +- all episodes of those series +- all provider-exposed movies + +Each provider should have its own indexing implementation and refresh handling behind a shared orchestration contract. + +## Refresh Model + +Use a background refresh pipeline, not a blocking full recrawl during normal operation. + +### Startup behavior + +- app startup should initialize the refresh scheduler +- app startup should check whether the local provider index is missing or stale +- if no usable index exists yet, catalog-dependent interactions should fail until the first complete index build for all enabled providers completes +- blocked catalog-dependent requests should return a suitable HTTP error, include a short explanation, and log the reason clearly +- if an older index already exists, AniBridge should continue serving it while a replacement index is built in parallel + +### Recurring behavior + +- run a scheduled refresh every 24 hours by default +- make refresh cadence configurable through environment variables +- support provider-specific refresh intervals through environment variables +- do not perform request-time targeted refresh for one title, season, or episode + +## Scope of the Indexed Data + +Persist data that is relatively stable and expensive to recompute: + +- provider title lists +- provider slug mappings +- provider URL suffixes or paths +- provider season and episode existence +- provider language availability +- provider-to-canonical entity mappings +- last indexed timestamps and refresh status + +Do not persist data that is short-lived or operationally volatile as part of the main index: + +- final direct media URLs +- temporary redirect URLs +- host redirect chains +- full probe payloads +- large raw HTML responses +- response headers from resolved stream URLs + +These should remain short-TTL 
operational caches, not permanent catalog records. + +## Provider Crawl Strategy + +### Title discovery + +For each enabled provider: + +- load the provider-wide title index or search endpoint equivalent +- persist one row per provider title entry +- persist stable provider identifiers such as slug, URL suffix or path, and media type hints + +### Episode discovery + +For each provider title: + +- enumerate seasons and episodes from the provider structure +- persist provider-native season and episode coordinates +- persist provider language availability +- persist only lightweight host or provider hints when they are available without deep resolution +- do not resolve deeper into ephemeral redirect or direct stream URLs + +### Refresh checkpoints + +Track per-provider and per-title progress so the crawler can resume safely after restart: + +- last successful provider refresh timestamp +- last successful title refresh timestamp +- cursor or page state where applicable +- failure count and last error summary + +## Request Path Implications + +After this feature, the request path should prefer: + +1. local provider catalog index +2. local canonical mapping tables +3. short-TTL operational caches + +Expected effects: + +- lower `tvsearch` latency +- fewer provider page fetches on search +- fewer sequential live probes for season discovery +- more deterministic behavior under repeated Sonarr scans + +Misses caused by absent or unresolved index data should not trigger ad-hoc live re-indexing. + +## Anti-Bot and Rate-Limit Considerations + +Full indexing increases background traffic and must be bounded carefully. + +Required controls: + +- per-provider concurrency limits +- global crawler concurrency limits +- retry with backoff on `429`, `403`, and transient upstream failures +- refresh checkpoints to avoid restarting whole crawls + +The system should assume that provider anti-bot behavior may change over time. 
+ +## Storage Strategy + +SQLite growth is acceptable only if the data is normalized aggressively. + +Safe principles: + +- store canonical entities once +- store provider mappings separately +- store availability as compact, current-state rows +- avoid duplicating long text or JSON across episode rows +- store current state only, not history, unless a later feature requires it + +Optimize for performance and functionality first while keeping the database ideally under 1 GB and at most around 5 GB when justified. + +Potentially expensive data should be excluded from the permanent catalog unless there is a clear request-path need. + +## Refresh Semantics + +- refreshed index data should overwrite the previous current-state rows +- if a provider refresh fails, AniBridge may continue serving the older indexed data +- if a provider base URL changes, persisted provider-relative suffixes or paths should remain valid when reassembled against the current base URL +- if a title, episode, or movie was present in the previous successful index but is absent from the next successful refresh, AniBridge should delete that stale row during the refresh replacement process + +## Suggested Deliverables + +- new refresh scheduler surface for provider indexing +- persistent provider catalog tables and migrations +- refresh status and checkpoint tables +- operational metrics for index freshness, crawl duration, and fallback rate +- clear provider-specific indexing boundaries in the codebase + +## Non-Goals + +- replacing SQLite at this stage +- storing every transient stream resolution artifact permanently +- request-time targeted re-indexing of cache misses +- tightly coupling provider implementations to each other + +## Selected Decisions + +- all enabled providers should be indexed +- version one should index full provider coverage for series, episodes, and provider-exposed movies +- refresh cadence should default to 24 hours and remain configurable through environment variables, 
including provider-specific overrides +- if no index exists yet, catalog-dependent routes should fail until bootstrap indexing completes +- first bootstrap indexing should be complete and fully blocking before catalog-dependent routes become usable +- if an index exists but is stale, AniBridge should keep serving the old index while a new one is built in parallel +- request-time targeted refresh is explicitly out of scope +- indexing should include only non-ephemeral, non-probe-intensive data +- provider URLs should be stored as provider-relative suffixes or paths, not as full base URLs +- current-state data should be overwritten on refresh +- entries missing from the next successful refresh should be deleted at refresh time diff --git a/internal/specs/017-provider-catalog-index/01-normalization-and-id-mapping.md b/internal/specs/017-provider-catalog-index/01-normalization-and-id-mapping.md new file mode 100644 index 00000000..644cc746 --- /dev/null +++ b/internal/specs/017-provider-catalog-index/01-normalization-and-id-mapping.md @@ -0,0 +1,237 @@ +# 017 Provider Catalog Index - Normalization and ID Mapping + +## Goal + +Define a canonical mapping layer so AniBridge can match provider-specific series, seasons, episodes, and movies to the metadata models expected by Sonarr and Radarr with a higher hit rate than the current on-the-fly approach. + +## Problem Statement + +Provider catalogs do not align cleanly with Sonarr and Radarr expectations: + +- provider titles may differ from canonical metadata titles +- aliases and localized names vary +- specials and extras often use provider-specific numbering +- provider season and episode structures may not match TVDB ordering +- some providers expose only partial identifiers or no canonical IDs at all + +Today this mismatch is handled by a mix of live title matching, special-case mapping, and request-time probing. That limits correctness and repeatability. 
+ +## Canonical Metadata Targets + +### Sonarr-facing canonical model + +Use TVDB-style series and episode numbering as the primary canonical model for TV content. + +Persist at least: + +- `tvdb_id` +- canonical series title +- canonical season number +- canonical episode number + +Secondary IDs are optional and should be stored only when they materially improve matching: + +- `tmdb_id` +- `imdb_id` +- `tvmaze_id` +- `anilist_id` +- `mal_id` + +The schema should be designed so AniBridge can support additional non-provider-native canonical orderings later when they improve Sonarr compatibility. +When these secondary TV identifiers are cheap to obtain and do not add disproportionate engineering complexity, AniBridge should persist them. + +### Radarr-facing canonical model + +Use TMDb as the primary canonical model for movies. + +Persist at least: + +- `tmdb_id` +- canonical movie title +- release year + +Optional secondary IDs: + +- `imdb_id` +- `tvdb_id` + +For version one, TMDb should be treated as the authoritative movie identity. Secondary IDs may be null or omitted when they do not materially help. + +## Mapping Layers + +Split the model into explicit layers. 
+ +### Canonical entity layer + +Stores metadata-system identity independent of any provider: + +- canonical series +- canonical seasons +- canonical episodes +- canonical movies + +### Provider entity layer + +Stores how a provider represents the same title: + +- provider title row +- provider slug +- provider URL suffix or path +- provider media type hint +- provider-native season and episode coordinates +- provider-native language availability + +### Mapping layer + +Stores the relationship between provider entities and canonical entities: + +- provider title -> canonical series or movie +- provider episode -> canonical episode +- provider special or film entry -> canonical special episode or movie +- mapping confidence +- mapping source and last verification time + +## Matching Strategy + +Use a staged mapping strategy rather than a single fuzzy-title match. + +### Preferred match order for series + +1. explicit canonical ID from provider metadata, if available +2. existing confirmed local mapping +3. exact title or alias match against canonical metadata +4. constrained fuzzy title match +5. unresolved or low-confidence automatic mapping + +### Preferred match order for episodes + +1. existing confirmed provider episode mapping +2. direct canonical numbering alignment +3. provider special or extra mapping rules +4. alias-based remap to canonical special or extra episode +5. unresolved best-effort automatic mapping + +### Preferred match order for movies + +1. explicit `tmdb_id` +2. explicit `imdb_id` +3. title plus year exact match +4. constrained fuzzy match + +## Specials, Extras, and Films + +Specials should be first-class mapping records, not ad-hoc request-time exceptions. 
+ +Store: + +- provider source season and episode +- canonical alias season and episode when TV-mapped +- mapping rationale or source +- confidence flag + +This allows AniBridge to answer Sonarr requests consistently even when the provider exposes the content under `film-N`, season `0`, or another non-canonical structure. + +If provider content is clearly represented as a film or movie rather than a TV special, AniBridge should preserve that distinction and map it into the movie model for Radarr instead of force-normalizing it into TV season `0`. + +## Confidence and Verification + +Not all mappings are equally trustworthy. + +Suggested states: + +- `confirmed` +- `high_confidence` +- `low_confidence` +- `unresolved` +- `conflict` + +Why this matters: + +- request path can trust confirmed mappings immediately +- low-confidence mappings can be eligible for background re-check +- conflicting mappings can be excluded from automatic response emission + +Version one may still emit low-confidence matches on a best-effort basis when no better candidate exists. +If one provider episode plausibly maps to multiple canonical episodes, AniBridge should emit all plausible matches rather than suppressing output. + +## Request Path Usage + +When Sonarr or Radarr requests content: + +1. resolve the request to the canonical ID model expected by the client +2. query the local mapping tables for matching provider entities +3. return provider-backed results derived from confirmed or sufficiently strong mappings +4. do not perform request-time targeted metadata enrichment just because the local mapping is absent + +For generic AniBridge search that is not explicitly Sonarr or Radarr ID-driven, provider titles may still be returned even when canonical enrichment is not yet complete. 
+ +This should raise both: + +- probability of finding the correct title +- probability of returning the correct season and episode numbering + +## Suggested Data to Persist + +Persist: + +- canonical IDs +- canonical titles and aliases +- provider slugs and URL suffixes or paths +- provider-native season and episode coordinates +- mapping confidence and timestamps +- language availability per mapped episode + +Prefer compact persistence: + +- provider-relative URL suffixes or paths instead of full base URLs +- current-state records instead of history tables +- only those secondary IDs that materially improve Sonarr or Radarr matching + +Avoid persisting in the canonical mapping tables: + +- raw HTML +- large API payloads +- direct stream URLs +- large probe artifacts + +## Failure Modes to Plan For + +- one provider title maps to multiple plausible canonical series +- one provider episode maps to multiple plausible canonical episodes +- one canonical episode is split or merged differently by a provider +- specials are exposed as films instead of episodes +- provider title aliases drift over time +- canonical metadata source updates numbering after the initial mapping + +The design should support re-mapping without rebuilding the entire catalog from scratch. + +Canonical metadata changes should remap automatically on the next successful refresh. + +## Recommended Outcome + +The indexing feature and this mapping layer should ship together conceptually: + +- indexing without canonical mapping improves speed but not match quality enough +- canonical mapping without scheduled indexing improves logic but still leaves too much live work + +Combined, they create a local database that is both faster and more compatible with Sonarr and Radarr than the current request-driven approach. + +Low-confidence or conflicting mappings do not need to be surfaced to operators in version one, but the persistence model should not prevent a later web-UI override layer. 
+ +Provider-specific alias tables should be captured during provider crawl whenever the provider exposes usable alias data directly. This keeps provider-specific naming close to the source and reduces later enrichment work. + +## Selected Decisions + +- canonical TV mapping should prioritize Sonarr-friendly orderings and remain extensible beyond only TVDB ordering +- canonical movie mapping should prioritize TMDb and may omit or null secondary IDs when they do not materially help +- secondary TV identifiers such as `tvmaze_id`, `anilist_id`, and `mal_id` should be persisted when they are cheap to obtain and do not add significant engineering complexity +- ambiguous anime sequel, remake, split-cour, and similar cases should use best-effort automatic mapping in version one +- low-confidence mappings may still be used best-effort +- if one provider episode has multiple plausible canonical matches, AniBridge should return all plausible matches +- content clearly represented as films should stay in the movie domain rather than being forced into TV specials +- operator review flows for low-confidence mappings can be deferred +- the persistence model should leave room for future manual override support through the planned web UI +- automatic re-mapping should occur on later refreshes when canonical metadata changes +- stale mapped data may continue to be served if refresh is temporarily blocked upstream +- provider titles may still be returned for generic search even when canonical enrichment is incomplete +- provider-specific alias tables should be captured during provider crawl when feasible diff --git a/internal/specs/017-provider-catalog-index/02-streaming-persistence-and-memory-bounds.md b/internal/specs/017-provider-catalog-index/02-streaming-persistence-and-memory-bounds.md new file mode 100644 index 00000000..5082abf1 --- /dev/null +++ b/internal/specs/017-provider-catalog-index/02-streaming-persistence-and-memory-bounds.md @@ -0,0 +1,273 @@ +# 017 Provider 
Catalog Index - Streaming Persistence and Memory Bounds + +## Goal + +Redesign provider bootstrap and refresh execution so AniBridge can index large providers without unbounded memory growth, long silent stalls, or container restarts under realistic first-run load. + +The preferred solution is to stream crawled provider results into SQLite continuously instead of buffering a full provider crawl in memory and only persisting at the end. + +## Problem Statement + +The current provider indexing architecture is correct in spirit but too memory-heavy in practice for large catalogs such as `s.to`. + +Today the refresh flow for AniWorld and `s.to` is effectively: + +1. load the provider title index +2. crawl many titles in parallel +3. accumulate all crawled `TitleRecord` results in memory +4. persist the full provider result set only after crawling is complete +5. mark the refreshed generation as current + +This creates a large in-memory buffer whose size grows with: + +- total title count +- total episode count across those titles +- alias and language metadata +- canonical mapping payloads and intermediate metadata +- number of in-flight worker results + +Lowering concurrency only slows the rate of growth. It does not remove the underlying architectural pressure. + +## Observed Failure Mode + +During first bootstrap runs, memory usage can climb steadily into multiple gigabytes before the provider refresh finishes. + +Typical characteristics: + +- memory rises roughly with crawl progress instead of staying bounded +- the process may appear healthy for a long time and then die abruptly +- large providers such as `s.to` are the worst case because they have many titles and many episodes per title +- progress visibility degrades because the system is still "working" while holding more and more buffered state + +This behavior is more consistent with whole-provider result buffering than with a classic steady-state memory leak. 
+ +## Root Cause + +The main issue is architectural buffering. + +AniBridge currently keeps too much expanded provider state alive at once: + +- full crawled provider results +- per-title episode lists +- canonical enrichment results +- temporarily timed-out worker state that may still be finishing in background threads + +The memory profile therefore scales with provider size instead of with a small bounded working set. + +## Proposed Direction + +Replace the current "crawl everything, then persist everything" flow with a streaming provider pipeline: + +1. discover provider titles +2. crawl titles in parallel +3. emit each completed title result into a bounded queue +4. persist queued results continuously into a staging generation +5. keep serving the previous successful generation during the refresh +6. flip the provider's current successful generation only after the full staged refresh completes successfully + +This preserves refresh consistency while bounding memory growth. + +## Required Architecture Changes + +### Provider crawl contract + +Provider crawl code should stop returning one large `list[TitleRecord]` for the full provider refresh. + +Instead, each provider crawler should expose a streaming-oriented contract that: + +- yields or submits one completed title result at a time +- reports title discovery totals as soon as they are known +- reports per-title progress as results are completed +- allows the orchestrator to apply backpressure when the persistence side falls behind + +The contract may be implemented as: + +- a generator that yields title results +- a callback-based emitter +- a worker pool that pushes into a queue owned by the indexer + +The specific API shape is less important than the bounded-memory behavior. + +### Dedicated writer path + +Use one dedicated persistence path per provider refresh. 
+ +Recommended structure: + +- N crawler workers per provider +- 1 dedicated DB writer thread or task per provider refresh +- 1 bounded in-memory queue between crawl workers and the writer + +The writer is responsible for: + +- inserting or replacing provider title rows for the staging generation +- inserting aliases, episodes, language availability, and mappings for the staging generation +- updating per-provider progress counters +- committing in small batches + +SQLite write contention should remain controlled by avoiding many concurrent writers for the same provider refresh. + +### Staging generation semantics + +Do not publish partially refreshed provider state to the request path. + +Required behavior: + +- write new rows into a staging generation while the previous successful generation remains active +- keep `latest_success_generation` unchanged until the staged refresh fully succeeds +- if the refresh fails midway, keep serving the previous successful generation +- if no successful generation exists yet, keep bootstrap gating behavior intact + +This is the key consistency rule that allows streaming persistence without exposing partial catalogs as complete. + +## Queue and Backpressure Requirements + +The queue between crawlers and the writer must be bounded. + +Required properties: + +- configurable maximum queue size +- producer backpressure when the queue is full +- clear logs when crawlers are blocked on persistence backpressure +- no unbounded fallback list or hidden in-memory spillover + +The queue size should be chosen so that: + +- the writer has enough buffered work to stay busy +- memory remains predictably bounded even for the largest provider + +The implementation should prefer slowing crawl throughput over allowing queue growth beyond the configured bound. + +## Persistence Semantics + +Per-title persistence should happen as soon as the title result is available. 
+ +Required behavior: + +- persist each title independently into the staging generation +- commit frequently enough that long refreshes make visible forward progress +- make restarts and crash recovery able to resume or restart from a known staging state + +Implementation guidance: + +- use small batched transactions rather than one transaction per episode row +- clean up abandoned staging generations on the next startup or refresh attempt +- keep provider status and cursor state aligned with what has already been durably written + +## Failure and Recovery Semantics + +### Title-level failures + +One bad title must not stall or invalidate the whole provider refresh by default. + +Required behavior: + +- timed-out or failed titles should be logged with provider, slug, and reason +- the refresh should continue unless the failure rate crosses a configured threshold +- skipped titles should remain absent from the new generation unless explicitly retried successfully later + +### Refresh-level failures + +If the full provider refresh cannot complete: + +- do not promote the staging generation +- do not delete the currently served successful generation +- persist a clear provider-level error summary +- make the next run able to clean up or reuse stale staging rows safely + +### Restart recovery + +On startup, AniBridge should detect interrupted staging refreshes and handle them explicitly. + +Required behavior: + +- log that an interrupted staging generation was found +- mark the prior run as interrupted +- either delete the abandoned staging generation or restart it from a supported checkpoint + +Version one may choose cleanup-and-restart over true mid-provider resume if that is simpler and more reliable. + +## Progress and Observability + +The new design must improve visibility, not reduce it. 
+ +Required progress signals: + +- title discovery started +- title discovery completed with total count +- crawl progress as `completed/total` and percent when total is known +- queue depth and writer lag +- persistence progress as `persisted/total` +- generation promotion success +- explicit staging cleanup or abandonment messages + +The health surface should remain able to report: + +- provider phase +- processed titles +- total titles when known +- current slug or recently active slug +- last error summary +- whether the provider is serving an older successful generation while a new one is building + +## Memory and Performance Requirements + +The redesigned flow must keep memory bounded primarily by: + +- worker concurrency +- queue size +- one-title working set +- writer batch size + +It must not scale memory roughly linearly with full provider size. + +Additional recommendations: + +- drop intermediate canonical payloads as soon as they have been normalized and written +- avoid keeping large per-title objects alive after queue submission +- avoid background timeout wrappers that leave many unreachable or still-running worker threads alive for long periods + +Concurrency should remain configurable, but the implementation must not depend on low concurrency to stay within safe memory limits. + +## Implementation Guidance + +### Suggested rollout order + +1. introduce staging-generation streaming persistence behind the existing provider status model +2. convert AniWorld and `s.to` from full-list return values to streaming title emission +3. keep Megakino aligned with the same orchestration contract where practical +4. add queue depth, writer lag, and staging-generation logging +5. 
tighten cleanup of abandoned staging generations and interrupted runs + +### Acceptable simplifications + +The first implementation does not need: + +- fully parallel SQLite writers for the same provider +- cross-provider shared queue infrastructure +- perfect mid-run resume at arbitrary title boundaries + +What matters first is: + +- bounded memory +- continuous persistence +- atomic generation promotion +- clear recovery behavior + +## Non-Goals + +- replacing SQLite +- exposing partial staging rows to the request path as if they were ready +- maximizing raw crawl speed at the expense of stability +- introducing a complicated distributed job system + +## Selected Decisions + +- streaming provider results into SQLite is the selected solution +- a bounded queue with backpressure is required +- one dedicated writer path per provider refresh is preferred over many parallel SQLite writers +- staging generations must remain invisible to the request path until refresh success +- old successful generations must remain served during replacement refreshes +- cleanup-and-restart is acceptable for interrupted staging generations in version one +- lowering crawl concurrency is not considered a real fix for the memory issue, only a temporary mitigation +- progress reporting must include real counts and writer-state visibility, not only long-running heartbeat messages diff --git a/internal/specs/017-provider-catalog-index/03-performance.md b/internal/specs/017-provider-catalog-index/03-performance.md new file mode 100644 index 00000000..ee4327c0 --- /dev/null +++ b/internal/specs/017-provider-catalog-index/03-performance.md @@ -0,0 +1,1222 @@ +# AniBridge Provider Catalog Indexing: Memory-Bounded Local Bootstrap and Progressive Enrichment Specification + +## 1. 
Purpose + +This specification defines the required redesign of AniBridge's provider catalog indexing system so that: + +- first startup is usable quickly; +- full provider catalog data is eventually persisted locally; +- memory stays bounded under realistic self-hosted deployments; +- no official prebuilt provider-derived database or JSON catalog is shipped by the project; +- request paths continue to read from the local database instead of doing uncontrolled live probing; +- SQLite remains viable as the default database backend; +- normal users do not need to manually tune concurrency, queue sizes, retry intervals, or caching settings. + +The core design principle is: + +> AniBridge must compute provider-derived catalog data locally, progressively, and with strict memory bounds. The app must never require loading provider-scale state into Python memory. + +This document focuses only on performance, memory, startup behavior, indexing architecture, and SQLite-safety. + +The following topics are explicitly out of scope for this task: + +- import/export of catalog snapshots; +- official prebuilt provider catalog snapshots; +- Uvicorn reload behavior; +- legal policy text changes; +- UI polish beyond minimal progress/readiness information needed for correct behavior. + +--- + +## 2. Background and Current Problem + +The current `provider-catalog-index` branch already moved from a previous "crawl everything then persist everything" design toward a streaming persistence model. + +However, during realistic first bootstrap runs the container can still grow to multiple gigabytes of RAM and may eventually stop or be killed by the container/runtime. + +The observed and discussed problems are: + +1. The catalog contains many titles: + - AniWorld has thousands of titles. + - `s.to` has more than ten thousand titles. + - Each title may expand into many episodes, languages, host hints, aliases, and canonical metadata mappings. + +2. 
Python object overhead is large: + - HTML responses; + - BeautifulSoup parse trees; + - provider library objects; + - `TitleRecord` dataclasses; + - nested episode/language objects; + - canonical metadata payloads; + - dictionaries/lists/strings; + - SQLAlchemy ORM identity map objects. + +3. A bounded result queue does not automatically bound total process memory because memory can also be retained by: + - active crawler workers; + - `Future` objects; + - global metadata caches; + - still-running timed-out workers; + - SQLAlchemy sessions; + - retry storms after failures; + - concurrent provider refreshes. + +4. SQLite write contention can happen when multiple provider writer paths or failure/status updates write concurrently. + +5. The first-run user experience is poor if a full crawl is required before the software can be used. + +6. Setting all concurrency to `1` is not an acceptable product solution. It is only an emergency workaround. + +7. Shipping a finished provider-derived database would improve UX but creates a worse legal/platform risk posture because the project would distribute a curated provider index as an official artifact. + +Therefore, AniBridge needs a local progressive indexing architecture that is fast enough for normal use, memory-bounded by default, and safe for SQLite. + +--- + +## 3. High-Level Product Goal + +On a fresh install, AniBridge should behave like this: + +1. Start the container. +2. Initialize the database. +3. Quickly build a lightweight provider title index: + - slug; + - title; + - aliases; + - provider-relative path; + - media type hint when available. +4. Mark the provider as searchable once the title index exists. +5. Allow the application to respond using database-backed catalog data. +6. Continue crawling expensive details in the background: + - seasons; + - episodes; + - available languages; + - host hints; + - canonical TVDB mappings; + - other normalized mapping data. +7. 
Never hold the full provider catalog in Python memory. +8. Keep RAM usage bounded below a strict target. + +Target resource goals: + +- Normal idle runtime: below 512 MB RAM. +- During provider indexing: preferably below 1 GB RAM. +- Absolute design target: no normal first-run indexing path should exceed 1 GB RAM. +- The implementation must avoid any algorithm where memory grows approximately linearly with total provider size. + +These targets are product requirements, not tuning suggestions. + +--- + +## 4. Legal/Risk-Driven Distribution Decision + +AniBridge must not ship or automatically download an official prebuilt provider-derived catalog database or JSON file as part of this task. + +Rationale: + +- Source code distribution and provider-derived catalog data distribution have different risk profiles. +- A prebuilt catalog would likely be considered an official project artifact. +- A prebuilt catalog could be attacked more easily through takedown or platform complaint processes because it is a concrete curated index. +- The project should avoid becoming the distributor of provider-derived operational metadata. + +Therefore: + +- An official provider-derived catalog snapshot must not be added. +- First startup must not require a GitHub Release catalog asset. +- Automatic download of a project-hosted provider catalog must not be implemented. +- A bundled provider-derived SQLite/JSON catalog must not be included in the Docker image. +- Local self-hosted computation remains the default. + +This does not prohibit normal database migrations, schema files, empty seed tables, or code-defined provider configuration. + +--- + +## 5. Non-Goals + +Do not implement the following in this task: + +1. Import/export of provider catalogs. +2. Official provider-derived database snapshots. +3. External hosted catalog update service. +4. Uvicorn reload changes. +5. Replacing SQLite with PostgreSQL as a requirement. +6. Requiring Redis, Celery, or another distributed job system. +7. 
A complex UI redesign. +8. Live probing as the normal search/request path. +9. Large-scale user-configurable indexing strategy UI. +10. Any feature that requires the user to manually understand queue sizes, concurrency, or SQLite locks. + +--- + +## 6. Core Architectural Decision + +The selected architecture is: + +```text +Local progressive indexing + +Phase A: lightweight provider title index + -> fast + -> memory bounded + -> enables basic catalog search/readiness + +Phase B: provider detail enrichment + -> background + -> bounded concurrency + -> writes episodes/languages/host hints + +Phase C: canonical metadata enrichment + -> background + -> bounded concurrency + -> DB-backed or bounded cache only + -> writes TVDB/canonical mappings + +Request path: + -> database only + -> no uncontrolled live provider crawl fallback + -> may trigger explicit targeted warm-up only when designed as a DB-writing indexing job +``` + +Important: + +* The system must not wait for Phase B and Phase C to finish before the app is considered basically usable. +* The request path must not directly perform expensive live crawling as a hidden fallback. +* Any targeted on-demand indexing must be explicit, bounded, persisted, and visible as an indexing job. + +--- + +## 7. Definitions + +### Provider + +A catalog source such as: + +* `aniworld.to` +* `s.to` +* `megakino` + +### Title Index + +The lightweight list of provider titles. + +Contains only cheap title-level metadata: + +* provider key; +* slug; +* display title; +* normalized title; +* aliases; +* normalized aliases; +* provider-relative path; +* media type hint if known; +* generation; +* indexed timestamp. + +This phase must not crawl every title detail page. + +### Detail Index + +The expanded provider-level metadata for a title: + +* seasons; +* episodes; +* episode-relative paths; +* episode titles when available; +* available languages; +* host hints; +* media type hints; +* detail crawl state. 
+ +### Canonical Index + +Normalized mappings to external canonical metadata, for example: + +* TVDB series ID; +* canonical episode mapping; +* confidence; +* source; +* rationale. + +### Generation + +A version identifier for a consistent indexing pass. + +AniBridge may continue using the existing provider generation model, but the implementation must support partial/progressive stages without exposing inconsistent full-catalog state as complete. + +### Bootstrap Readiness + +There are multiple readiness levels: + +1. `title_ready` + + * The provider title index exists. + * Basic title search can work from DB. + +2. `detail_ready` + + * A meaningful detail crawl has completed for the provider. + +3. `canonical_ready` + + * Canonical mapping enrichment has completed or reached a configured baseline. + +4. `full_ready` + + * Detail and canonical enrichment have completed for the current provider generation. + +The app must not treat "title ready" and "full ready" as the same thing. + +--- + +## 8. Required Startup Behavior + +On application startup: + +1. Apply database migrations. +2. Ensure provider indexing status rows/stage rows exist. +3. Detect interrupted indexing work. +4. Clean up abandoned staging generations if necessary. +5. Start the provider catalog scheduler. +6. Schedule title index bootstrap for any provider that is not `title_ready`. +7. Schedule detail enrichment for title-ready providers that are not detail-ready. +8. Schedule canonical enrichment for titles/details that are not canonical-ready. +9. Do not block the web server until full catalog indexing is complete. + +The app must become reachable even if provider indexing is still running. + +Health/readiness responses must clearly distinguish: + +```text +app_ready: true +catalog_title_ready: true/false +catalog_detail_ready: true/false +catalog_canonical_ready: true/false +catalog_full_ready: true/false +provider phases... +``` + +--- + +## 9. 
Required Request Behavior + +### 9.1 Search and Torznab-like Query Behavior + +Request handlers must read from the database. + +They must not perform uncontrolled live crawling of provider title pages or provider episode pages during normal search. + +If the title index is not ready: + +* Return a clear "catalog title index is still initializing" response. +* Do not trigger large live crawling inside the request. + +If a title is found but details are not indexed yet: + +* Return only DB-backed data that is available; or +* Return a clear "details are still being indexed" message; or +* Optionally enqueue a bounded targeted warm-up job if that endpoint behavior is explicitly implemented. + +The request must not block for a full provider crawl. + +### 9.2 Optional Targeted Warm-Up + +A targeted warm-up feature may be implemented because it improves first-run usability without requiring a global prebuilt DB. + +If implemented, it must follow these rules: + +* It must be explicit. +* It must index one title or a small bounded set of titles. +* It must write results to the DB. +* It must use the same memory-bounded crawler pipeline as background indexing. +* It must not bypass persistence. +* It must not become an uncontrolled live-probing fallback for every request. +* It must expose job/progress state or at least clear pending/completed behavior. +* It must obey concurrency limits. + +Example behavior: + +```text +User searches title. +Title exists in DB title index. +Details missing. +System may enqueue targeted detail indexing for that title. +Request returns "details indexing queued/in progress" instead of doing full live work inline. +``` + +--- + +## 10. Indexing Pipeline + +The provider indexing pipeline must be split into stages. + +### 10.1 Stage A: Provider Title Index Bootstrap + +Purpose: + +* Quickly make provider titles searchable from DB. +* Avoid expensive per-title crawling. +* Avoid canonical metadata fetching. +* Avoid episode/detail expansion. 
+ +Input: + +* Provider alphabet/index page. +* Local provider-specific title index source if configured. + +Output: + +* `ProviderCatalogTitle` rows. +* `ProviderCatalogAlias` rows. +* Stage status updated to `title_ready`. + +Hard requirements: + +* Must not construct one huge list of expanded `TitleRecord` objects. +* Must not crawl every episode page. +* Must not query SkyHook/TVDB/canonical metadata. +* Must write to SQLite in small batches. +* Must use direct SQL or short-lived SQLAlchemy sessions to avoid large identity maps. +* Must commit frequently. +* Must be safe to restart. + +Recommended default: + +* Title index bootstrap may run quickly and with low concurrency because it is not the bottleneck. +* It should complete in seconds to a few minutes, not hours. + +### 10.2 Stage B: Provider Detail Enrichment + +Purpose: + +* Crawl title detail pages. +* Persist episode/language/host-hint data. + +Input: + +* Title rows from the DB that are missing detail enrichment or are stale. + +Output: + +* `ProviderCatalogEpisode` rows. +* `ProviderEpisodeLanguage` rows. +* `ProviderTitleIndexState` detail success/failure state. +* Updated detail stage progress. + +Hard requirements: + +* Must process titles incrementally. +* Must never hold the entire provider detail catalog in memory. +* Must never accumulate a provider-sized Python list of detail results. +* Must use bounded title crawl concurrency. +* Must use a bounded queue or direct row-command pipeline. +* Must write to SQLite continuously. +* Must drop HTML/Soup/provider objects as soon as the title is persisted. +* Must tolerate per-title failures. +* Must not fail the whole provider because one title fails unless failure rate exceeds a configured threshold. +* Must respect retry backoff. + +### 10.3 Stage C: Canonical Metadata Enrichment + +Purpose: + +* Resolve titles/episodes to canonical IDs/mappings. +* Write canonical series and episode mapping rows. + +Input: + +* Provider titles/details from DB. 
+ +Output: + +* `CanonicalSeries` rows. +* `CanonicalEpisode` rows. +* `ProviderSeriesMapping` rows. +* `ProviderEpisodeMapping` rows. +* `ProviderMovieMapping` rows if applicable. + +Hard requirements: + +* Must not use unbounded in-memory caches. +* Must not keep thousands of full show payloads in process-global dictionaries. +* Must use either: + + * a DB-backed canonical metadata cache; or + * a strict in-memory LRU/TTL cache with a hard max size. +* Must have low bounded concurrency by default. +* Must write results incrementally. +* Must tolerate failed canonical lookups. +* Must not block title readiness. +* Must not block detail indexing unless a specific mapping is required for that operation. + +Recommended default: + +* Canonical enrichment should be slower and safer than provider title/detail crawling. +* Use conservative concurrency because external metadata APIs can be slow and responses can be large. + +--- + +## 11. Memory-Bounding Requirements + +The implementation must satisfy all of the following. + +### 11.1 No Provider-Sized In-Memory Results + +Forbidden: + +```python +all_titles = crawl_entire_provider() +persist(all_titles) +``` + +Forbidden: + +```python +titles: list[TitleRecord] = [] +for title in provider: + titles.append(crawl_title(title)) +``` + +Required: + +```text +for each title: + crawl title + normalize to compact rows + write rows to DB + drop temporary objects +``` + +### 11.2 Bounded Queues + +Any queue between crawler workers and persistence must have a fixed max size. + +Default queue size should be small enough to keep RAM bounded. + +Recommended defaults: + +```env +PROVIDER_INDEX_QUEUE_SIZE=8 +PROVIDER_INDEX_WRITER_BATCH_SIZE=32 +``` + +If the queue is full: + +* crawler workers must block; +* logs should indicate backpressure at a rate-limited interval; +* the system must not create an unbounded fallback list. 
+ +### 11.3 Compact Row Commands Preferred + +The queue should not contain large fully expanded objects if avoidable. + +Preferred queue item style: + +```text +PersistTitleDetailCommand + provider + slug + title fields + aliases + compact episode rows + compact language rows + compact mapping rows +``` + +Avoid queueing: + +* BeautifulSoup objects; +* raw HTML; +* HTTP responses; +* provider library model objects; +* full external API payloads; +* unnecessary canonical raw payloads. + +### 11.4 Drop Temporary Objects + +After each title is persisted: + +* raw HTML references must be released; +* BeautifulSoup references must be released; +* provider library objects must not be stored globally; +* large canonical payloads must be reduced to compact DB rows; +* SQLAlchemy sessions must be closed; +* batch lists must be cleared. + +### 11.5 Bounded Metadata Cache + +Process-global metadata caches must be bounded. + +Forbidden: + +```python +_search_cache = {} +_show_cache = {} +``` + +unless there is an enforced max size and eviction. + +Acceptable: + +```python +_search_cache = TTLCache(maxsize=512, ttl=3600) +_show_cache = TTLCache(maxsize=256, ttl=3600) +``` + +Better: + +* DB-backed canonical cache table; +* tiny in-memory LRU hot cache; +* compact cached payloads only. + +Cache requirements: + +* hard max entry count; +* TTL; +* no full provider-sized retention; +* no full raw API response retention if only a subset is needed; +* clear unit tests for max-size behavior. + +### 11.6 SQLAlchemy Session Discipline + +Persistence code must avoid long-lived sessions that accumulate thousands of ORM objects. 
+ +Required: + +* short sessions per batch; +* commit frequently; +* clear/close session after each batch; +* avoid unnecessary `select(...).all()` before deletes; +* prefer direct delete statements for replacing child rows; +* consider `session.expunge_all()` or new session per batch if identity map growth is observed; +* avoid query-triggered autoflush surprises by structuring transactions carefully. + +--- + +## 12. SQLite Safety Requirements + +SQLite is the default database and must work safely. + +### 12.1 WAL and Busy Timeout + +The SQLite engine must enable: + +```sql +PRAGMA journal_mode=WAL; +PRAGMA synchronous=NORMAL; +PRAGMA busy_timeout=30000; +``` + +Also configure the DB connection timeout, for example: + +```python +connect_args={ + "check_same_thread": False, + "timeout": 30, +} +``` + +### 12.2 Single-Writer Discipline + +Provider indexing writes must be serialized. + +Acceptable implementation options: + +1. A single global DB writer queue for catalog indexing. +2. A global provider-index DB write lock. +3. A write coordinator that ensures only one catalog-index write transaction runs at a time. + +The implementation must ensure that these write paths do not run concurrently against SQLite: + +* title index batch writes; +* detail enrichment batch writes; +* canonical enrichment writes; +* provider status writes; +* title failure-state writes; +* generation cleanup/promotion writes. + +If using a global write lock, all catalog indexing writes must use it. + +### 12.3 Provider-Level Concurrency Defaults + +Default provider-level concurrency must be: + +```env +PROVIDER_INDEX_GLOBAL_CONCURRENCY=1 +``` + +This means: + +* only one provider refresh/enrichment stage should run at a time by default; +* inside that provider, title-level concurrency may still be greater than one. + +This default protects SQLite from cross-provider write contention while preserving useful crawl speed. 
+ +### 12.4 Title-Level Concurrency Defaults + +Recommended defaults: + +```env +PROVIDER_INDEX_CONCURRENCY_ANIWORLD=4 +PROVIDER_INDEX_CONCURRENCY_STO=4 +PROVIDER_INDEX_CONCURRENCY_MEGAKINO=2 +``` + +These are product defaults, not emergency values. + +The implementation must allow environment overrides, but the defaults must be safe for normal self-hosting. + +### 12.5 Retry Backoff Must Be Respected + +A failed provider or stage must not be retried immediately if `next_refresh_after` or equivalent retry timestamp is in the future. + +Required due logic: + +```python +if status is running: + return False + +if next_refresh_after exists and next_refresh_after > now: + return False + +otherwise: + return due according to status/stage rules +``` + +Forbidden behavior: + +```python +if latest_success_at is None: + return True +``` + +when a retry timestamp exists. + +This is critical to prevent first-bootstrap retry storms. + +--- + +## 13. Worker Timeout and Cancellation Requirements + +The current behavior of marking a title as timed out while leaving the underlying worker running is dangerous if the provider is retried quickly. + +Required behavior: + +1. All HTTP calls used by provider indexing must have hard timeouts. +2. A title timeout must prevent new work from piling up. +3. If a provider/stage fails, the scheduler must not immediately start another run while old workers may still be active. +4. The system must track active provider/stage workers and avoid duplicate runs. +5. `executor.shutdown(wait=False)` must not be used in a way that allows repeated retries while old threads continue consuming memory. +6. If Python threads cannot be safely stopped, the retry/backoff logic must account for that and wait long enough before another run. +7. Timed-out title work must not retain large result objects. + +Recommended implementation: + +* Prefer bounded waits and clean draining. +* Do not submit new futures after writer failure. +* Cancel pending futures. 
+* Let running futures finish within timeout. +* Mark provider/stage as failed with retry backoff if shutdown is incomplete. +* Do not reschedule until retry backoff has elapsed. + +--- + +## 14. Scheduler Requirements + +The scheduler must support staged indexing. + +### 14.1 Stage Ordering + +For each provider: + +1. Ensure title index exists. +2. Then schedule detail enrichment. +3. Then schedule canonical enrichment. + +Do not run expensive detail/canonical enrichment for a provider whose title index is missing. + +### 14.2 Fairness + +Default behavior should avoid one provider starving all others forever. + +Recommended: + +* Run one provider/stage at a time by default. +* Choose due work by priority: + + 1. missing title index; + 2. targeted warm-up jobs; + 3. missing detail enrichment; + 4. missing canonical enrichment; + 5. scheduled refreshes. + +### 14.3 Retry Backoff + +Each stage must have a retry timestamp after failure. + +Failures must not cause immediate retry loops. + +### 14.4 Progress State + +The scheduler must expose progress per provider and stage: + +* provider; +* stage; +* status; +* total titles when known; +* queued titles; +* active workers; +* completed titles; +* failed titles; +* persisted rows or persisted titles; +* current/recent slug; +* queue depth; +* writer lag; +* last error; +* next retry time; +* whether title index is ready; +* whether details are complete; +* whether canonical mappings are complete. + +--- + +## 15. Database Schema Requirements + +The exact schema may reuse existing models where appropriate, but the final system must be able to represent the following states clearly. + +### 15.1 Provider Stage Status + +There must be durable state for each provider and indexing stage. 
+ +Required logical fields: + +```text +provider +stage +status +generation +latest_success_generation +started_at +completed_at +latest_success_at +next_retry_after +total_items +completed_items +failed_items +cursor_slug +last_error_summary +updated_at +``` + +Stages should include at least: + +```text +title_index +detail_enrichment +canonical_enrichment +``` + +This may be implemented as: + +* a new `ProviderIndexStageStatus` table; or +* carefully extended existing provider status tables. + +The implementation must avoid ambiguous status fields where `bootstrap_completed` means different things in different contexts. + +### 15.2 Title Detail State + +There must be durable state per provider title for detail indexing. + +Required logical fields: + +```text +provider +slug +detail_status +detail_attempted_at +detail_success_at +detail_failure_count +detail_last_error_summary +canonical_status +canonical_attempted_at +canonical_success_at +canonical_failure_count +canonical_last_error_summary +updated_at +``` + +This may extend the existing `ProviderTitleIndexState` table. + +### 15.3 Canonical Metadata Cache + +If canonical metadata is cached in DB, use tables with compact payloads. + +Required logical fields: + +```text +cache_key +source +payload_compact_json +created_at +expires_at +last_used_at +``` + +The payload must be compact and contain only fields required by AniBridge mapping. + +Do not store unnecessarily large raw responses. + +### 15.4 Generation Visibility + +The request path must only serve rows from a generation considered visible/active for the relevant data type. + +If title index generation is ready but details are not complete: + +* title search may use title rows; +* detail-dependent endpoints must know details may be incomplete. + +Do not mark a provider as fully ready just because title rows exist. + +--- + +## 16. 
Persistence Requirements + +### 16.1 Replace Rows Efficiently + +For per-title child rows such as aliases, episodes, languages, and mappings: + +* delete old rows for that provider/title/stage; +* insert new rows; +* do not first load all old rows into Python unless needed. + +Avoid: + +```python +session.exec(select(Child).where(...)).all() +session.exec(delete(Child).where(...)) +``` + +Prefer: + +```python +session.exec(delete(Child).where(...)) +``` + +### 16.2 Batch Size + +Default writer batch size: + +```env +PROVIDER_INDEX_WRITER_BATCH_SIZE=32 +``` + +Rules: + +* batch size must be configurable; +* batch must be small enough to avoid large memory spikes; +* batch must be large enough to avoid one transaction per tiny row; +* writer must flush by size and by time. + +### 16.3 Failure Recording + +Title failures must be recorded durably. + +However, failure recording must not bypass SQLite write serialization. + +All failure-state writes must go through the same write coordinator or lock. + +### 16.4 Promotion + +For stages that use generations: + +* write into staging generation; +* mark stage generation successful only after successful completion; +* do not expose a partially written stage generation as complete; +* if refresh fails, keep previous successful generation visible; +* clean up abandoned staging generation on restart or next run. + +For progressive detail enrichment, per-title successful writes may become visible if the API clearly treats detail completeness as per-title/progressive. This must not falsely report provider-wide `detail_ready`. + +--- + +## 17. Configuration Defaults + +The following default values must be safe for normal users.
+ +```env +PROVIDER_INDEX_GLOBAL_CONCURRENCY=1 + +PROVIDER_INDEX_CONCURRENCY_ANIWORLD=4 +PROVIDER_INDEX_CONCURRENCY_STO=4 +PROVIDER_INDEX_CONCURRENCY_MEGAKINO=2 + +PROVIDER_INDEX_QUEUE_SIZE=8 +PROVIDER_INDEX_WRITER_BATCH_SIZE=32 +PROVIDER_INDEX_WRITER_FLUSH_SECONDS=1.0 + +PROVIDER_INDEX_TITLE_TIMEOUT_SECONDS=45 +PROVIDER_INDEX_FAILURE_THRESHOLD_PERCENT=20.0 +PROVIDER_INDEX_BACKPRESSURE_LOG_SECONDS=15.0 + +CANONICAL_INDEX_CONCURRENCY=2 +CANONICAL_CACHE_MEMORY_MAX_SEARCH=512 +CANONICAL_CACHE_MEMORY_MAX_SHOW=256 +CANONICAL_CACHE_TTL_SECONDS=3600 +``` + +If exact variable names differ, implement equivalent settings and document them. + +Invalid environment values must be sanitized. + +For example: + +* concurrency less than 1 becomes 1; +* queue size less than 1 becomes 1; +* negative timeout uses default; +* invalid float/int logs warning and uses default. + +--- + +## 18. Observability Requirements + +Logs must make indexing behavior understandable without being spammy. + +Required log events: + +1. Scheduler startup with effective safe defaults. +2. Provider title index start. +3. Provider title index total loaded. +4. Title index batch persisted. +5. Title index ready. +6. Detail enrichment start. +7. Detail enrichment progress heartbeat. +8. Detail enrichment batch persisted. +9. Queue backpressure warning, rate-limited. +10. Per-title failure warning, rate-limited or summarized if noisy. +11. Canonical enrichment start. +12. Canonical enrichment progress. +13. Canonical cache hit/miss summary, not every request at info level. +14. Stage completed. +15. Stage failed with retry timestamp. +16. Stale staging generation cleanup. +17. SQLite lock retry/busy warning if encountered. +18. Memory budget warning if optional memory instrumentation is implemented. + +Health endpoint should expose provider/stage progress. + +--- + +## 19. Acceptance Criteria + +The implementation is complete only if all criteria below are satisfied. 
+ +### 19.1 Functional + +* Fresh install starts without a provider-derived snapshot. +* DB schema migrates successfully. +* Title index stage runs before detail/canonical enrichment. +* Title index stage writes searchable provider titles. +* App can report partial catalog readiness. +* Detail enrichment runs in background. +* Canonical enrichment runs in background or as a bounded follow-up stage. +* Request handlers read from DB. +* No normal request triggers a full provider crawl. +* Failed stages respect retry backoff. +* Interrupted staging generations are cleaned up or handled explicitly. + +### 19.2 Memory + +* No provider-scale list of expanded title records exists. +* Global canonical metadata caches are bounded or DB-backed. +* Queue sizes are bounded. +* SQLAlchemy sessions do not retain provider-scale identity maps. +* Large parse objects are not stored beyond a single title's processing. +* Default indexing should be designed to stay below 1 GB RAM. + +### 19.3 SQLite + +* WAL mode is enabled. +* Busy timeout is configured. +* Provider indexing writes are serialized. +* Cross-provider writer contention is avoided by default. +* `database is locked` should not be normal during first bootstrap. +* If SQLite lock contention still occurs, it must retry or fail gracefully with backoff, not start a retry storm. + +### 19.4 User Experience + +* Normal users do not need to tune concurrency. +* First startup does not require waiting for full provider detail/canonical enrichment. +* Basic title catalog readiness happens quickly. +* Progress is visible. +* Full enrichment continues automatically. + +### 19.5 Legal/Risk Boundary + +* No official provider-derived catalog DB/JSON is shipped. +* No automatic download of project-hosted provider-derived catalog data exists. +* Local computation remains the default. + +--- + +## 20. 
Suggested Implementation Plan + +### Step 1: Fix scheduler retry logic + +* Ensure retry timestamps are respected even during first bootstrap. +* Remove logic where `latest_success_at is None` forces immediate due status despite retry backoff. + +### Step 2: Add SQLite write safety + +* Add WAL, synchronous NORMAL, busy timeout. +* Add a global catalog-index write coordinator or lock. +* Ensure all catalog indexing writes use it. + +### Step 3: Introduce explicit stage status + +* Add or extend DB models to represent: + + * title index status; + * detail enrichment status; + * canonical enrichment status. +* Add migrations. +* Update health endpoint. + +### Step 4: Split title indexing from detail crawling + +* Implement fast provider title index stage. +* Persist titles/aliases only. +* Mark title-ready separately from full-ready. + +### Step 5: Rework detail enrichment to operate from DB rows + +* Query due title rows in small chunks. +* Crawl only those titles. +* Persist each result or small batch. +* Drop temporary objects. + +### Step 6: Bound or persist canonical caches + +* Replace unbounded dict caches. +* Prefer DB-backed compact canonical cache. +* If DB-backed cache is too large for this task, use strict TTL LRU cache with hard max sizes. + +### Step 7: Split canonical enrichment from provider detail crawl + +* Avoid canonical API calls during initial title bootstrap. +* Prefer canonical enrichment after details are persisted. +* Make it independently bounded and retryable. + +### Step 8: Add progress and tests + +* Add unit tests for: + + * scheduler due/backoff logic; + * bounded cache max size; + * no immediate retry after failure; + * title stage does not call detail crawler; + * SQLite write coordinator is used. +* Add integration tests for: + + * fresh DB startup state; + * title-ready before full-ready; + * failed provider retry backoff; + * detail enrichment persists incrementally. + +--- + +## 21. 
Explicit Implementation Constraints + +The implementation must not: + +* add official provider-derived catalog assets; +* require users to configure low concurrency manually; +* block startup until all provider details are indexed; +* use unbounded global caches; +* use unbounded queues; +* store raw HTML or BeautifulSoup objects in queues; +* allow multiple provider writers to fight over SQLite by default; +* retry failed bootstrap stages immediately; +* perform full provider live crawling inside request handlers; +* mark full catalog readiness when only title index readiness exists. + +--- + +## 22. Desired Final Behavior Example + +Fresh install: + +```text +Application startup. +Database migrations complete. +Provider catalog scheduler started. +aniworld.to title_index: running +s.to title_index: pending +megakino title_index: pending +Application HTTP server ready. +``` + +After title index for one provider: + +```text +aniworld.to title_index: ready, 2421 titles +aniworld.to detail_enrichment: running, 37/2421 titles +catalog_title_ready=true +catalog_full_ready=false +``` + +User searches: + +```text +Search query reads title DB. +Matching title is found. +If detail rows exist, return detail-backed result. +If detail rows do not exist, return clear "details indexing" state or enqueue explicit targeted warm-up. +``` + +Background continues: + +```text +aniworld.to detail_enrichment: running +queue_depth=3 +writer_lag=2 +memory remains bounded +``` + +Failure: + +```text +s.to detail_enrichment failed: network/provider issue +next_retry_after=... +scheduler does not retry before next_retry_after +previous visible data remains available +``` + +Completion: + +```text +aniworld.to title_index: ready +aniworld.to detail_enrichment: ready +aniworld.to canonical_enrichment: ready +aniworld.to full_ready=true +``` + +--- + +## 23. Summary + +The correct solution is neither to lower all concurrency to one nor to ship a prebuilt provider catalog.
+ +The correct solution is: + +```text +local progressive indexing ++ fast title index bootstrap ++ background detail enrichment ++ background canonical enrichment ++ strict memory bounds ++ bounded caches ++ SQLite single-writer discipline ++ WAL/busy timeout ++ retry backoff ++ DB-only request path +``` + +This preserves the self-hosted legal/risk boundary while making AniBridge practical for normal users. diff --git a/internal/specs/017-provider-catalog-index/README.md b/internal/specs/017-provider-catalog-index/README.md new file mode 100644 index 00000000..d1a632bc --- /dev/null +++ b/internal/specs/017-provider-catalog-index/README.md @@ -0,0 +1,5 @@ +# 017 Provider Catalog Index - Spec Index + +- `00-scheduled-provider-indexing.md` - scheduled full-catalog indexing and refresh strategy +- `01-normalization-and-id-mapping.md` - canonical ID mapping and provider-to-Sonarr/Radarr normalization strategy +- `02-streaming-persistence-and-memory-bounds.md` - streaming provider persistence, staging generations, and bounded-memory bootstrap strategy