appergb · appergb · Jul 2, 2026 · Jul 2, 2026
@@ -144,6 +144,101 @@ pub struct TranscriptSourceResult {
     pub error: Option<String>,
 }
 
+/// One visual ("Moments") hit for `search_media`: a source-second range within one
+/// asset, or a still image (no range). Timings are source seconds, ready to convert
+/// to `trimStartFrame`/`trimEndFrame` (upstream `visualResults`' `moments` entries).
+#[derive(Debug, Clone)]
+pub struct SearchVisualHit {
+    /// Asset id (`mediaRef`).
+    pub media_ref: String,
+    /// Shot start in source seconds; carries no meaning when `is_image` is set.
+    pub start_seconds: f64,
+    /// Shot end in source seconds; carries no meaning when `is_image` is set.
+    pub end_seconds: f64,
+    /// Uncalibrated similarity score: use it to order hits, not as an absolute measure.
+    pub score: f32,
+    /// True for still images: no time range → upstream sets `type: "image"`.
+    pub is_image: bool,
+}
+
+/// One spoken ("Spoken") hit for `search_media`: a transcript segment that matches
+/// every query term (corresponds to upstream `spokenResults` entries).
+#[derive(Debug, Clone)]
+pub struct SearchSpokenHit {
+    pub media_ref: String, // asset id (`mediaRef`) the segment belongs to
+    pub start_seconds: f64, // segment start; presumably source seconds like visual hits (confirm)
+    pub end_seconds: f64, // segment end, same unit as `start_seconds`
+    pub text: String, // text of the matching transcript segment
+}
+
+/// State of the visual index as reported in the `search_media` `status` field. It
+/// mirrors upstream's `visualStatus` string enum (`ToolExecutor+Search.swift:91-100`).
+/// The dispatcher serializes the exact upstream spelling (see `as_str`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SearchIndexState {
+    /// Model installed and everything is (currently) indexed.
+    Ready,
+    /// Model installed, but indexing is still in progress.
+    Indexing,
+    /// Model has not been downloaded yet.
+    ModelNotInstalled,
+    /// Model download is in flight.
+    DownloadingModel,
+    /// Model is loading/preparing.
+    Preparing,
+    /// Visual search is disabled (no backend / build without it).
+    Disabled,
+    /// Model load or download failed.
+    Failed,
+}
+
+impl SearchIndexState {
+    /// Upstream string spelling for the `status` field (camelCase, e.g. `modelNotInstalled`).
+    pub fn as_str(self) -> &'static str {
+        match self {
+            SearchIndexState::Ready => "ready",
+            SearchIndexState::Indexing => "indexing",
+            SearchIndexState::ModelNotInstalled => "modelNotInstalled",
+            SearchIndexState::DownloadingModel => "downloadingModel",
+            SearchIndexState::Preparing => "preparing",
+            SearchIndexState::Disabled => "disabled",
+            SearchIndexState::Failed => "failed",
+        }
+    }
+}
+
+/// One asset handed to [`MediaBridge::search_media`]. The dispatcher resolves the
+/// candidate set (optionally restricted to a single `mediaRef`) and passes it down,
+/// because only the bridge can resolve ids to files and read the caches.
+#[derive(Debug, Clone)]
+pub struct SearchCandidate {
+    /// Asset id (`mediaRef`).
+    pub media_ref: String,
+    /// True for video/image assets (eligible for visual search).
+    pub is_visual: bool,
+    /// True for video/audio assets (eligible for spoken search).
+    pub is_spoken: bool,
+}
+
+/// Complete `search_media` result returned by the bridge. The dispatcher shapes it
+/// into the upstream JSON envelope (`status`/`indexableAssets`/`indexedAssets`/
+/// `moments`/`spoken`). The two groups rank independently and are never blended.
+#[derive(Debug, Clone)]
+pub struct SearchMediaResult {
+    /// Visual index state, emitted as the `status` field.
+    pub status: SearchIndexState,
+    /// Number of visual assets in scope (upstream `indexableAssets`).
+    pub indexable_assets: usize,
+    /// Number of those assets that already have a current on-disk index
+    /// (upstream `indexedAssets`); `None` when the model isn't loaded, so the
+    /// count can't be computed (upstream omits the key in that case).
+    pub indexed_assets: Option<usize>,
+    /// Visual hits (empty when `scope == "spoken"` or the index isn't ready).
+    pub moments: Vec<SearchVisualHit>,
+    /// Spoken hits (empty when `scope == "visual"`; produced regardless of `status`).
+    pub spoken: Vec<SearchSpokenHit>,
+}
+
 /// The injected capability boundary for the render + import tools. `Send + Sync`
 /// so the [`Dispatcher`](super::dispatch::Dispatcher) can hold `Arc<dyn
 /// MediaBridge>` across threads (matching [`CoreHandle`](super::core_handle)).
@@ -191,6 +286,29 @@ pub trait MediaBridge: Send + Sync {
             "import_media: importing is not available in this build",
         ))
     }
+
+    /// Search the media library by content: visual (SigLIP2 semantic) and spoken
+    /// (transcript keyword). `candidates` is the resolved, in-scope asset set
+    /// (already narrowed to one `mediaRef` when the caller restricted it);
+    /// `scope` is `"visual"`/`"spoken"`/`"both"` and `limit` is the per-group cap.
+    /// The two groups rank independently and are never blended (upstream). This
+    /// default ignores every argument and returns `Ok` with status `Disabled`, zero
+    /// counts and no hits, so a bridge-less build still yields a well-formed result.
+    fn search_media(
+        &self,
+        _candidates: &[SearchCandidate],
+        _query: &str,
+        _scope: &str,
+        _limit: usize,
+    ) -> Result<SearchMediaResult, BridgeError> {
+        Ok(SearchMediaResult {
+            status: SearchIndexState::Disabled,
+            indexable_assets: 0,
+            indexed_assets: None,
+            moments: Vec::new(),
+            spoken: Vec::new(),
+        })
+    }
 }
 
 /// Turn one [`InspectedFrame`] into an MCP image [`Block`], base64-encoding the

@@ -20,6 +20,16 @@ pub const EMBEDDING_DIM: usize = 768;
 pub const IMAGE_SIZE: u32 = 256;
 pub const CONTEXT_LENGTH: usize = 64;
 
+/// Base URL from which the ONNX model files are fetched (`{base}/{file}`), mirroring
+/// `WhisperModel.base_url`. Placeholder until the ONNX build is hosted (SPEC
+/// T8.0): the download command builds `{base}/image_encoder.onnx` etc. and
+/// SHA-256-verifies each file against [`manifest`]'s (currently placeholder) hashes,
+/// so a real download only succeeds once both this URL and the manifest
+/// hashes/bytes are filled in. The Hugging Face `resolve/main` raw-file endpoint
+/// is the intended host (same shape as the whisper model URL).
+pub const MODEL_DOWNLOAD_BASE_URL: &str =
+    "https://huggingface.co/opentake/siglip2-base-patch16-256-onnx/resolve/main";
+
 /// The [`EmbedderSpec`] for the configured SigLIP2 model. `normalized` defaults
 /// to `false` to match upstream's assumption that the exported model L2-
 /// normalizes internally (SPEC §0.8); flip it only if calibration proves the

@@ -30,11 +30,18 @@ opentake-core = { workspace = true }
 opentake-project = { workspace = true }
 opentake-ops = { workspace = true }
 opentake-domain = { workspace = true }
-# Transcription is ON for the shipped app: `whisper-backend` compiles the
-# whisper.cpp CPU backend (via cmake — preinstalled on GitHub runners, no CUDA),
-# `model-download` pulls the ggml model over HTTPS with SHA-1 verification. Both
-# stay optional at the opentake-media level (its own tests run without them).
-opentake-media = { workspace = true, features = ["whisper-backend", "model-download"] }
+# Transcription + visual semantic search are ON for the shipped app:
+# `whisper-backend` compiles the whisper.cpp CPU backend (via cmake —
+# preinstalled on GitHub runners, no CUDA); `ort-backend` links ONNX Runtime for
+# the SigLIP2 dual-encoder — its `download-binaries` feature fetches a
+# *statically* linked prebuilt onnxruntime at build time and bakes it into the
+# binary (no runtime .so/.dylib to ship; same build-time-network shape as
+# whisper.cpp / model-download); `model-download` pulls the model files over
+# HTTPS with checksum verification. All three stay optional at the
+# opentake-media level (its own default tests run offline, without them);
+# ort-inference tests are feature-gated + runtime-skipped like the GPU/ffmpeg
+# integration tests.
+opentake-media = { workspace = true, features = ["whisper-backend", "ort-backend", "model-download"] }
 opentake-render = { workspace = true }
 opentake-gen = { workspace = true }
 opentake-agent = { workspace = true }

@@ -17,6 +17,7 @@ mod library;
 mod mcp;
 mod media;
 mod render;
+mod search;
 mod secret;
 mod transcribe;
 
@@ -192,6 +193,11 @@ pub fn run() {
             transcribe::transcribe_media,
             transcribe::transcript_get,
             captions::generate_captions,
+            search::search_model_status,
+            search::download_search_model,
+            search::search_index_status,
+            search::search_index_start,
+            search::search_query,
             library::library_list,
             library::library_favorite,
             library::library_unfavorite,

@@ -27,6 +27,7 @@ use base64::Engine as _;
 use opentake_agent::mcp::core_handle::{AppCoreHandle, CoreHandle};
 use opentake_agent::mcp::media_bridge::{
     BridgeError, ImportOutcome, ImportSource, InspectResult, InspectedFrame, MediaBridge,
+    SearchCandidate, SearchIndexState, SearchMediaResult, SearchSpokenHit, SearchVisualHit,
     TranscriptSource, TranscriptSourceResult,
 };
 use opentake_agent::mcp::server;
@@ -247,6 +248,97 @@ impl MediaBridge for TauriMediaBridge {
             )),
         }
     }
+
+    fn search_media(
+        &self,
+        candidates: &[SearchCandidate],
+        query: &str,
+        scope: &str,
+        limit: usize,
+    ) -> Result<SearchMediaResult, BridgeError> {
+        // Resolve each candidate id to its expected source path via the live manifest.
+        // Missing (offline) files are kept — their index/transcript reads simply
+        // yield nothing, matching upstream (a missing file has no results, not an
+        // error). Ids the resolver cannot map to a path are dropped.
+        let manifest = self.core.media();
+        let project_dir = self.core.project_dir();
+        let resolver = opentake_domain::MediaResolver::new(&manifest, project_dir.as_deref());
+        let mut visual_paths: Vec<(String, PathBuf)> = Vec::new();
+        let mut spoken_paths: Vec<(String, PathBuf)> = Vec::new();
+        for c in candidates {
+            let Some(path) = resolver.expected_path(&c.media_ref) else {
+                continue; // no expected path for this id: drop the candidate
+            };
+            if c.is_visual {
+                visual_paths.push((c.media_ref.clone(), path.clone())); // `path` may be reused below
+            }
+            if c.is_spoken {
+                spoken_paths.push((c.media_ref.clone(), path));
+            }
+        }
+
+        let fps = self.core.get_timeline().timeline.fps; // timeline fps, passed to the visual search
+        let installed = crate::search::model_installed(&self.engine);
+
+        // Visual group: runs for every scope except "spoken" ("both" and any other value).
+        let (status, indexable_assets, indexed_assets, moments) = if scope == "spoken" {
+            (SearchIndexState::Disabled, 0, None, Vec::new()) // spoken-only: no visual counts/hits
+        } else {
+            let (indexable, indexed) = crate::search::visual_coverage(&self.engine, &visual_paths);
+            // Status mirrors upstream `visualStatus`: without the model it's
+            // `modelNotInstalled`; with it, `indexing` while any indexable asset
+            // is still un-indexed, else `ready`. (Download/preparing/failed are
+            // transient front-end states the panel owns; the tool reports the
+            // stable installed/ready/indexing view.)
+            let status = if !installed {
+                SearchIndexState::ModelNotInstalled
+            } else if indexable > 0 && indexed < indexable {
+                SearchIndexState::Indexing
+            } else {
+                SearchIndexState::Ready
+            };
+            let moments: Vec<SearchVisualHit> = // NOTE(review): queried even when `!installed`; confirm
+                crate::search::visual_hits_by_id(&self.engine, &visual_paths, query, fps, limit)
+                    .into_iter()
+                    .map(|h| SearchVisualHit {
+                        media_ref: h.media_id,
+                        start_seconds: h.start_sec,
+                        end_seconds: h.end_sec,
+                        score: h.score,
+                        is_image: h.is_image,
+                    })
+                    .collect();
+            // `indexedAssets` is only meaningful when the model is installed
+            // (upstream sets it only when an embedder spec exists), hence `None` otherwise.
+            let indexed_opt = if installed { Some(indexed) } else { None };
+            (status, indexable, indexed_opt, moments)
+        };
+
+        // Spoken group: runs for every scope except "visual". Independent of the
+        // visual index — it is a keyword search over cached transcripts.
+        let spoken: Vec<SearchSpokenHit> = if scope == "visual" {
+            Vec::new()
+        } else {
+            self.engine
+                .search_spoken(query, &spoken_paths, limit)
+                .into_iter()
+                .map(|h| SearchSpokenHit {
+                    media_ref: h.asset_id,
+                    start_seconds: h.start,
+                    end_seconds: h.end,
+                    text: h.text,
+                })
+                .collect()
+        };
+
+        Ok(SearchMediaResult {
+            status,
+            indexable_assets,
+            indexed_assets,
+            moments,
+            spoken,
+        })
+    }
 }
 
 impl TauriMediaBridge {