diff --git a/crates/opentake-agent/src/mcp/dispatch.rs b/crates/opentake-agent/src/mcp/dispatch.rs
index de38968..d3aef22 100644
--- a/crates/opentake-agent/src/mcp/dispatch.rs
+++ b/crates/opentake-agent/src/mcp/dispatch.rs
@@ -37,7 +37,7 @@ use serde_json::Value;
 use crate::mcp::core_handle::CoreHandle;
 use crate::mcp::gen_catalog;
 use crate::mcp::media_bridge::{
-    frame_to_block, ImportSource, InspectResult, MediaBridge, TranscriptSource,
+    frame_to_block, ImportSource, InspectResult, MediaBridge, SearchCandidate, TranscriptSource,
 };
 use crate::plugin::registry::PluginRegistry;
 use crate::signal::engine;
@@ -208,19 +208,19 @@ impl Dispatcher {
             ToolName::SmartReframe => self.smart_reframe(args),
             ToolName::TightenSilences => self.tighten_silences(args, before),
 
-            // --- Render + import + transcript (wired to the injected MediaBridge) ---
+            // --- Render + import + transcript + search (wired to the injected MediaBridge) ---
             ToolName::InspectTimeline => self.inspect_timeline(args, before),
             ToolName::ImportMedia => self.import_media(args, manifest),
             ToolName::GetTranscript => self.get_transcript(args, before, manifest),
             ToolName::AddCaptions => self.add_captions(args, before, manifest),
+            ToolName::SearchMedia => self.search_media(args, manifest),
 
             // --- Not yet implementable in this phase (honest stubs) ---
-            // Media reads (inspect/search) still need the analysis backend;
-            // generation/upscale need the async GenClient + BYOK auth.
-            // Motion graphics (#34) now routes through the planned Motion Canvas
-            // plugin: render mp4 -> import media -> place clip.
+            // inspect_media still needs the analysis backend; generation/upscale
+            // need the async GenClient + BYOK auth. Motion graphics (#34) now
+            // routes through the planned Motion Canvas plugin: render mp4 ->
+            // import media -> place clip.
             ToolName::InspectMedia
-            | ToolName::SearchMedia
             | ToolName::GenerateVideo
             | ToolName::GenerateImage
             | ToolName::GenerateAudio
@@ -405,6 +405,134 @@ impl Dispatcher {
         Ok(ToolResult::ok(outcome.message))
     }
 
+    /// `search_media`: content search over the media library — visual
+    /// ("moments") and spoken (transcript) hits, kept as two separate groups.
+    /// Port of upstream `ToolExecutor+Search.searchMedia`
+    /// (`ToolExecutor+Search.swift:6-32`).
+    ///
+    /// Validation failures (blank `query`, unknown `scope`, unknown `mediaRef`,
+    /// no bridge) come back as `ToolResult::error`; a bridge failure is raised
+    /// as a `ToolError`. `limit` defaults to 10 and is clamped to `1..=50`.
+    /// Output is a JSON object: `status`/`indexableAssets`/`indexedAssets`/
+    /// `moments` unless `scope == "spoken"`, and `spoken` unless
+    /// `scope == "visual"`; floats pass through `round_floats_3dp`.
+    fn search_media(
+        &self,
+        args: &Value,
+        manifest: &MediaManifest,
+    ) -> Result<ToolResult, ToolError> {
+        use opentake_domain::ClipType;
+        use serde_json::json;
+        let a: SearchMediaArgs = decode_tool_args(args, "")?;
+        // Borrow the trimmed query instead of copying it: the bridge takes `&str`.
+        let query = a.query.trim();
+        if query.is_empty() {
+            return Ok(ToolResult::error("search_media: query is empty"));
+        }
+        // scope ∈ {visual, spoken, both}, default both (upstream).
+        let scope = a.scope.as_deref().unwrap_or("both");
+        if !matches!(scope, "visual" | "spoken" | "both") {
+            return Ok(ToolResult::error(format!(
+                "search_media: scope must be visual, spoken, or both (got '{scope}')"
+            )));
+        }
+        // limit default 10, clamped to 1..=50 (upstream `min(max(limit,1),50)`),
+        // so the `as usize` cast below cannot truncate or wrap.
+        let limit = a.limit.unwrap_or(10).clamp(1, 50) as usize;
+
+        // Optional `mediaRef` restricts the search to one existing asset. The id
+        // is borrowed from the decoded args, so the filter below needs no clone.
+        let restrict: Option<&str> = match a.media_ref.as_deref() {
+            Some(ref_id) if manifest.entries.iter().any(|e| e.id == ref_id) => Some(ref_id),
+            Some(ref_id) => {
+                return Ok(ToolResult::error(format!(
+                    "search_media: media not found: {ref_id}"
+                )));
+            }
+            None => None,
+        };
+
+        // Build the candidate set from the manifest (kind → visual/spoken):
+        // video is both, image is visual only, audio is spoken only.
+        let candidates: Vec<SearchCandidate> = manifest
+            .entries
+            .iter()
+            .filter(|e| restrict.is_none_or(|r| r == e.id))
+            .map(|e| SearchCandidate {
+                media_ref: e.id.clone(),
+                is_visual: matches!(e.kind, ClipType::Video | ClipType::Image),
+                is_spoken: matches!(e.kind, ClipType::Video | ClipType::Audio),
+            })
+            .collect();
+
+        let Some(bridge) = self.bridge.as_ref() else {
+            return Ok(ToolResult::error(
+                "search_media: search is not available in this build",
+            ));
+        };
+        let result = bridge
+            .search_media(&candidates, query, scope, limit)
+            .map_err(|e| ToolError::new(e.message))?;
+
+        // Shape the upstream JSON. `name` per hit is looked up from the manifest;
+        // an id the manifest doesn't know yields an empty name.
+        let name_of = |media_ref: &str| -> String {
+            manifest
+                .entries
+                .iter()
+                .find(|e| e.id == media_ref)
+                .map(|e| e.name.clone())
+                .unwrap_or_default()
+        };
+
+        let mut payload = serde_json::Map::new();
+        if scope != "spoken" {
+            // Visual group: status + indexable count always; indexed count if known.
+            payload.insert("status".into(), json!(result.status.as_str()));
+            payload.insert("indexableAssets".into(), json!(result.indexable_assets));
+            if let Some(indexed) = result.indexed_assets {
+                payload.insert("indexedAssets".into(), json!(indexed));
+            }
+            let moments: Vec<Value> = result
+                .moments
+                .iter()
+                .map(|h| {
+                    let mut m = serde_json::Map::new();
+                    m.insert("mediaRef".into(), json!(h.media_ref));
+                    m.insert("name".into(), json!(name_of(&h.media_ref)));
+                    m.insert("score".into(), json!(f64::from(h.score)));
+                    if h.is_image {
+                        m.insert("type".into(), json!("image"));
+                    } else {
+                        m.insert("startSeconds".into(), json!(h.start_seconds));
+                        m.insert("endSeconds".into(), json!(h.end_seconds));
+                    }
+                    Value::Object(m)
+                })
+                .collect();
+            payload.insert("moments".into(), json!(moments));
+        }
+        if scope != "visual" {
+            let spoken: Vec<Value> = result
+                .spoken
+                .iter()
+                .map(|h| {
+                    json!({
+                        "mediaRef": h.media_ref,
+                        "name": name_of(&h.media_ref),
+                        "startSeconds": h.start_seconds,
+                        "endSeconds": h.end_seconds,
+                        "text": h.text,
+                    })
+                })
+                .collect();
+            payload.insert("spoken".into(), json!(spoken));
+        }
+
+        let out = round_floats_3dp(Value::Object(payload));
+        Ok(ToolResult::ok(out.to_string()))
+    }
+
     /// `get_transcript`: the live timeline transcript in project frames. Walks
     /// every caption-eligible audio/video clip, transcribes each unique source
     /// once (cached, via the [`MediaBridge`]), maps each word through the clip's
@@ -2343,6 +2471,9 @@ mod tests {
     use std::sync::Arc;
 
     use crate::mcp::core_handle::CoreHandle;
+    use crate::mcp::media_bridge::{
+        SearchIndexState, SearchMediaResult, SearchSpokenHit, SearchVisualHit,
+    };
 
     /// A faithful [`CoreHandle`] over a real in-memory [`AppCore`], seeded with a
     /// video track and one media asset so `add_clips` can run end to end.
@@ -3358,8 +3489,15 @@ mod tests {
         /// Records the media_refs passed to the last `transcribe_sources` call,
         /// so tests can assert dedup.
         transcribe_calls: Mutex<Vec<Vec<String>>>,
+        /// Canned `search_media` result; when `None` the fake returns the same
+        /// `disabled`/empty result as the trait default. Calls go to `search_calls`.
+        search_result: Mutex<Option<SearchMediaResult>>,
+        search_calls: Mutex<Vec<SearchCall>>,
     }
 
+    /// One recorded `search_media` call: `(query, scope, limit, candidate ids)`.
+    type SearchCall = (String, String, usize, Vec<String>);
+
     impl FakeBridge {
         fn with_transcript(self, media_ref: &str, t: TranscriptionResult) -> Self {
             self.transcripts
@@ -3447,6 +3585,31 @@ mod tests {
                 message: format!("Imported via {tag}."),
             })
         }
+
+        fn search_media(
+            &self,
+            candidates: &[SearchCandidate],
+            query: &str,
+            scope: &str,
+            limit: usize,
+        ) -> Result<SearchMediaResult, BridgeError> {
+            // Log the forwarded arguments first so tests can assert on them even
+            // when the canned result is served.
+            let ids: Vec<String> = candidates.iter().map(|c| c.media_ref.clone()).collect();
+            self.search_calls
+                .lock()
+                .unwrap()
+                .push((query.to_owned(), scope.to_owned(), limit, ids));
+            // Seeded result wins; otherwise mirror the trait default (`disabled`, no hits).
+            let canned = self.search_result.lock().unwrap().clone();
+            Ok(canned.unwrap_or(SearchMediaResult {
+                status: SearchIndexState::Disabled,
+                indexable_assets: 0,
+                indexed_assets: None,
+                moments: Vec::new(),
+                spoken: Vec::new(),
+            }))
+        }
     }
 
     /// A dispatcher whose timeline has a single 60-frame clip and a `FakeBridge`
@@ -3719,6 +3882,223 @@ mod tests {
         );
     }
 
+    // MARK: - search_media (visual + spoken content search via the MediaBridge)
+
+    fn image_entry(id: &str, name: &str) -> MediaManifestEntry {
+        let mut still = entry(id, name);
+        still.kind = ClipType::Image;
+        still
+    }
+
+    /// Builds a dispatcher whose manifest holds a video (`v`), an audio (`a`) and
+    /// an image (`i`) asset, wired to a `FakeBridge` that answers with `result`.
+    /// The bridge is returned too so tests can inspect the recorded calls.
+    fn search_dispatcher(result: SearchMediaResult) -> (Dispatcher, Arc<FakeBridge>) {
+        let mut manifest = MediaManifest::new();
+        manifest.entries.push(entry("v", "Harbor Sunset"));
+        manifest.entries.push(audio_entry("a", "Interview"));
+        manifest.entries.push(image_entry("i", "Poster"));
+        let state = Arc::new(StateHandle::new(Timeline::new(), manifest));
+
+        let fake = Arc::new(FakeBridge::default());
+        *fake.search_result.lock().unwrap() = Some(result);
+        let dispatcher = Dispatcher::with_bridge(
+            state,
+            Arc::new(RwLock::new(PluginRegistry::new())),
+            Some(fake.clone() as Arc<dyn MediaBridge>),
+        );
+        (dispatcher, fake)
+    }
+
+    fn sample_search_result() -> SearchMediaResult {
+        let video_hit = SearchVisualHit {
+            media_ref: "v".into(),
+            start_seconds: 3.0,
+            end_seconds: 6.0,
+            score: 0.82,
+            is_image: false,
+        };
+        let image_hit = SearchVisualHit {
+            media_ref: "i".into(),
+            start_seconds: 0.0,
+            end_seconds: 0.0,
+            score: 0.5,
+            is_image: true,
+        };
+        let spoken_hit = SearchSpokenHit {
+            media_ref: "a".into(),
+            start_seconds: 12.0,
+            end_seconds: 14.0,
+            text: "the budget plan".into(),
+        };
+        SearchMediaResult {
+            status: SearchIndexState::Ready,
+            indexable_assets: 2,
+            indexed_assets: Some(2),
+            moments: vec![video_hit, image_hit],
+            spoken: vec![spoken_hit],
+        }
+    }
+
+    #[test]
+    fn search_media_shapes_upstream_json_with_both_groups() {
+        let (d, bridge) = search_dispatcher(sample_search_result());
+        let r = d.dispatch(
+            "search_media",
+            serde_json::json!({ "query": "sunset harbor" }),
+        );
+        assert!(!r.is_error, "{}", r.text_joined());
+        let v: serde_json::Value = serde_json::from_str(&first_text(&r)).unwrap();
+
+        // Visual group: status + counts + moments.
+        assert_eq!(v["status"], "ready");
+        assert_eq!(v["indexableAssets"], 2);
+        assert_eq!(v["indexedAssets"], 2);
+        let moments = v["moments"].as_array().unwrap();
+        assert_eq!(moments.len(), 2);
+        // Video hit carries a source-second range + name; no `type`.
+        assert_eq!(moments[0]["mediaRef"], "v");
+        assert_eq!(moments[0]["name"], "Harbor Sunset");
+        assert_eq!(moments[0]["startSeconds"], 3.0);
+        assert_eq!(moments[0]["endSeconds"], 6.0);
+        assert!(moments[0].get("type").is_none());
+        // Image hit is `type: image`, no range.
+        assert_eq!(moments[1]["mediaRef"], "i");
+        assert_eq!(moments[1]["type"], "image");
+        assert!(moments[1].get("startSeconds").is_none());
+
+        // Spoken group: mediaRef/name/range/text.
+        let spoken = v["spoken"].as_array().unwrap();
+        assert_eq!(spoken.len(), 1);
+        assert_eq!(spoken[0]["mediaRef"], "a");
+        assert_eq!(spoken[0]["name"], "Interview");
+        assert_eq!(spoken[0]["text"], "the budget plan");
+
+        // Default scope=both, limit=10 forwarded; all three ids are candidates.
+        let calls = bridge.search_calls.lock().unwrap();
+        assert_eq!(calls.len(), 1);
+        assert_eq!(calls[0].1, "both");
+        assert_eq!(calls[0].2, 10);
+        assert_eq!(calls[0].3.len(), 3);
+    }
+
+    #[test]
+    fn search_media_scope_visual_omits_spoken() {
+        let (d, _bridge) = search_dispatcher(sample_search_result());
+        let r = d.dispatch(
+            "search_media",
+            serde_json::json!({ "query": "harbor", "scope": "visual" }),
+        );
+        assert!(!r.is_error, "{}", r.text_joined());
+        let v: serde_json::Value = serde_json::from_str(&first_text(&r)).unwrap();
+        assert!(v.get("moments").is_some());
+        assert!(v.get("spoken").is_none()); // upstream: visual scope omits spoken
+        assert!(v.get("status").is_some());
+    }
+
+    #[test]
+    fn search_media_scope_spoken_omits_visual_status() {
+        let (d, _bridge) = search_dispatcher(sample_search_result());
+        let r = d.dispatch(
+            "search_media",
+            serde_json::json!({ "query": "budget", "scope": "spoken" }),
+        );
+        assert!(!r.is_error, "{}", r.text_joined());
+        let v: serde_json::Value = serde_json::from_str(&first_text(&r)).unwrap();
+        assert!(v.get("spoken").is_some());
+        // Spoken-only skips the visual group entirely (no status/moments).
+        assert!(v.get("status").is_none());
+        assert!(v.get("moments").is_none());
+    }
+
+    #[test]
+    fn search_media_limit_is_clamped_1_to_50() {
+        let (d, bridge) = search_dispatcher(sample_search_result());
+        // Over-max clamps to 50.
+        let _ = d.dispatch(
+            "search_media",
+            serde_json::json!({ "query": "x", "limit": 999 }),
+        );
+        // Under-min clamps to 1.
+        let _ = d.dispatch(
+            "search_media",
+            serde_json::json!({ "query": "x", "limit": 0 }),
+        );
+        let calls = bridge.search_calls.lock().unwrap();
+        assert_eq!(calls[0].2, 50);
+        assert_eq!(calls[1].2, 1);
+    }
+
+    #[test]
+    fn search_media_media_ref_restricts_candidates() {
+        let (d, bridge) = search_dispatcher(sample_search_result());
+        let r = d.dispatch(
+            "search_media",
+            serde_json::json!({ "query": "x", "mediaRef": "v" }),
+        );
+        assert!(!r.is_error, "{}", r.text_joined());
+        let calls = bridge.search_calls.lock().unwrap();
+        // Only the one restricted asset is a candidate.
+        assert_eq!(calls[0].3, vec!["v".to_string()]);
+    }
+
+    #[test]
+    fn search_media_unknown_media_ref_errors() {
+        let (d, _bridge) = search_dispatcher(sample_search_result());
+        let r = d.dispatch(
+            "search_media",
+            serde_json::json!({ "query": "x", "mediaRef": "nope" }),
+        );
+        assert!(r.is_error);
+        assert!(
+            r.text_joined().contains("media not found"),
+            "{}",
+            r.text_joined()
+        );
+    }
+
+    #[test]
+    fn search_media_empty_query_errors() {
+        let (d, _bridge) = search_dispatcher(sample_search_result());
+        let r = d.dispatch("search_media", serde_json::json!({ "query": "   " }));
+        assert!(r.is_error);
+        assert!(
+            r.text_joined().contains("query is empty"),
+            "{}",
+            r.text_joined()
+        );
+    }
+
+    #[test]
+    fn search_media_invalid_scope_errors() {
+        let (d, _bridge) = search_dispatcher(sample_search_result());
+        let r = d.dispatch(
+            "search_media",
+            serde_json::json!({ "query": "x", "scope": "sideways" }),
+        );
+        assert!(r.is_error);
+        assert!(
+            r.text_joined().contains("scope must be"),
+            "{}",
+            r.text_joined()
+        );
+    }
+
+    #[test]
+    fn search_media_without_bridge_reports_unavailable() {
+        let mut m = MediaManifest::new();
+        m.entries.push(entry("v", "Clip"));
+        let handle = Arc::new(StateHandle::new(Timeline::new(), m));
+        let d = Dispatcher::new(handle, Arc::new(RwLock::new(PluginRegistry::new())));
+        let r = d.dispatch("search_media", serde_json::json!({ "query": "x" }));
+        assert!(r.is_error);
+        assert!(
+            r.text_joined().contains("not available in this build"),
+            "{}",
+            r.text_joined()
+        );
+    }
+
     // MARK: - get_transcript (timeline transcript via the MediaBridge)
 
     fn word(text: &str, start: f64, end: f64) -> TranscriptionWord {
diff --git a/crates/opentake-agent/src/mcp/media_bridge.rs b/crates/opentake-agent/src/mcp/media_bridge.rs
index 3cc6316..8100877 100644
--- a/crates/opentake-agent/src/mcp/media_bridge.rs
+++ b/crates/opentake-agent/src/mcp/media_bridge.rs
@@ -144,6 +144,101 @@ pub struct TranscriptSourceResult {
     pub error: Option<String>,
 }
 
+/// One visual ("Moments") hit for `search_media` — a source-second range in one
+/// asset, or a still image (no range). Source-second timings, ready to convert to
+/// `trimStartFrame`/`trimEndFrame` (upstream `visualResults`' `moments` entries).
+#[derive(Debug, Clone)]
+pub struct SearchVisualHit {
+    /// Asset id (`mediaRef`).
+    pub media_ref: String,
+    /// Shot start in source seconds; the dispatcher leaves it out when `is_image`.
+    pub start_seconds: f64,
+    /// Shot end in source seconds; the dispatcher leaves it out when `is_image`.
+    pub end_seconds: f64,
+    /// Uncalibrated similarity score (ordering only).
+    pub score: f32,
+    /// True for still images: no time range → upstream sets `type: "image"`.
+    pub is_image: bool,
+}
+
+/// One spoken ("Spoken") hit for `search_media`: a transcript segment matching
+/// every query term (upstream `spokenResults` entries).
+#[derive(Debug, Clone)]
+pub struct SearchSpokenHit {
+    pub media_ref: String, // asset id (`mediaRef`)
+    pub start_seconds: f64, // segment start, emitted as `startSeconds`
+    pub end_seconds: f64, // segment end, emitted as `endSeconds`
+    pub text: String, // the segment's transcript text
+}
+
+/// The visual index's state for the `search_media` `status` field, mirroring
+/// upstream's `visualStatus` string enum (`ToolExecutor+Search.swift:91-100`).
+/// [`SearchIndexState::as_str`] yields the exact upstream spelling.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SearchIndexState {
+    /// Model installed, everything (currently) indexed.
+    Ready,
+    /// Model installed, indexing still in progress.
+    Indexing,
+    /// Model not yet downloaded.
+    ModelNotInstalled,
+    /// Model download in flight.
+    DownloadingModel,
+    /// Model loading/preparing.
+    Preparing,
+    /// Visual search disabled (no backend / build without it).
+    Disabled,
+    /// Model load or download failed.
+    Failed,
+}
+
+impl SearchIndexState {
+    /// Returns the `status` string exactly as upstream spells it.
+    pub fn as_str(self) -> &'static str {
+        match self {
+            Self::Ready => "ready",
+            Self::Indexing => "indexing",
+            Self::ModelNotInstalled => "modelNotInstalled",
+            Self::DownloadingModel => "downloadingModel",
+            Self::Preparing => "preparing",
+            Self::Disabled => "disabled",
+            Self::Failed => "failed",
+        }
+    }
+}
+
+/// One asset to search for [`MediaBridge::search_media`]. The dispatcher builds
+/// the candidate set (optionally restricted to one `mediaRef`) and passes it
+/// down, since only the bridge can resolve ids to files + read the caches.
+#[derive(Debug, Clone)]
+pub struct SearchCandidate {
+    /// Asset id (`mediaRef`).
+    pub media_ref: String,
+    /// True for video/image (visual-searchable).
+    pub is_visual: bool,
+    /// True for video/audio (spoken-searchable).
+    pub is_spoken: bool,
+}
+
+/// The full `search_media` result the bridge returns; the dispatcher shapes it
+/// into the upstream JSON envelope (`status`/`indexableAssets`/`indexedAssets`/
+/// `moments`/`spoken`). Groups rank independently and are never blended.
+#[derive(Debug, Clone)]
+pub struct SearchMediaResult {
+    /// The visual index state for the `status` field (unused for spoken-only scope).
+    pub status: SearchIndexState,
+    /// Count of visual assets in scope (upstream `indexableAssets`).
+    pub indexable_assets: usize,
+    /// How many of those already have a current on-disk index
+    /// (upstream `indexedAssets`); `None` when the model isn't loaded so the
+    /// count can't be computed (upstream omits the key then).
+    pub indexed_assets: Option<usize>,
+    /// Visual hits (empty when `scope == "spoken"` or the index isn't ready).
+    pub moments: Vec<SearchVisualHit>,
+    /// Spoken hits (empty when `scope == "visual"`; work regardless of status).
+    pub spoken: Vec<SearchSpokenHit>,
+}
+
 /// The injected capability boundary for the render + import tools. `Send + Sync`
 /// so the [`Dispatcher`](super::dispatch::Dispatcher) can hold `Arc<dyn
 /// MediaBridge>` across threads (matching [`CoreHandle`](super::core_handle)).
@@ -191,6 +286,29 @@ pub trait MediaBridge: Send + Sync {
             "import_media: importing is not available in this build",
         ))
     }
+
+    /// Search the media library by content: visual (SigLIP2 semantic) and spoken
+    /// (transcript keyword). `candidates` is the resolved, in-scope asset set
+    /// (already filtered to one `mediaRef` when the caller restricted it);
+    /// `scope` is `"visual"`/`"spoken"`/`"both"` and `limit` the per-group cap.
+    /// The two groups rank independently and are never blended (upstream). The
+    /// default reports `disabled` with no hits, so a bridge that does not
+    /// implement search still returns an honest, well-formed result.
+    fn search_media(
+        &self,
+        _candidates: &[SearchCandidate],
+        _query: &str,
+        _scope: &str,
+        _limit: usize,
+    ) -> Result<SearchMediaResult, BridgeError> {
+        Ok(SearchMediaResult {
+            status: SearchIndexState::Disabled,
+            indexable_assets: 0,
+            indexed_assets: None,
+            moments: Vec::new(),
+            spoken: Vec::new(),
+        })
+    }
 }
 
 /// Turn one [`InspectedFrame`] into an MCP image [`Block`], base64-encoding the
diff --git a/crates/opentake-media/src/search/config.rs b/crates/opentake-media/src/search/config.rs
index fe9c1d3..2289320 100644
--- a/crates/opentake-media/src/search/config.rs
+++ b/crates/opentake-media/src/search/config.rs
@@ -20,6 +20,16 @@ pub const EMBEDDING_DIM: usize = 768;
 pub const IMAGE_SIZE: u32 = 256;
 pub const CONTEXT_LENGTH: usize = 64;
 
+/// Base URL the ONNX model files are fetched from (`{base}/{file}`), mirroring
+/// `WhisperModel.base_url`. Placeholder until the ONNX build is hosted (SPEC
+/// T8.0): the download command constructs `{base}/image_encoder.onnx` etc., and
+/// SHA-256-verifies each against [`manifest`]'s (currently placeholder) hashes,
+/// so a real download only succeeds once both this URL and the manifest
+/// hashes/bytes are filled in. The Hugging Face `resolve/main` raw-file endpoint
+/// is the intended host (same shape as the whisper model URL).
+pub const MODEL_DOWNLOAD_BASE_URL: &str =
+    "https://huggingface.co/opentake/siglip2-base-patch16-256-onnx/resolve/main";
+
 /// The [`EmbedderSpec`] for the configured SigLIP2 model. `normalized` defaults
 /// to `false` to match upstream's assumption that the exported model L2-
 /// normalizes internally (SPEC §0.8); flip it only if calibration proves the
diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml
index a09434e..c928adb 100644
--- a/src-tauri/Cargo.toml
+++ b/src-tauri/Cargo.toml
@@ -30,11 +30,18 @@ opentake-core = { workspace = true }
 opentake-project = { workspace = true }
 opentake-ops = { workspace = true }
 opentake-domain = { workspace = true }
-# Transcription is ON for the shipped app: `whisper-backend` compiles the
-# whisper.cpp CPU backend (via cmake — preinstalled on GitHub runners, no CUDA),
-# `model-download` pulls the ggml model over HTTPS with SHA-1 verification. Both
-# stay optional at the opentake-media level (its own tests run without them).
-opentake-media = { workspace = true, features = ["whisper-backend", "model-download"] }
+# Transcription + visual semantic search are ON for the shipped app:
+# `whisper-backend` compiles the whisper.cpp CPU backend (via cmake —
+# preinstalled on GitHub runners, no CUDA); `ort-backend` links ONNX Runtime for
+# the SigLIP2 dual-encoder — its `download-binaries` feature fetches a
+# *statically* linked prebuilt onnxruntime at build time and bakes it into the
+# binary (no runtime .so/.dylib to ship; same build-time-network shape as
+# whisper.cpp / model-download); `model-download` pulls the model files over
+# HTTPS with checksum verification. All three stay optional at the
+# opentake-media level (its own default tests run offline, without them);
+# ort-inference tests are feature-gated + runtime-skipped like the GPU/ffmpeg
+# integration tests.
+opentake-media = { workspace = true, features = ["whisper-backend", "ort-backend", "model-download"] }
 opentake-render = { workspace = true }
 opentake-gen = { workspace = true }
 opentake-agent = { workspace = true }
diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs
index 263aa40..323e7cc 100644
--- a/src-tauri/src/lib.rs
+++ b/src-tauri/src/lib.rs
@@ -17,6 +17,7 @@ mod library;
 mod mcp;
 mod media;
 mod render;
+mod search;
 mod secret;
 mod transcribe;
 
@@ -192,6 +193,11 @@ pub fn run() {
             transcribe::transcribe_media,
             transcribe::transcript_get,
             captions::generate_captions,
+            search::search_model_status,
+            search::download_search_model,
+            search::search_index_status,
+            search::search_index_start,
+            search::search_query,
             library::library_list,
             library::library_favorite,
             library::library_unfavorite,
diff --git a/src-tauri/src/mcp.rs b/src-tauri/src/mcp.rs
index 9e33545..e3add78 100644
--- a/src-tauri/src/mcp.rs
+++ b/src-tauri/src/mcp.rs
@@ -27,6 +27,7 @@ use base64::Engine as _;
 use opentake_agent::mcp::core_handle::{AppCoreHandle, CoreHandle};
 use opentake_agent::mcp::media_bridge::{
     BridgeError, ImportOutcome, ImportSource, InspectResult, InspectedFrame, MediaBridge,
+    SearchCandidate, SearchIndexState, SearchMediaResult, SearchSpokenHit, SearchVisualHit,
     TranscriptSource, TranscriptSourceResult,
 };
 use opentake_agent::mcp::server;
@@ -247,6 +248,97 @@ impl MediaBridge for TauriMediaBridge {
             )),
         }
     }
+
+    fn search_media(
+        &self,
+        candidates: &[SearchCandidate],
+        query: &str,
+        scope: &str,
+        limit: usize,
+    ) -> Result<SearchMediaResult, BridgeError> {
+        // Tauri-side `search_media`: resolve ids → paths, then run the visual and
+        // spoken searches independently. Missing (offline) files are kept — their
+        // index/transcript reads simply yield nothing, matching upstream; ids
+        // the resolver cannot map to a path are dropped.
+        let manifest = self.core.media();
+        let project_dir = self.core.project_dir();
+        let resolver = opentake_domain::MediaResolver::new(&manifest, project_dir.as_deref());
+        let mut visual_paths: Vec<(String, PathBuf)> = Vec::new();
+        let mut spoken_paths: Vec<(String, PathBuf)> = Vec::new();
+        for c in candidates {
+            let Some(path) = resolver.expected_path(&c.media_ref) else {
+                continue;
+            };
+            if c.is_visual {
+                visual_paths.push((c.media_ref.clone(), path.clone()));
+            }
+            if c.is_spoken {
+                spoken_paths.push((c.media_ref.clone(), path));
+            }
+        }
+
+        // Visual group (skipped for scope == "spoken").
+        let (status, indexable_assets, indexed_assets, moments) = if scope == "spoken" {
+            (SearchIndexState::Disabled, 0, None, Vec::new())
+        } else {
+            // fps + model probe feed only the visual path; spoken-only queries skip them.
+            let fps = self.core.get_timeline().timeline.fps;
+            let installed = crate::search::model_installed(&self.engine);
+            let (indexable, indexed) = crate::search::visual_coverage(&self.engine, &visual_paths);
+            // Status mirrors upstream `visualStatus`: without the model it's
+            // `modelNotInstalled`; with it, `indexing` while any indexable asset
+            // is still un-indexed, else `ready`. (Download/preparing/failed are
+            // transient front-end states the panel owns; the tool reports the
+            // stable installed/ready/indexing view.)
+            let status = if !installed {
+                SearchIndexState::ModelNotInstalled
+            } else if indexable > 0 && indexed < indexable {
+                SearchIndexState::Indexing
+            } else {
+                SearchIndexState::Ready
+            };
+            let moments: Vec<SearchVisualHit> =
+                crate::search::visual_hits_by_id(&self.engine, &visual_paths, query, fps, limit)
+                    .into_iter()
+                    .map(|h| SearchVisualHit {
+                        media_ref: h.media_id,
+                        start_seconds: h.start_sec,
+                        end_seconds: h.end_sec,
+                        score: h.score,
+                        is_image: h.is_image,
+                    })
+                    .collect();
+            // `indexedAssets` is only meaningful when the model is loaded
+            // (upstream sets it only when an embedder spec exists).
+            let indexed_opt = installed.then_some(indexed);
+            (status, indexable, indexed_opt, moments)
+        };
+
+        // Spoken group (skipped for scope == "visual"). Works regardless of the
+        // visual index — keyword search over cached transcripts.
+        let spoken: Vec<SearchSpokenHit> = if scope == "visual" {
+            Vec::new()
+        } else {
+            self.engine
+                .search_spoken(query, &spoken_paths, limit)
+                .into_iter()
+                .map(|h| SearchSpokenHit {
+                    media_ref: h.asset_id,
+                    start_seconds: h.start,
+                    end_seconds: h.end,
+                    text: h.text,
+                })
+                .collect()
+        };
+
+        Ok(SearchMediaResult {
+            status,
+            indexable_assets,
+            indexed_assets,
+            moments,
+            spoken,
+        })
+    }
 }
 
 impl TauriMediaBridge {
diff --git a/src-tauri/src/search.rs b/src-tauri/src/search.rs
new file mode 100644
index 0000000..8e2506c
--- /dev/null
+++ b/src-tauri/src/search.rs
@@ -0,0 +1,839 @@
+//! Visual + spoken semantic search command surface.
+//!
+//! Wires the built-but-previously-unreachable SigLIP2 visual-search engine
+//! (`opentake_media::search`) to the app, alongside the already-wired spoken
+//! (transcript keyword) search. Upstream is `Search/SearchIndexCoordinator.swift`
+//! (per-project indexing queue + query) and `MediaTab+Search.swift` (the three
+//! result groups: Moments / Spoken / Files). OpenTake substitutes ONNX Runtime
+//! for CoreML, so the SigLIP2 model is two explicit `.onnx` files the user
+//! downloads once (mirroring the whisper flow in `transcribe.rs`).
+//!
+//! Commands (all camelCase DTOs, `web/src/lib/types.ts` contract — the repo's #1
+//! bug class — with serde round-trip tests):
+//! - [`search_model_status`] — is the SigLIP2 model installed? (+ label / size).
+//! - [`download_search_model`] — async download with `search://progress` events,
+//!   SHA-256 verified exactly as `search::model_download::install` provides.
+//! - [`search_index_status`] — how much of the project's visual media is indexed.
+//! - [`search_index_start`] — index every not-yet-indexed video/image asset
+//!   (sampled frames → SigLIP2 embeddings → `PALMEMB1` store), emitting
+//!   `search://index` progress events. Idempotent (already-current assets skip).
+//! - [`search_query`] — run the three-group query: Moments (visual), Spoken
+//!   (transcript), Files (name match). Matches upstream's groups, caps, and order.
+//!
+//! The visual index / query path needs the ONNX Runtime backend (feature
+//! `ort-backend`, ON for the shipped app). When the model isn't installed, the
+//! visual groups degrade to empty and `search_query` still returns Spoken + Files
+//! — so plain filename filtering keeps working with zero setup, exactly like the
+//! upstream Files group.
+
+use std::path::PathBuf;
+
+use serde::{Deserialize, Serialize};
+use tauri::{AppHandle, Emitter, State};
+
+use opentake_core::AppCore;
+use opentake_domain::{ClipType, MediaResolver};
+use opentake_media::search::config as search_config;
+use opentake_media::search::config::{RELATIVE_CUTOFF, SEARCH_LIMIT, VISUAL_MATCH_COSINE_FLOOR};
+use opentake_media::MediaEngine;
+
+use crate::media::MediaState;
+
+/// One visual ("Moments") hit projected to the front end. `startSec`/`endSec`
+/// carry the shot's source-second range so the UI can drag it as a trimmed clip.
+/// `frame` is `trunc(start_sec * fps)` (see `moment_dto`). NOTE(review): the
+/// panel passes the *timeline* fps there, not the source's — confirm intended.
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct MomentHitDto {
+    /// Asset id (the clip layer's `media_ref`).
+    pub media_id: String,
+    /// Shot-start converted to a frame index (thumb + preview anchor).
+    pub frame: i64,
+    /// Shot-start in source seconds (drag range lower bound).
+    pub start_sec: f64,
+    /// Shot-end in source seconds (drag range upper bound). Equals `start_sec`
+    /// for stills (zero-length shot).
+    pub end_sec: f64,
+    /// Uncalibrated similarity score (ordering only — upstream note).
+    pub score: f32,
+    /// True for still images (no time range → drag as a plain asset).
+    pub is_image: bool,
+}
+
+/// One spoken ("Spoken") hit: an asset's transcript segment matching every query
+/// term. Keyword hits are unranked upstream; `score` is a fixed `1.0` so the DTO
+/// shape is uniform (ordering within the group follows transcript order).
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct SpokenHitDto {
+    pub media_id: String, // asset id the segment belongs to
+    pub start_sec: f64, // segment start, copied from the spoken hit's `start`
+    pub end_sec: f64, // segment end, copied from the spoken hit's `end`
+    pub text: String, // the matched segment's text
+    pub score: f32, // always `1.0` (set by `spoken_dto`)
+}
+
+/// One filename ("Files") match. `score` is a fixed `1.0` (name matches are
+/// unranked; upstream sorts by the panel's sort mode, default insertion order).
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct FileHitDto {
+    pub media_id: String, // id of the asset whose name contains the query
+    pub score: f32, // always `1.0` (set by `file_matches`)
+}
+
+/// The full three-group query result, mirroring upstream's Moments / Spoken /
+/// Files sections (`MediaTab+Search.swift:12-33`). `Default` is all-empty.
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Default)]
+#[serde(rename_all = "camelCase")]
+pub struct SearchResultsDto {
+    pub moments: Vec<MomentHitDto>, // visual hits; empty without a model/index
+    pub spoken: Vec<SpokenHitDto>, // transcript keyword hits
+    pub files: Vec<FileHitDto>, // filename matches
+}
+
+/// Whether the SigLIP2 model is installed, plus enough to prompt a download.
+/// Mirrors `transcribe.rs`'s `ModelStatusDto`.
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct SearchModelStatusDto {
+    /// True when both encoder files + tokenizer are present on disk.
+    pub installed: bool,
+    /// The manifest's model label (e.g. `"siglip2-base-patch16-256"`).
+    pub model: String,
+    /// Combined download size in bytes: the manifest's image encoder + text
+    /// encoder + tokenizer sizes (`model_bytes`). `0` until the assets are hosted.
+    pub bytes: i64,
+}
+
+/// Visual-index coverage for the project's indexable (video/image) assets.
+/// Drives the panel's indexing affordance (upstream `SearchIndexCoordinator`'s
+/// `batchTotal`/`batchCompleted`, surfaced here as a snapshot the UI polls or
+/// receives via `search://index` events).
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Default)]
+#[serde(rename_all = "camelCase")]
+pub struct SearchIndexStatusDto {
+    /// The model must be installed before anything can be indexed.
+    pub model_installed: bool,
+    /// Count of resolved assets whose kind is video or image (`is_visual`).
+    pub indexable: usize,
+    /// How many of those `needs_index` reports as not needing (re)indexing.
+    pub indexed: usize,
+}
+
+/// Payload of the `search://progress` event emitted during the model download.
+#[derive(Clone, Debug, Serialize, PartialEq)]
+#[serde(rename_all = "camelCase")]
+struct DownloadProgress {
+    fraction: f64, // value forwarded unchanged from the installer's callback
+}
+
+/// Progress payload for the `search://index` event: `completed`/`total` assets
+/// plus the current asset's fraction (mirrors the coordinator's progress ring
+/// math in `MediaTab+IndexStatus.swift`).
+#[derive(Clone, Debug, Serialize, PartialEq)]
+#[serde(rename_all = "camelCase")]
+struct IndexProgress {
+    completed: usize, // assets finished so far in this batch
+    total: usize, // assets that were pending when the batch started
+    fraction: f64, // overall batch progress as computed by the emitter
+}
+
+// MARK: - Pure helpers (testable without the ONNX backend)
+
+/// Total download size in bytes: image encoder + text encoder + tokenizer.
+fn model_bytes() -> i64 {
+    let spec = search_config::manifest();
+    [spec.image_encoder.bytes, spec.text_encoder.bytes, spec.tokenizer.bytes].iter().sum()
+}
+
+/// Map source seconds to a frame index by **truncation** (`Int(s*fps)`, as
+/// upstream's `secondsToFrame`); negatives clamp to 0, `fps <= 0` means 30.
+fn seconds_to_frame(seconds: f64, fps: i32) -> i64 {
+    let rate = f64::from(if fps > 0 { fps } else { 30 });
+    (rate * seconds.max(0.0)) as i64
+}
+
+/// Files group: case-insensitive substring match of the trimmed `query` against
+/// each `(id, name)` entry's name, kept in input order. Blank query → no hits.
+fn file_matches(entries: &[(String, String)], query: &str) -> Vec<FileHitDto> {
+    let needle = query.trim().to_lowercase();
+    if needle.is_empty() {
+        return Vec::new();
+    }
+    let mut hits = Vec::new();
+    for (id, name) in entries {
+        if name.to_lowercase().contains(&needle) {
+            hits.push(FileHitDto {
+                media_id: id.clone(),
+                score: 1.0,
+            });
+        }
+    }
+    hits
+}
+
+/// Copy a spoken (transcript keyword) hit into its DTO with a fixed `1.0` score.
+fn spoken_dto(h: &opentake_media::SpokenHit) -> SpokenHitDto {
+    let (start_sec, end_sec) = (h.start, h.end);
+    SpokenHitDto {
+        media_id: h.asset_id.clone(),
+        start_sec,
+        end_sec,
+        text: h.text.clone(),
+        score: 1.0,
+    }
+}
+
+/// Project a visual rank `Hit` into its Moments DTO. `is_image` is set when the
+/// shot has no positive length (`shot_end <= shot_start`), i.e. a still.
+fn moment_dto(h: &opentake_media::Hit, fps: i32) -> MomentHitDto {
+    let (start_sec, end_sec) = (h.shot_start, h.shot_end);
+    MomentHitDto {
+        media_id: h.asset_id.clone(),
+        frame: seconds_to_frame(start_sec, fps),
+        start_sec,
+        end_sec,
+        score: h.score,
+        is_image: end_sec <= start_sec,
+    }
+}
+
+/// One asset resolved from the live manifest: id, display name, the path that
+/// `MediaResolver::expected_path` returned, and kind (visual vs. spoken candidacy).
+struct ResolvedAsset {
+    id: String,
+    name: String,
+    path: PathBuf,
+    kind: ClipType,
+}
+
+/// Resolve the live manifest's entries into [`ResolvedAsset`]s, in manifest
+/// order. Entries for which `MediaResolver::expected_path` yields no path are
+/// dropped; no existence check is made here, so offline (missing) files are
+/// kept and left for indexing/search to skip at read time (a missing file just
+/// yields no index, matching upstream).
+fn resolve_assets(core: &AppCore) -> Vec<ResolvedAsset> {
+    let manifest = core.media();
+    let project_dir = core.project_dir();
+    let resolver = MediaResolver::new(&manifest, project_dir.as_deref());
+    let mut resolved = Vec::new();
+    for entry in manifest.entries.iter() {
+        if let Some(path) = resolver.expected_path(&entry.id) {
+            resolved.push(ResolvedAsset {
+                id: entry.id.clone(),
+                name: entry.name.clone(),
+                path,
+                kind: entry.kind,
+            });
+        }
+    }
+    resolved
+}
+
+/// Visual candidates are images and videos (upstream `type == .video || .image`).
+fn is_visual(kind: ClipType) -> bool {
+    matches!(kind, ClipType::Image | ClipType::Video)
+}
+
+/// Spoken candidates are audio and video assets (upstream candidate filter in
+/// `scheduleMomentSearch` / `spokenResults`).
+fn is_spoken(kind: ClipType) -> bool {
+    matches!(kind, ClipType::Audio | ClipType::Video)
+}
+
+// MARK: - Commands
+
+/// `search_model_status`: report whether the SigLIP2 ONNX model is installed,
+/// with its label and download size. Read-only — never downloads. The panel
+/// uses it to decide whether to show the "Smart search" download affordance.
+#[tauri::command]
+pub fn search_model_status(media: State<'_, MediaState>) -> SearchModelStatusDto {
+    let manifest = search_config::manifest();
+    let models_dir = media.engine().models_dir();
+    let found = opentake_media::search::model_download::installed(models_dir, &manifest);
+    SearchModelStatusDto {
+        installed: found.is_some(),
+        model: manifest.model.clone(),
+        bytes: model_bytes(),
+    }
+}
+
+/// `download_search_model`: fetch the SigLIP2 ONNX assets via
+/// `search::model_download::install` (which owns idempotence and verification),
+/// forwarding each progress fraction as a `search://progress` event. Async so
+/// the download is awaited off the UI; yields the installed status on success.
+#[tauri::command]
+pub async fn download_search_model(
+    app: AppHandle,
+    media: State<'_, MediaState>,
+) -> Result<SearchModelStatusDto, String> {
+    let manifest = search_config::manifest();
+    let models_dir = media.engine().models_dir().to_path_buf();
+    let source = search_config::MODEL_DOWNLOAD_BASE_URL;
+    let report = |fraction: f64| {
+        let _ = app.emit("search://progress", DownloadProgress { fraction });
+    };
+    opentake_media::search::model_download::install(&models_dir, &manifest, source, report)
+        .await
+        .map_err(|e| e.to_string())?;
+    let status = SearchModelStatusDto {
+        installed: true,
+        model: manifest.model.clone(),
+        bytes: model_bytes(),
+    };
+    Ok(status)
+}
+
+/// `search_index_status`: snapshot how much of the project's indexable (video/
+/// image) media already has a current on-disk embedding index. Never indexes.
+/// The panel uses it to decide whether to offer "index now" and to show the
+/// progress ring's denominator.
+///
+/// Returned fields:
+/// - `modelInstalled` — whether `model_download::installed` finds the model.
+/// - `indexable` — number of resolved manifest assets that are video or image.
+/// - `indexed` — how many of those `needs_index` reports as already current.
+///
+/// Read-only: nothing is downloaded, indexed, or emitted by this command.
+///
+/// Delegates to [`index_status_snapshot`], the helper [`search_index_start`]
+/// also answers with, rather than repeating the same counting code inline, so
+/// the two commands cannot drift apart on what "indexable"/"indexed" mean.
+/// Infallible: a missing model only shows up as `modelInstalled: false`.
+#[tauri::command]
+pub fn search_index_status(
+    core: State<'_, AppCore>,
+    media: State<'_, MediaState>,
+) -> SearchIndexStatusDto {
+    let engine = media.engine();
+    // Resolve the manifest once up front; the snapshot helper only borrows the
+    // resulting asset list and does all filtering/counting itself.
+    let assets = resolve_assets(&core);
+    index_status_snapshot(engine, &assets)
+}
+
+/// `search_index_start`: index every not-yet-current video/image asset in the
+/// project, emitting `search://index` progress as each asset completes, then
+/// return a fresh status snapshot. Idempotent — assets `needs_index` reports as
+/// current are skipped. Errors (with a "download first" message) if the model
+/// isn't installed or the embedder fails to load.
+///
+/// Declared `#[tauri::command(async)]`: Tauri runs plain sync commands on the
+/// main thread, so this long CPU/GPU-bound batch would freeze the UI; the
+/// `async` flag moves the (still synchronous) body onto the async runtime.
+#[tauri::command(async)]
+pub fn search_index_start(
+    app: AppHandle,
+    core: State<'_, AppCore>,
+    media: State<'_, MediaState>,
+) -> Result<SearchIndexStatusDto, String> {
+    let engine = media.engine();
+    let embedder = load_embedder(engine)?;
+    let assets = resolve_assets(&core);
+    index_assets(app, engine, &assets, &embedder)?;
+    // Return a fresh status snapshot so the UI settles on the final counts.
+    Ok(index_status_snapshot(engine, &assets))
+}
+
+/// `search_query`: run the three-group content query — Moments (visual), Spoken
+/// (transcript keyword), Files (name match) — each capped/ordered by its own
+/// helper. A blank query returns empty groups. Visual is best-effort: with no
+/// installed model or nothing indexed, `moments` is empty while Spoken + Files
+/// still return, so filename filtering works with zero setup. Never errors.
+/// `(async)`: Tauri runs plain sync commands on the main thread, and the Moments
+/// pass loads the ONNX embedder per call — keep that off the UI thread.
+#[tauri::command(async)]
+pub fn search_query(
+    core: State<'_, AppCore>,
+    media: State<'_, MediaState>,
+    query: String,
+) -> SearchResultsDto {
+    let trimmed = query.trim().to_string();
+    if trimmed.is_empty() {
+        return SearchResultsDto::default();
+    }
+    let engine = media.engine();
+    let assets = resolve_assets(&core);
+    let fps = core.get_timeline().timeline.fps;
+
+    // Files: name-substring over every asset (the zero-setup fallback).
+    let name_entries: Vec<(String, String)> = assets
+        .iter()
+        .map(|a| (a.id.clone(), a.name.clone()))
+        .collect();
+    let files = file_matches(&name_entries, &trimmed);
+
+    // Spoken: keyword over cached transcripts of video/audio assets.
+    let spoken_candidates: Vec<(String, PathBuf)> = assets
+        .iter()
+        .filter(|a| is_spoken(a.kind))
+        .map(|a| (a.id.clone(), a.path.clone()))
+        .collect();
+    let spoken: Vec<SpokenHitDto> = engine
+        .search_spoken(&trimmed, &spoken_candidates, SEARCH_LIMIT)
+        .iter()
+        .map(spoken_dto)
+        .collect();
+
+    // Moments: visual rank over on-disk embedding indexes (needs the model).
+    let moments = search_visual(engine, &assets, &trimmed, fps);
+
+    SearchResultsDto {
+        moments,
+        spoken,
+        files,
+    }
+}
+
+/// Panel-side visual query: collect the `(id, path)` pairs of the visual
+/// (video/image) assets and rank them through [`visual_hits_by_id`], capped at
+/// the panel default [`SEARCH_LIMIT`].
+fn search_visual(
+    engine: &MediaEngine,
+    assets: &[ResolvedAsset],
+    query: &str,
+    fps: i32,
+) -> Vec<MomentHitDto> {
+    let candidates: Vec<(String, PathBuf)> = assets
+        .iter()
+        .filter_map(|a| is_visual(a.kind).then(|| (a.id.clone(), a.path.clone())))
+        .collect();
+    visual_hits_by_id(engine, &candidates, query, fps, SEARCH_LIMIT)
+}
+
+/// Rank `query` against the on-disk embedding indexes of the given visual assets
+/// (`(id, path)` pairs), returning up to `limit` Moments hits. Empty when no
+/// asset has a loadable index, the model isn't installed, or the query can't be
+/// encoded. Shared by the panel query and the `search_media` MCP bridge so both
+/// rank identically. Indexes are gathered *before* the embedder is built, so a
+/// project with nothing indexed yet never pays for constructing the embedder.
+pub(crate) fn visual_hits_by_id(
+    engine: &MediaEngine,
+    id_paths: &[(String, PathBuf)],
+    query: &str,
+    fps: i32,
+    limit: usize,
+) -> Vec<MomentHitDto> {
+    use opentake_media::search::embed_store;
+
+    // Load each asset's index; assets with no key or no loadable index are
+    // skipped silently.
+    let indexes: Vec<(String, embed_store::AssetIndex)> = id_paths
+        .iter()
+        .filter_map(|(id, path)| {
+            let key = embed_store::key(path)?;
+            let index = embed_store::load(engine.cache_root(), &key).ok()?;
+            Some((id.clone(), index))
+        })
+        .collect();
+    if indexes.is_empty() {
+        return Vec::new();
+    }
+
+    // Only now build the embedder and encode the text query once. Either step
+    // failing yields no visual hits (best-effort, never an error).
+    let Ok(embedder) = load_embedder(engine) else {
+        return Vec::new();
+    };
+    let Ok(vector) = opentake_media::search::Embedder::encode_text(&embedder, query) else {
+        return Vec::new();
+    };
+
+    opentake_media::search_visual_ranked(
+        &vector,
+        &indexes,
+        limit,
+        RELATIVE_CUTOFF,
+        Some(VISUAL_MATCH_COSINE_FLOOR),
+    )
+    .iter()
+    .map(|h| moment_dto(h, fps))
+    .collect()
+}
+
+/// Visual-index coverage for a set of `(id, path)` pairs as `(indexable,
+/// indexed)`: the pair count, and how many `needs_index` reports as current.
+pub(crate) fn visual_coverage(
+    engine: &MediaEngine,
+    id_paths: &[(String, PathBuf)],
+) -> (usize, usize) {
+    let spec = search_config::embedder_spec();
+    let mut indexed = 0;
+    for (_, path) in id_paths {
+        if !opentake_media::search::needs_index(engine.cache_root(), path, &spec) {
+            indexed += 1;
+        }
+    }
+    (id_paths.len(), indexed)
+}
+
+/// Whether the SigLIP2 model is installed under the engine's models directory.
+pub(crate) fn model_installed(engine: &MediaEngine) -> bool {
+    let models_dir = engine.models_dir();
+    opentake_media::search::model_download::installed(models_dir, &search_config::manifest())
+        .is_some()
+}
+
+// MARK: - ort-backed indexing internals
+
+/// Build the SigLIP2 embedder from the installed model files. Fails with a
+/// "model not installed" message (which the UI turns into a download prompt)
+/// or with the embedder's own construction error rendered as a string.
+fn load_embedder(engine: &MediaEngine) -> Result<opentake_media::search::OrtEmbedder, String> {
+    let manifest = search_config::manifest();
+    let Some(installed) =
+        opentake_media::search::model_download::installed(engine.models_dir(), &manifest)
+    else {
+        return Err(format!(
+            "visual search model not installed — download '{}' first",
+            manifest.model
+        ));
+    };
+    let tokenizer_json = installed.tokenizer_folder.join("tokenizer.json");
+    let built = opentake_media::search::OrtEmbedder::new(
+        &installed.image_encoder,
+        &installed.text_encoder,
+        &tokenizer_json,
+        installed.spec,
+    );
+    built.map_err(|e| e.to_string())
+}
+
+/// Index every not-yet-current video/image asset sequentially (one at a time, in
+/// `assets` order, mirroring `SearchIndexCoordinator.swift:139-160`), emitting a
+/// `search://index` event as each completes. Per-asset failures are logged and
+/// skipped, so this currently always returns `Ok(())`. NOTE(review): this blocks
+/// its calling thread for the whole batch — confirm the command runs off the UI.
+fn index_assets(
+    app: AppHandle,
+    engine: &MediaEngine,
+    assets: &[ResolvedAsset],
+    embedder: &opentake_media::search::OrtEmbedder,
+) -> Result<(), String> {
+    use opentake_media::search::Embedder;
+    use opentake_media::search::{
+        index_image, index_video, needs_index, CancelToken, SamplerOptions,
+    };
+
+    let spec = Embedder::spec(embedder).clone();
+    let cache_root = engine.cache_root();
+    let cancel = CancelToken::new();
+    let opts = SamplerOptions::default();
+
+    // Visual assets that `needs_index` flags as stale/missing, in input order.
+    let pending: Vec<&ResolvedAsset> = assets
+        .iter()
+        .filter(|a| is_visual(a.kind) && needs_index(cache_root, &a.path, &spec))
+        .collect();
+    let total = pending.len();
+    if total == 0 {
+        let _ = app.emit(
+            "search://index",
+            IndexProgress {
+                completed: 0,
+                total: 0,
+                fraction: 1.0,
+            },
+        );
+        return Ok(());
+    }
+
+    for (i, a) in pending.iter().enumerate() {
+        // Per-asset progress: map the sampler's 0..1 fraction into the batch.
+        let base = i;
+        let on_progress = |frac: f64| {
+            let _ = app.emit(
+                "search://index",
+                IndexProgress {
+                    completed: base,
+                    total,
+                    fraction: (base as f64 + frac.clamp(0.0, 1.0)) / total as f64,
+                },
+            );
+        };
+
+        // A per-asset failure (offline file, decode error) is skipped — one bad
+        // clip must not abort the batch (upstream `failedIds.insert` + continue).
+        let result = match a.kind {
+            ClipType::Image => match engine.image_thumbnail(&a.path) {
+                // Reuse the decoded thumbnail as the still's frame; a full-res
+                // decode is unnecessary for a single squash-resized embedding.
+                Ok(frame) => index_image(cache_root, &a.path, &frame, embedder, &cancel),
+                Err(e) => Err(e),
+            },
+            ClipType::Video => {
+                // Probe the source for duration/dimensions (missing width/height
+                // become 0); a failed probe skips the asset but still ticks.
+                let (duration, width, height) = match engine.probe(&a.path) {
+                    Ok(p) => (p.duration_secs, p.width.unwrap_or(0), p.height.unwrap_or(0)),
+                    Err(e) => {
+                        eprintln!("[search] probe failed {}: {e}", a.path.display());
+                        emit_completed(&app, i + 1, total);
+                        continue;
+                    }
+                };
+                index_video(
+                    cache_root,
+                    &a.path,
+                    duration,
+                    width,
+                    height,
+                    embedder,
+                    &opts,
+                    &cancel,
+                    Some(&on_progress),
+                )
+            }
+            _ => Ok(()),
+        };
+        if let Err(e) = result {
+            eprintln!("[search] index failed {}: {e}", a.path.display());
+        }
+        emit_completed(&app, i + 1, total);
+    }
+    Ok(())
+}
+
+/// Emit one settled `search://index` tick for the batch: `completed` of `total`
+/// assets done, with `fraction = completed / total`. An empty batch
+/// (`total == 0`) reports `1.0` instead of dividing by zero. A failed emit is
+/// ignored (progress is best-effort).
+fn emit_completed(app: &AppHandle, completed: usize, total: usize) {
+    let fraction = match total {
+        0 => 1.0,
+        n => completed as f64 / n as f64,
+    };
+    let payload = IndexProgress {
+        completed,
+        total,
+        fraction,
+    };
+    let _ = app.emit("search://index", payload);
+}
+
+/// Snapshot the index status: model presence plus indexable/indexed counts.
+fn index_status_snapshot(engine: &MediaEngine, assets: &[ResolvedAsset]) -> SearchIndexStatusDto {
+    let manifest = search_config::manifest();
+    let model_installed =
+        opentake_media::search::model_download::installed(engine.models_dir(), &manifest).is_some();
+    let spec = search_config::embedder_spec();
+    let visual = || assets.iter().filter(|a| is_visual(a.kind));
+    let indexable = visual().count();
+    let stale = visual()
+        .filter(|a| opentake_media::search::needs_index(engine.cache_root(), &a.path, &spec))
+        .count();
+    SearchIndexStatusDto {
+        model_installed,
+        indexable,
+        indexed: indexable - stale,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // --- pure helpers + DTO projection (no ort, no ffmpeg) ---
+
+    #[test]
+    fn seconds_to_frame_truncates_like_upstream() {
+        // Int(s*fps) truncation, not rounding: 1.99s @ 30fps → 59, not 60.
+        assert_eq!(seconds_to_frame(1.99, 30), 59);
+        assert_eq!(seconds_to_frame(2.0, 30), 60);
+        assert_eq!(seconds_to_frame(0.0, 30), 0);
+        // Non-positive fps falls back to 30.
+        assert_eq!(seconds_to_frame(1.0, 0), 30);
+        assert_eq!(seconds_to_frame(-5.0, 30), 0); // negative clamps to 0
+    }
+
+    #[test]
+    fn file_matches_is_case_insensitive_substring_in_order() {
+        let entries = vec![
+            ("a".into(), "Sunset Beach.mp4".into()),
+            ("b".into(), "harbor.mov".into()),
+            ("c".into(), "SUNSET timelapse.mp4".into()),
+        ];
+        let hits = file_matches(&entries, "sunset");
+        // Both "Sunset" assets match, in manifest order; "harbor" doesn't.
+        assert_eq!(hits.len(), 2);
+        assert_eq!(hits[0].media_id, "a");
+        assert_eq!(hits[1].media_id, "c");
+        assert!(hits.iter().all(|h| h.score == 1.0));
+    }
+
+    #[test]
+    fn file_matches_empty_query_is_empty() {
+        let entries = vec![("a".into(), "x.mp4".into())];
+        assert!(file_matches(&entries, "").is_empty());
+        assert!(file_matches(&entries, "   ").is_empty());
+    }
+
+    #[test]
+    fn spoken_dto_projects_fields_with_fixed_score() {
+        let h = opentake_media::SpokenHit {
+            asset_id: "a1".into(),
+            start: 1.5,
+            end: 2.5,
+            text: "the budget plan".into(),
+        };
+        let dto = spoken_dto(&h);
+        assert_eq!(dto.media_id, "a1");
+        assert_eq!(dto.start_sec, 1.5);
+        assert_eq!(dto.end_sec, 2.5);
+        assert_eq!(dto.text, "the budget plan");
+        assert_eq!(dto.score, 1.0);
+    }
+
+    #[test]
+    fn moment_dto_marks_zero_length_shot_as_image() {
+        // A still: shot_start == shot_end → is_image true, no meaningful range.
+        let still = opentake_media::Hit {
+            asset_id: "img".into(),
+            time: 0.0,
+            shot_start: 0.0,
+            shot_end: 0.0,
+            score: 0.9,
+        };
+        let d = moment_dto(&still, 30);
+        assert!(d.is_image);
+        assert_eq!(d.frame, 0);
+
+        // A video shot: range present, frame = trunc(shot_start*fps).
+        let vid = opentake_media::Hit {
+            asset_id: "vid".into(),
+            time: 3.2,
+            shot_start: 3.0,
+            shot_end: 6.0,
+            score: 0.8,
+        };
+        let d = moment_dto(&vid, 30);
+        assert!(!d.is_image);
+        assert_eq!(d.frame, 90);
+        assert_eq!(d.start_sec, 3.0);
+        assert_eq!(d.end_sec, 6.0);
+    }
+
+    #[test]
+    fn is_visual_and_is_spoken_partition_kinds_like_upstream() {
+        assert!(is_visual(ClipType::Video));
+        assert!(is_visual(ClipType::Image));
+        assert!(!is_visual(ClipType::Audio));
+        assert!(is_spoken(ClipType::Video));
+        assert!(is_spoken(ClipType::Audio));
+        assert!(!is_spoken(ClipType::Image)); // images have nothing spoken
+    }
+
+    // --- DTO serde round-trips (camelCase wire contract) ---
+
+    #[test]
+    fn moment_hit_dto_is_camel_case_and_round_trips() {
+        let dto = MomentHitDto {
+            media_id: "m1".into(),
+            frame: 90,
+            start_sec: 3.0,
+            end_sec: 6.0,
+            score: 0.8,
+            is_image: false,
+        };
+        let json = serde_json::to_string(&dto).unwrap();
+        assert!(json.contains("\"mediaId\":\"m1\""));
+        assert!(json.contains("\"startSec\":3.0"));
+        assert!(json.contains("\"endSec\":6.0"));
+        assert!(json.contains("\"isImage\":false"));
+        let back: MomentHitDto = serde_json::from_str(&json).unwrap();
+        assert_eq!(dto, back);
+    }
+
+    #[test]
+    fn spoken_hit_dto_camel_case_round_trips() {
+        let dto = SpokenHitDto {
+            media_id: "m1".into(),
+            start_sec: 1.0,
+            end_sec: 2.0,
+            text: "hello".into(),
+            score: 1.0,
+        };
+        let json = serde_json::to_string(&dto).unwrap();
+        assert!(json.contains("\"mediaId\":\"m1\""));
+        assert!(json.contains("\"startSec\":1.0"));
+        let back: SpokenHitDto = serde_json::from_str(&json).unwrap();
+        assert_eq!(dto, back);
+    }
+
+    #[test]
+    fn search_results_dto_round_trips_all_groups() {
+        let dto = SearchResultsDto {
+            moments: vec![MomentHitDto {
+                media_id: "v".into(),
+                frame: 0,
+                start_sec: 0.0,
+                end_sec: 1.0,
+                score: 0.7,
+                is_image: false,
+            }],
+            spoken: vec![SpokenHitDto {
+                media_id: "a".into(),
+                start_sec: 0.0,
+                end_sec: 1.0,
+                text: "x".into(),
+                score: 1.0,
+            }],
+            files: vec![FileHitDto {
+                media_id: "f".into(),
+                score: 1.0,
+            }],
+        };
+        let json = serde_json::to_string(&dto).unwrap();
+        assert!(json.contains("\"moments\":"));
+        assert!(json.contains("\"spoken\":"));
+        assert!(json.contains("\"files\":"));
+        let back: SearchResultsDto = serde_json::from_str(&json).unwrap();
+        assert_eq!(dto, back);
+    }
+
+    #[test]
+    fn search_model_status_dto_camel_case() {
+        let dto = SearchModelStatusDto {
+            installed: false,
+            model: "siglip2-base-patch16-256".into(),
+            bytes: 0,
+        };
+        let json = serde_json::to_string(&dto).unwrap();
+        assert!(json.contains("\"installed\":false"));
+        assert!(json.contains("\"model\":\"siglip2-base-patch16-256\""));
+        let back: SearchModelStatusDto = serde_json::from_str(&json).unwrap();
+        assert_eq!(dto, back);
+    }
+
+    #[test]
+    fn search_index_status_dto_camel_case_round_trips() {
+        let dto = SearchIndexStatusDto {
+            model_installed: true,
+            indexable: 5,
+            indexed: 2,
+        };
+        let json = serde_json::to_string(&dto).unwrap();
+        assert!(json.contains("\"modelInstalled\":true"));
+        assert!(json.contains("\"indexable\":5"));
+        assert!(json.contains("\"indexed\":2"));
+        let back: SearchIndexStatusDto = serde_json::from_str(&json).unwrap();
+        assert_eq!(dto, back);
+    }
+
+    #[test]
+    fn download_and_index_progress_are_camel_case() {
+        let d = DownloadProgress { fraction: 0.5 };
+        assert_eq!(serde_json::to_string(&d).unwrap(), "{\"fraction\":0.5}");
+        let ip = IndexProgress {
+            completed: 1,
+            total: 4,
+            fraction: 0.25,
+        };
+        let json = serde_json::to_string(&ip).unwrap();
+        assert!(json.contains("\"completed\":1"));
+        assert!(json.contains("\"total\":4"));
+        assert!(json.contains("\"fraction\":0.25"));
+    }
+}
diff --git a/web/src/components/media/MediaPanel.tsx b/web/src/components/media/MediaPanel.tsx
index 398fd8a..1d11a04 100644
--- a/web/src/components/media/MediaPanel.tsx
+++ b/web/src/components/media/MediaPanel.tsx
@@ -47,6 +47,7 @@ import { saveDialog } from "../../lib/dialog";
 import type { MediaFolder, MediaItem } from "../../lib/types";
 import { MediaTabBar, MediaSubTabBar } from "./MediaTabBar";
 import { CaptionsTab } from "./CaptionsTab";
+import { MediaSearchResults } from "./MediaSearch";
 import { useFavoritesStore, useIsFavorite } from "./favorites";
 
 /** MIME-ish type used on dataTransfer when dragging a media item to the timeline. */
@@ -292,7 +293,17 @@ function MediaTab({ kind }: { kind: MediaTabKind }) {
         )}
       </div>
 
-      {isEmpty ? (
+      {query !== "" ? (
+        // Smart search: three result groups (Moments / Spoken / Files) + the
+        // index-status affordance. `filteredItems` is the name-matched Files group
+        // (already scoped to the current main/subtab). Moments/Spoken come from
+        // the backend query; they degrade to empty with no model, leaving Files.
+        <MediaSearchResults
+          query={query}
+          nameMatches={filteredItems}
+          hasIndexableAssets={items.some((i) => i.type === "video" || i.type === "image")}
+        />
+      ) : isEmpty ? (
         <EmptyState subTab={subTab} insideFolder={browsing && folderId !== null} />
       ) : (
         <MediaGrid
diff --git a/web/src/components/media/MediaSearch.tsx b/web/src/components/media/MediaSearch.tsx
new file mode 100644
index 0000000..546af08
--- /dev/null
+++ b/web/src/components/media/MediaSearch.tsx
@@ -0,0 +1,602 @@
+/**
+ * Smart media search (1:1 with upstream `MediaTab+Search.swift` +
+ * `MediaTab+IndexStatus.swift`). Upgrades the media panel's plain filename filter
+ * into three result groups that rank independently and are never blended:
+ *
+ *  - **Moments** — visual (SigLIP2 semantic) hits with a frame thumbnail,
+ *    DRAGGABLE onto the timeline as a trimmed source-range clip.
+ *  - **Spoken** — transcript keyword hits with a thumbnail + timecode, also
+ *    draggable as a trimmed range.
+ *  - **Files** — filename matches (the pre-existing behavior), the zero-setup
+ *    fallback that works with no model.
+ *
+ * When the visual index is unavailable, an index-status affordance appears
+ * (download the on-device model → build the index → progress ring), mirroring
+ * upstream's `searchIndexStatus`. Moments/Spoken degrade gracefully to empty
+ * while Files keeps working, so plain name filtering never needs setup.
+ *
+ * The visual/spoken groups come from the Rust `search_query` command (best-effort
+ * — empty outside Tauri / without a model); Files reuses the caller's already
+ * name-filtered item list so it stays instant and offline.
+ */
+
+import { useCallback, useEffect, useRef, useState } from "react";
+import { Sparkles, AlertTriangle, Mic, Film, FileText } from "lucide-react";
+import { Icon } from "../ui/Icon";
+import { useT } from "../../i18n";
+import { formatTimecode } from "../../lib/geometry";
+import { assetUrl } from "../../lib/asset";
+import { setDraggingMedia } from "../../lib/mediaDragState";
+import { setDraggingMomentRange } from "../../lib/momentDragState";
+import { MEDIA_DND_TYPE } from "./MediaPanel";
+import { useMediaStore } from "../../store/mediaStore";
+import { useProjectStore } from "../../store/projectStore";
+import { useEditorUiStore } from "../../store/uiStore";
+import { addMediaToTimeline } from "../../store/editActions";
+import {
+  generateThumbnail,
+  searchIndexStatus,
+  searchIndexStart,
+  downloadSearchModel,
+  onSearchModelProgress,
+  onSearchIndexProgress,
+  searchQuery as searchQueryApi,
+} from "../../lib/api";
+import type { MediaItem, MomentHit, SpokenHit, SearchResults } from "../../lib/types";
+
+/** Debounce before firing the backend visual/spoken query (upstream 250ms). */
+const SEARCH_DEBOUNCE_MS = 250;
+/** Combined SigLIP2 model size shown before download when the manifest is a
+ *  placeholder (bytes 0). ~380 MB is the two fp32 ONNX encoders + tokenizer. */
+const FALLBACK_MODEL_MB = 380;
+
+function formatModelBytes(bytes: number): string {
+  // A non-positive size means a placeholder manifest, so show the fallback estimate.
+  return Math.round(bytes > 0 ? bytes / (1024 * 1024) : FALLBACK_MODEL_MB) + " MB";
+}
+
+/** The visual-index lifecycle the affordance renders. */
+type IndexPhase =
+  | { kind: "hidden" } // nothing to show: the affordance renders null
+  | { kind: "needsModel" } // model not installed and there are indexable assets → offer download
+  | { kind: "downloading"; fraction: number } // model download in flight; fraction drives the ring
+  | { kind: "readyToIndex" } // model installed, some indexable assets not yet indexed
+  | { kind: "indexing"; done: number; total: number; fraction: number } // index build in flight
+  | { kind: "failed" }; // model download rejected → offer a retry
+
+/**
+ * The full search view: index-status affordance + the three result groups. `query` drives the
+ * debounced Moments/Spoken lookup; `nameMatches` (already name-filtered) is the Files group.
+ */
+export function MediaSearchResults({
+  query,
+  nameMatches,
+  hasIndexableAssets,
+}: {
+  query: string;
+  nameMatches: MediaItem[];
+  hasIndexableAssets: boolean;
+}) {
+  const t = useT();
+  const [results, setResults] = useState<SearchResults>({ moments: [], spoken: [], files: [] });
+  const phase = useSearchIndexPhase(hasIndexableAssets);
+
+  // Debounced backend query for Moments + Spoken. Files come from `nameMatches`.
+  const reqId = useRef(0);
+  useEffect(() => {
+    // Take a fresh id first so every query change, even to blank, retires in-flight requests.
+    const id = ++reqId.current;
+    const q = query.trim();
+    if (q === "") {
+      setResults({ moments: [], spoken: [], files: [] });
+      return;
+    }
+    // Only the newest request may publish; a failed query degrades to empty groups.
+    const publish = (r: SearchResults) => { if (id === reqId.current) setResults(r); };
+    const handle = window.setTimeout(() => {
+      void searchQueryApi(q).then(publish, () => publish({ moments: [], spoken: [], files: [] }));
+    }, SEARCH_DEBOUNCE_MS);
+    return () => window.clearTimeout(handle);
+  }, [query]);
+
+  const { moments, spoken } = results;
+  const isEmpty = moments.length === 0 && spoken.length === 0 && nameMatches.length === 0;
+
+  return (
+    <div style={{ flex: 1, overflowY: "auto", display: "flex", flexDirection: "column" }}>
+      <SearchIndexAffordance phase={phase} />
+
+      {moments.length > 0 && (
+        <Group icon={Film} label={t("search.group.moments")} count={moments.length}>
+          <ResultsGrid>
+            {moments.map((hit, i) => (
+              <MomentCard key={`${hit.mediaId}:${hit.frame}:${i}`} hit={hit} />
+            ))}
+          </ResultsGrid>
+        </Group>
+      )}
+
+      {spoken.length > 0 && (
+        <Group icon={Mic} label={t("search.group.spoken")} count={spoken.length}>
+          <div style={{ display: "flex", flexDirection: "column", gap: "var(--space-xs)", padding: "0 var(--space-sm) var(--space-sm)" }}>
+            {spoken.map((hit, i) => (
+              <SpokenRow key={`${hit.mediaId}:${hit.startSec}:${i}`} hit={hit} />
+            ))}
+          </div>
+        </Group>
+      )}
+
+      {nameMatches.length > 0 && (
+        <Group icon={FileText} label={t("search.group.files")} count={nameMatches.length}>
+          <ResultsGrid>
+            {nameMatches.map((item) => (
+              <FileCard key={item.id} item={item} />
+            ))}
+          </ResultsGrid>
+        </Group>
+      )}
+
+      {isEmpty && (
+        <div
+          style={{
+            flex: 1,
+            display: "flex",
+            alignItems: "center",
+            justifyContent: "center",
+            color: "var(--text-tertiary)",
+            fontSize: "var(--fs-sm)",
+            padding: "var(--space-xl)",
+            textAlign: "center",
+          }}
+        >
+          {t("search.noMatches", { query: query.trim() })}
+        </div>
+      )}
+    </div>
+  );
+}
+
+/** Poll model + index status and subscribe to progress, deriving the affordance
+ *  phase. Mirrors upstream `searchIndexStatus`'s state machine. */
+function useSearchIndexPhase(hasIndexableAssets: boolean): IndexPhase {
+  const [phase, setPhase] = useState<IndexPhase>({ kind: "hidden" });
+  const mediaCount = useMediaStore((s) => s.items.length);
+
+  // Re-evaluate whenever the library changes (new assets → maybe need indexing).
+  const refresh = useCallback(async () => {
+    const status = await searchIndexStatus();
+    if (!status.modelInstalled) {
+      setPhase(hasIndexableAssets ? { kind: "needsModel" } : { kind: "hidden" });
+      return;
+    }
+    if (status.indexable > 0 && status.indexed < status.indexable) {
+      setPhase({ kind: "readyToIndex" });
+    } else {
+      setPhase({ kind: "hidden" });
+    }
+  }, [hasIndexableAssets]);
+
+  useEffect(() => {
+    let cancelled = false;
+    void refresh().catch(() => {
+      if (!cancelled) setPhase({ kind: "hidden" });
+    });
+    return () => {
+      cancelled = true;
+    };
+  }, [refresh, mediaCount]);
+
+  // Live download + indexing progress events keep the ring moving.
+  useEffect(() => {
+    // Unlisten functions arrive asynchronously: if cleanup already ran, unsubscribe at once.
+    let disposed = false;
+    const offs: Array<() => void> = [];
+    const keep = (off: () => void) => { if (disposed) off(); else offs.push(off); };
+    const settle = () => { void refresh().catch(() => setPhase({ kind: "hidden" })); };
+    void onSearchModelProgress((fraction) => {
+      setPhase((p) =>
+        p.kind === "downloading" || p.kind === "needsModel" ? { kind: "downloading", fraction } : p,
+      );
+    }).then(keep);
+    void onSearchIndexProgress(({ completed, total, fraction }) => {
+      if (total === 0) return settle();
+      setPhase({ kind: "indexing", done: completed, total, fraction });
+      // On the final tick, settle back to the resting state.
+      if (completed >= total) settle();
+    }).then(keep);
+    return () => {
+      disposed = true;
+      offs.forEach((off) => off());
+    };
+  }, [refresh]);
+
+  // Expose setters through a module ref so the button handlers can drive it.
+  phaseSetterRef.current = setPhase;
+  return phase;
+}
+
+/** Module-level handle to the hook's `setPhase`, so the affordance buttons can flip the phase
+ *  optimistically without prop threading. Single slot: the last hook render overwrites it. */
+const phaseSetterRef: { current: ((p: IndexPhase) => void) | null } = { current: null };
+
+/** The index-status bar (upstream `MediaTab+IndexStatus.swift`). Renders nothing when hidden; a
+ *  button for needsModel (download), readyToIndex (build index) and failed (retry download); a
+ *  progress ring + label while downloading / indexing. The size hint always uses the fallback MB. */
+function SearchIndexAffordance({ phase }: { phase: IndexPhase }) {
+  const t = useT();
+
+  const onDownload = useCallback(() => { // optimistic 0% → readyToIndex on success, failed on error
+    phaseSetterRef.current?.({ kind: "downloading", fraction: 0 });
+    void downloadSearchModel()
+      .then(() => phaseSetterRef.current?.({ kind: "readyToIndex" }))
+      .catch(() => phaseSetterRef.current?.({ kind: "failed" }));
+  }, []);
+
+  const onIndex = useCallback(() => { // optimistic indexing → hidden on either outcome
+    phaseSetterRef.current?.({ kind: "indexing", done: 0, total: 1, fraction: 0 });
+    void searchIndexStart()
+      .then(() => phaseSetterRef.current?.({ kind: "hidden" }))
+      .catch(() => phaseSetterRef.current?.({ kind: "hidden" }));
+  }, []);
+
+  if (phase.kind === "hidden") return null;
+
+  const barStyle: React.CSSProperties = {
+    display: "flex",
+    alignItems: "center",
+    gap: "var(--space-xs)",
+    padding: "var(--space-xs) var(--space-sm)",
+    margin: "var(--space-xs) var(--space-sm) 0",
+    borderRadius: "var(--radius-sm)",
+    background: "var(--bg-raised)",
+    border: "var(--bw-thin) solid var(--border-subtle)",
+    fontSize: "var(--fs-xs)",
+    color: "var(--text-secondary)",
+  };
+
+  if (phase.kind === "needsModel") {
+    return (
+      <button
+        type="button"
+        onClick={onDownload}
+        title={t("search.smartSearchHint", { size: formatModelBytes(0) })}
+        style={{ ...barStyle, cursor: "pointer", textAlign: "left" }}
+      >
+        <Icon icon={Sparkles} size={13} />
+        <span style={{ fontWeight: "var(--fw-medium)" }}>{t("search.smartSearch")}</span>
+      </button>
+    );
+  }
+  if (phase.kind === "readyToIndex") {
+    return (
+      <button
+        type="button"
+        onClick={onIndex}
+        title={t("search.indexHint")}
+        style={{ ...barStyle, cursor: "pointer", textAlign: "left" }}
+      >
+        <Icon icon={Sparkles} size={13} />
+        <span style={{ fontWeight: "var(--fw-medium)" }}>{t("search.index")}</span>
+      </button>
+    );
+  }
+  if (phase.kind === "failed") {
+    return (
+      <button
+        type="button"
+        onClick={onDownload}
+        title={t("search.retryHint")}
+        style={{ ...barStyle, cursor: "pointer", textAlign: "left", color: "var(--status-error)" }}
+      >
+        <Icon icon={AlertTriangle} size={13} />
+        <span style={{ fontWeight: "var(--fw-medium)" }}>{t("search.retry")}</span>
+      </button>
+    );
+  }
+  // downloading | indexing → progress ring + label (indexing counts from 1: done + 1, clamped to total).
+  const fraction = phase.fraction;
+  const label =
+    phase.kind === "downloading"
+      ? t("search.downloading", { percent: Math.round(phase.fraction * 100) })
+      : t("search.indexing", { done: Math.min(phase.done + 1, phase.total), total: phase.total });
+  return (
+    <div style={barStyle} title={phase.kind === "downloading" ? t("search.downloadingHint") : t("search.indexingHint")}>
+      <ProgressRing value={fraction} />
+      <span style={{ color: "var(--text-tertiary)" }}>{label}</span>
+    </div>
+  );
+}
+
+/** Tiny SVG ring whose arc length tracks `value` (upstream `progressRing`). */
+function ProgressRing({ value }: { value: number }) {
+  // Clamp to [0.03, 1] so the arc never has zero length.
+  const filled = Math.max(0.03, Math.min(1, value));
+  const box = 14;
+  const thickness = 2;
+  const center = box / 2;
+  const radius = (box - thickness) / 2;
+  const circumference = 2 * Math.PI * radius;
+  // Geometry shared by the background track and the progress arc.
+  const ring = { cx: center, cy: center, r: radius, fill: "none", strokeWidth: thickness };
+  return (
+    <svg width={box} height={box} viewBox={`0 0 ${box} ${box}`} style={{ flex: "0 0 auto" }}>
+      <circle {...ring} stroke="var(--border-subtle)" />
+      <circle
+        {...ring}
+        stroke="var(--text-secondary)"
+        strokeLinecap="round"
+        strokeDasharray={circumference}
+        strokeDashoffset={circumference * (1 - filled)}
+        transform={`rotate(-90 ${center} ${center})`}
+      />
+    </svg>
+  );
+}
+
+/** One result group: a header row (icon, label, hit count) above its body
+ *  (upstream `momentHeader`). */
+function Group(props: {
+  icon: typeof Film;
+  label: string;
+  count: number;
+  children: React.ReactNode;
+}) {
+  const { icon, label, count, children } = props;
+  // Header row layout: icon, label and count on one vertically centered line.
+  const headerStyle: React.CSSProperties = {
+    display: "flex",
+    alignItems: "center",
+    gap: "var(--space-xs)",
+    padding: "var(--space-sm) var(--space-md)",
+    color: "var(--text-secondary)",
+  };
+  // The label and the count share one font size.
+  const textSize = "var(--fs-xs)";
+
+  return (
+    <div style={{ display: "flex", flexDirection: "column" }}>
+      <div style={headerStyle}>
+        <Icon icon={icon} size={12} />
+        <span style={{ fontSize: textSize, fontWeight: "var(--fw-semibold)" }}>{label}</span>
+        <span className="tabular" style={{ fontSize: textSize, color: "var(--text-tertiary)" }}>
+          {count}
+        </span>
+      </div>
+      {children}
+    </div>
+  );
+}
+
+/** Auto-filling card grid shared by the Moments and Files groups
+ *  (upstream `resultsGrid`). */
+function ResultsGrid(props: { children: React.ReactNode }) {
+  // As many columns as fit, each at least 112px wide.
+  const gridStyle: React.CSSProperties = {
+    display: "grid",
+    gridTemplateColumns: "repeat(auto-fill, minmax(112px, 1fr))",
+    gap: "var(--space-sm)",
+    padding: "0 var(--space-sm) var(--space-md)",
+  };
+
+  return (
+    <div style={gridStyle}>{props.children}</div>
+  );
+}
+
+/** Frame thumbnail for a search hit, generated asynchronously at `timeSec` seconds
+ *  into the source; shows a film icon while no thumbnail path is available. */
+function HitThumbnail({ mediaId, timeSec, alt }: { mediaId: string; timeSec: number; alt: string }) {
+  const [path, setPath] = useState<string | null>(null);
+  useEffect(() => {
+    // `live` goes false on cleanup so a late result is dropped.
+    let live = true;
+    void (async () => {
+      const result = await generateThumbnail(mediaId, { timeSecs: timeSec, includeSprite: false });
+      if (live) setPath(result?.thumbnailPath ?? null);
+    })();
+    return () => {
+      live = false;
+    };
+  }, [mediaId, timeSec]);
+
+  const frameStyle: React.CSSProperties = {
+    position: "relative",
+    aspectRatio: "16 / 9",
+    background: "var(--bg-placeholder)",
+    borderRadius: "var(--radius-sm)",
+    overflow: "hidden",
+    display: "flex",
+    alignItems: "center",
+    justifyContent: "center",
+  };
+  const src = assetUrl(path);
+  const content = src ? (
+    <img src={src} alt={alt} draggable={false} style={{ width: "100%", height: "100%", objectFit: "cover" }} />
+  ) : (
+    <Icon icon={Film} size={18} strokeWidth={1.5} />
+  );
+  return <div style={frameStyle}>{content}</div>;
+}
+
+/** Subscribe to the media-store item with this id; `undefined` when no item matches. */
+function useMediaItem(mediaId: string): MediaItem | undefined {
+  return useMediaStore(({ items }) => items.find((candidate) => candidate.id === mediaId));
+}
+
+/** "Moments" result card: a frame thumbnail, the asset name and (for video) the hit's
+ *  timecode range. Dragging it stages a trimmed source range (upstream `momentCard`). */
+function MomentCard({ hit }: { hit: MomentHit }) {
+  const t = useT();
+  const item = useMediaItem(hit.mediaId);
+  const fps = useProjectStore((s) => s.timeline.fps);
+  const setPreviewMedia = useEditorUiStore((s) => s.setPreviewMedia);
+  // All hooks run above; a hit whose asset is not in the store renders nothing.
+  if (!item) return null;
+
+  const { startSec, endSec, isImage } = hit;
+  const beginDrag = (e: React.DragEvent) => {
+    e.dataTransfer.setData(MEDIA_DND_TYPE, item.id);
+    e.dataTransfer.effectAllowed = "copy";
+    setDraggingMedia(item);
+    // Stills drag as the whole asset (no meaningful range).
+    setDraggingMomentRange(isImage ? null : { startSec, endSec });
+  };
+  const finishDrag = () => {
+    setDraggingMedia(null);
+    setDraggingMomentRange(null);
+  };
+  // Source seconds are rounded to project-fps frames before formatting.
+  const toTimecode = (sec: number) => formatTimecode(Math.round(sec * fps), fps);
+
+  return (
+    <div
+      draggable
+      onDragStart={beginDrag}
+      onDragEnd={finishDrag}
+      onClick={() => setPreviewMedia(item.id)}
+      title={t("search.dragToTimeline")}
+      style={{ display: "flex", flexDirection: "column", gap: 3, cursor: "grab" }}
+    >
+      <HitThumbnail mediaId={hit.mediaId} timeSec={startSec} alt={item.name} />
+      <span
+        style={{
+          fontSize: "var(--fs-xs)",
+          color: "var(--text-secondary)",
+          overflow: "hidden",
+          textOverflow: "ellipsis",
+          whiteSpace: "nowrap",
+        }}
+      >
+        {item.name}
+      </span>
+      {!isImage && (
+        <span className="tabular" style={{ fontSize: "var(--fs-micro)", color: "var(--text-tertiary)" }}>
+          {toTimecode(startSec)}–{toTimecode(endSec)}
+        </span>
+      )}
+    </div>
+  );
+}
+
+/** "Spoken" transcript hit row: a thumbnail beside the matched text and an
+ *  `asset name · start timecode` line. Drags as a trimmed range (upstream `spokenRow`). */
+function SpokenRow({ hit }: { hit: SpokenHit }) {
+  const t = useT();
+  const item = useMediaItem(hit.mediaId);
+  const fps = useProjectStore((s) => s.timeline.fps);
+  const setPreviewMedia = useEditorUiStore((s) => s.setPreviewMedia);
+  if (!item) return null;
+  const { startSec, endSec } = hit;
+  const beginDrag = (e: React.DragEvent) => {
+    e.dataTransfer.setData(MEDIA_DND_TYPE, item.id);
+    e.dataTransfer.effectAllowed = "copy";
+    setDraggingMedia(item);
+    setDraggingMomentRange({ startSec, endSec });
+  };
+  const finishDrag = () => {
+    setDraggingMedia(null);
+    setDraggingMomentRange(null);
+  };
+  const startTimecode = formatTimecode(Math.round(startSec * fps), fps);
+  return (
+    <div
+      draggable
+      onDragStart={beginDrag}
+      onDragEnd={finishDrag}
+      onClick={() => setPreviewMedia(item.id)}
+      title={t("search.dragToTimeline")}
+      style={{
+        display: "flex",
+        gap: "var(--space-sm)",
+        cursor: "grab",
+        alignItems: "flex-start",
+      }}
+    >
+      <div style={{ width: 96, flex: "0 0 auto" }}>
+        <HitThumbnail mediaId={hit.mediaId} timeSec={startSec} alt={item.name} />
+      </div>
+      <div style={{ display: "flex", flexDirection: "column", gap: 2, minWidth: 0 }}>
+        <span
+          style={{
+            fontSize: "var(--fs-xs)",
+            color: "var(--text-primary)",
+            display: "-webkit-box",
+            WebkitLineClamp: 3,
+            WebkitBoxOrient: "vertical",
+            overflow: "hidden",
+          }}
+        >
+          {hit.text}
+        </span>
+        <span
+          className="tabular"
+          style={{
+            fontSize: "var(--fs-micro)",
+            color: "var(--text-tertiary)",
+            overflow: "hidden",
+            textOverflow: "ellipsis",
+            whiteSpace: "nowrap",
+          }}
+        >
+          {item.name} · {startTimecode}
+        </span>
+      </div>
+    </div>
+  );
+}
+
+/** "Files" result card for a filename match: thumbnail + name. Dragging (or double-clicking)
+ *  places the whole asset — the pre-existing behavior; upstream `fileCard`. */
+function FileCard({ item }: { item: MediaItem }) {
+  const setPreviewMedia = useEditorUiStore((s) => s.setPreviewMedia);
+  // Missing media gets no thumbnail and falls back to the placeholder icon.
+  const thumb = item.missing ? null : assetUrl(item.thumbnail);
+  const beginDrag = (e: React.DragEvent) => {
+    e.dataTransfer.setData(MEDIA_DND_TYPE, item.id);
+    e.dataTransfer.effectAllowed = "copy";
+    setDraggingMedia(item);
+    setDraggingMomentRange(null); // whole asset
+  };
+  const finishDrag = () => setDraggingMedia(null);
+
+  return (
+    <div
+      draggable
+      onDragStart={beginDrag}
+      onDragEnd={finishDrag}
+      onClick={() => setPreviewMedia(item.id)}
+      onDoubleClick={() => void addMediaToTimeline(item)}
+      title={item.name}
+      style={{ display: "flex", flexDirection: "column", gap: 3, cursor: "grab" }}
+    >
+      <div
+        style={{
+          aspectRatio: "16 / 9",
+          background: "var(--bg-placeholder)",
+          borderRadius: "var(--radius-sm)",
+          overflow: "hidden",
+          display: "flex",
+          alignItems: "center",
+          justifyContent: "center",
+        }}
+      >
+        {thumb ? (
+          <img src={thumb} alt={item.name} draggable={false} style={{ width: "100%", height: "100%", objectFit: "cover" }} />
+        ) : (
+          <Icon icon={Film} size={18} strokeWidth={1.5} />
+        )}
+      </div>
+      <span
+        style={{
+          fontSize: "var(--fs-xs)",
+          color: "var(--text-secondary)",
+          overflow: "hidden",
+          textOverflow: "ellipsis",
+          whiteSpace: "nowrap",
+        }}
+      >
+        {item.name}
+      </span>
+    </div>
+  );
+}
diff --git a/web/src/components/timeline/TimelineContainer.tsx b/web/src/components/timeline/TimelineContainer.tsx
index 22156c9..6e81ed2 100644
--- a/web/src/components/timeline/TimelineContainer.tsx
+++ b/web/src/components/timeline/TimelineContainer.tsx
@@ -38,6 +38,7 @@ import { ClipContextMenu } from "./ClipContextMenu";
 import { SwapMediaPicker } from "./SwapMediaPicker";
 import { MEDIA_DND_TYPE } from "../media/MediaPanel";
 import { getDraggingMedia, setDraggingMedia } from "../../lib/mediaDragState";
+import { getDraggingMomentRange, setDraggingMomentRange } from "../../lib/momentDragState";
 import { maybeSnapFeedback } from "../../lib/haptic";
 import { useProjectStore } from "../../store/projectStore";
 import { useEditorUiStore } from "../../store/uiStore";
@@ -1353,7 +1354,13 @@ export function TimelineContainer() {
       const item = getDraggingMedia();
       if (!item) return;
       const { docX, docY } = toDoc(e);
-      const durationFrames = edit.mediaDurationFrames(item, timeline.fps);
+      // A search "Moments"/"Spoken" hit drags a trimmed source range: size the
+      // ghost to that range (unless it's a still, which places the whole asset).
+      const momentRange = getDraggingMomentRange();
+      const durationFrames =
+        momentRange && item.type !== "image" && item.duration > 0
+          ? edit.momentDurationFrames(momentRange, timeline.fps)
+          : edit.mediaDurationFrames(item, timeline.fps);
       const rawStart = frameAt(docX, zoomScale);
       // Snap the start OR end edge to a clip edge / playhead (multi-probe, sticky
       // — same engine as a clip move), so the ghost clicks onto neighbours.
@@ -1418,11 +1425,14 @@ export function TimelineContainer() {
       e.stopPropagation();
       const id = e.dataTransfer.getData(MEDIA_DND_TYPE);
       const item = useMediaStore.getState().items.find((m) => m.id === id);
+      // A search-hit drag carries a source-second range → place a trimmed clip.
+      const momentRange = getDraggingMomentRange();
       // Land exactly where the ghost showed: reuse the resolved plan from the
       // last dragover (drop is always preceded by a dragover at the same point).
       const plan = mediaGhostRef.current;
       clearMediaGhost();
       setDraggingMedia(null);
+      setDraggingMomentRange(null);
       // Dropping onto the timeline is an HTML5 `drop` (no pointerdown), so the
       // media-preview→timeline switch in TimelineRegion's onPointerDownCapture
       // never fires. Clear the selected media here so the preview shows the
@@ -1433,7 +1443,17 @@ export function TimelineContainer() {
       if (plan) {
         const preferredTrackIndex = plan.newTrackIndex !== null ? null : plan.trackIndex;
         const insertTrackAt = plan.newTrackIndex !== null ? plan.newTrackIndex : undefined;
-        void edit.addMediaToTimelineAt(item, plan.startFrame, preferredTrackIndex, insertTrackAt);
+        if (momentRange) {
+          void edit.addMomentToTimelineAt(
+            item,
+            plan.startFrame,
+            preferredTrackIndex,
+            momentRange,
+            insertTrackAt,
+          );
+        } else {
+          void edit.addMediaToTimelineAt(item, plan.startFrame, preferredTrackIndex, insertTrackAt);
+        }
         return;
       }
       // Fallback (no prior ghost, e.g. a foreign drag): resolve from the point.
@@ -1442,7 +1462,11 @@ export function TimelineContainer() {
       const target = dropTargetAt(timeline, docY, trackHeights);
       const preferredTrackIndex = target.kind === "existing" ? target.trackIndex : null;
       const insertTrackAt = target.kind === "newTrack" ? target.index : undefined;
-      void edit.addMediaToTimelineAt(item, startFrame, preferredTrackIndex, insertTrackAt);
+      if (momentRange) {
+        void edit.addMomentToTimelineAt(item, startFrame, preferredTrackIndex, momentRange, insertTrackAt);
+      } else {
+        void edit.addMediaToTimelineAt(item, startFrame, preferredTrackIndex, insertTrackAt);
+      }
     },
     [toDoc, zoomScale, timeline, trackHeights, clearMediaGhost],
   );
diff --git a/web/src/i18n/dict.ts b/web/src/i18n/dict.ts
index 7635544..8feb9f8 100644
--- a/web/src/i18n/dict.ts
+++ b/web/src/i18n/dict.ts
@@ -156,6 +156,24 @@ const zh: Dict = {
   "media.offline": "媒体离线",
   "media.relink": "重新链接",
 
+  // 智能搜索（视觉语义 + 口播 + 文件名，对应上游 MediaTab+Search / IndexStatus）
+  "search.group.moments": "画面",
+  "search.group.spoken": "口播",
+  "search.group.files": "文件",
+  "search.smartSearch": "智能搜索",
+  "search.smartSearchHint": "下载一个 {size} 的本地模型，即可按画面内容搜索媒体。",
+  "search.downloading": "下载中 {percent}%",
+  "search.downloadingHint": "正在下载驱动视觉搜索的本地模型。",
+  "search.preparing": "准备中…",
+  "search.indexing": "分析中 {done}/{total}",
+  "search.indexingHint": "正在分析媒体以支持搜索。",
+  "search.index": "建立索引",
+  "search.indexHint": "分析本项目的视频/图片，即可按画面内容搜索。",
+  "search.retry": "重试",
+  "search.retryHint": "视觉搜索模型下载失败。请检查网络后重试。",
+  "search.noMatches": "没有匹配 “{query}” 的结果",
+  "search.dragToTimeline": "拖到时间线以添加此片段",
+
   // 字幕标签（自动转写 + 生成字幕，对应上游 CaptionTab）
   "captions.source": "来源",
   "captions.sourceHelp": "有选中片段时用选中片段，否则用全部可转写音频。选择某条轨道可限定范围。",
@@ -627,6 +645,24 @@ const en: Dict = {
   "media.offline": "Media Offline",
   "media.relink": "Relink",
 
+  // Smart search (visual semantic + spoken + filename, upstream MediaTab+Search / IndexStatus)
+  "search.group.moments": "Moments",
+  "search.group.spoken": "Spoken",
+  "search.group.files": "Files",
+  "search.smartSearch": "Smart search",
+  "search.smartSearchHint": "Downloads a {size} on-device model so you can search media by what's on screen.",
+  "search.downloading": "Downloading {percent}%",
+  "search.downloadingHint": "Downloading the on-device model that powers visual search.",
+  "search.preparing": "Preparing…",
+  "search.indexing": "Indexing {done}/{total}",
+  "search.indexingHint": "Analyzing media so you can search it.",
+  "search.index": "Build index",
+  "search.indexHint": "Analyze this project's video/images so you can search by what's on screen.",
+  "search.retry": "Retry",
+  "search.retryHint": "Visual search model download failed. Check your connection and try again.",
+  "search.noMatches": "No matches for “{query}”",
+  "search.dragToTimeline": "Drag to the timeline to add this segment",
+
   // Captions tab (auto-transcribe + generate captions, upstream CaptionTab)
   "captions.source": "Source",
   "captions.sourceHelp": "Uses selected clips when available, otherwise all captionable audio. Choose a track to limit captions.",
diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts
index 2ab8eb7..f0ae86d 100644
--- a/web/src/lib/api.ts
+++ b/web/src/lib/api.ts
@@ -16,6 +16,9 @@ import type {
   GenerateCaptionsResult,
   MediaList,
   ModelStatus,
+  SearchIndexStatus,
+  SearchModelStatus,
+  SearchResults,
   SecretStatus,
   TimelineSnapshot,
   Transcript,
@@ -425,6 +428,87 @@ export async function generateCaptions(
   throw new Error("caption generation requires the desktop app (whisper)");
 }
 
+// MARK: - Semantic search (SigLIP2 visual model + index + query, search-wiring)
+
+/** Report whether the SigLIP2 visual-search model is installed. Never downloads.
+ *  The media panel uses this to decide whether to show the "Smart search" download
+ *  affordance. With no Tauri backend the answer is always "not installed". */
+export async function searchModelStatus(): Promise<SearchModelStatus> {
+  await ensureTauri();
+  if (!invokeImpl) return { installed: false, model: "", bytes: 0 };
+  return invokeImpl<SearchModelStatus>("search_model_status");
+}
+
+/** Download the SigLIP2 model (idempotent, SHA-256-verified); `search://progress`
+ *  events fire as bytes arrive. Rejects outside Tauri, where no backend exists. */
+export async function downloadSearchModel(): Promise<SearchModelStatus> {
+  await ensureTauri();
+  if (!invokeImpl) throw new Error("search model download requires the desktop app");
+  return invokeImpl<SearchModelStatus>("download_search_model");
+}
+
+/** Listen for search-model download progress; `handler` receives each numeric
+ *  `fraction` (0..=1). Resolves to an unsubscribe function (a no-op outside Tauri). */
+export async function onSearchModelProgress(
+  handler: (fraction: number) => void,
+): Promise<() => void> {
+  await ensureTauri();
+  if (!listenImpl) return () => {};
+  return listenImpl("search://progress", ({ payload }) => {
+    const fraction = (payload as { fraction?: number } | undefined)?.fraction;
+    if (typeof fraction === "number") handler(fraction);
+  });
+}
+
+/** Snapshot of how much of the project's video/image media is indexed. Never
+ *  indexes. With no Tauri backend it reports an empty, model-less state. */
+export async function searchIndexStatus(): Promise<SearchIndexStatus> {
+  await ensureTauri();
+  if (!invokeImpl) return { modelInstalled: false, indexable: 0, indexed: 0 };
+  return invokeImpl<SearchIndexStatus>("search_index_status");
+}
+
+/** Index every video/image asset that is not yet current (sampled frames → SigLIP2
+ *  embeddings), emitting `search://index` progress. Idempotent. Rejects outside
+ *  Tauri or when the model isn't installed. */
+export async function searchIndexStart(): Promise<SearchIndexStatus> {
+  await ensureTauri();
+  if (!invokeImpl) throw new Error("visual indexing requires the desktop app");
+  return invokeImpl<SearchIndexStatus>("search_index_start");
+}
+
+/** Listen for indexing progress events carrying `completed`/`total` asset counts
+ *  and the overall `fraction` (0..=1). No-op outside Tauri. */
+export async function onSearchIndexProgress(
+  handler: (progress: { completed: number; total: number; fraction: number }) => void,
+): Promise<() => void> {
+  await ensureTauri();
+  if (!listenImpl) return () => {};
+  const isNumber = (v: unknown): v is number => typeof v === "number";
+
+  return listenImpl("search://index", ({ payload }) => {
+    // Payloads are untyped: forward only events whose three fields are all numbers.
+    const { completed, total, fraction } = (payload ?? {}) as {
+      completed?: unknown;
+      total?: unknown;
+      fraction?: unknown;
+    };
+    if (isNumber(completed) && isNumber(total) && isNumber(fraction)) {
+      handler({ completed, total, fraction });
+    }
+  });
+}
+
+/** Run the three-group content query: Moments (visual), Spoken (transcript) and
+ *  Files (name). Visual is best-effort (empty without a model); Spoken + Files
+ *  always work, so plain filename filtering is the zero-setup fallback. Outside
+ *  Tauri every group is empty (the panel falls back to its in-memory name filter). */
+export async function searchQuery(query: string): Promise<SearchResults> {
+  await ensureTauri();
+  if (!invokeImpl) return { moments: [], spoken: [], files: [] };
+  return invokeImpl<SearchResults>("search_query", { query });
+}
+
 /**
  * Relink an offline asset to a newly chosen file, KEEPING its id so every clip
  * that references it recovers in place (the fix for "lost media stays red after
diff --git a/web/src/lib/momentDragState.ts b/web/src/lib/momentDragState.ts
new file mode 100644
index 0000000..0110937
--- /dev/null
+++ b/web/src/lib/momentDragState.ts
@@ -0,0 +1,27 @@
+/**
+ * Shared drag state for "search Moments/Spoken hit → timeline" drags. A search
+ * hit drags onto the timeline as a *trimmed* source-range clip (only the shot /
+ * spoken segment lands), mirroring upstream's `assetDragString(forAssetId:
+ * segment:)`. The hit still uses {@link MEDIA_DND_TYPE} so the existing timeline
+ * drop machinery (ghost sizing, track resolution) works unchanged; this module
+ * stashes the source-second range the drop reads to place a trimmed clip instead
+ * of the whole asset. Module-level (not a store) so reads/writes never re-render.
+ *
+ * Cleared whenever the gesture ends (drop or a plain media-card drag starting).
+ */
+
+import type { SourceRange } from "../store/editActions";
+
+let activeRange: SourceRange | null = null;
+
+/** Stash the source-second range of the search hit being dragged; pass `null`
+ *  to clear it. A still image (no range) simply never sets this. */
+export function setDraggingMomentRange(next: SourceRange | null): void {
+  activeRange = next;
+}
+
+/** Read the range stashed for the in-flight search-hit drag. `null` means the
+ *  active drag is a plain full-asset drag, or that no drag is in progress. */
+export function getDraggingMomentRange(): SourceRange | null {
+  return activeRange;
+}
diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts
index 468f726..1b80e3a 100644
--- a/web/src/lib/types.ts
+++ b/web/src/lib/types.ts
@@ -452,6 +452,66 @@ export interface GenerateCaptionsResult {
   captionCount: number;
 }
 
+// MARK: - Semantic search (mirror of src-tauri search.rs DTOs)
+
+/** Install state of the SigLIP2 visual-search model, plus the details needed to
+ *  prompt a one-time download (mirror of Rust `SearchModelStatusDto`). */
+export interface SearchModelStatus {
+  installed: boolean; // whether the model is already installed
+  /** Model identity, e.g. "siglip2-base-patch16-256". */
+  model: string;
+  /** Approximate combined download size in bytes (image + text encoder + tokenizer). */
+  bytes: number;
+}
+
+/** Visual-index coverage for the project's video/image assets (mirror of Rust
+ *  `SearchIndexStatusDto`). Drives the panel's "index now" affordance + progress. */
+export interface SearchIndexStatus {
+  /** Whether the model is installed; nothing can be indexed until it is. */
+  modelInstalled: boolean;
+  /** Count of video/image assets in the project. */
+  indexable: number;
+  /** How many assets already have a current on-disk embedding index. */
+  indexed: number;
+}
+
+/** One visual ("Moments") hit. `frame` is the shot-start in **source frames**
+ *  (thumb + preview anchor); `startSec`/`endSec` are the source-second range used
+ *  to drag a trimmed clip onto the timeline (mirror of Rust `MomentHitDto`). */
+export interface MomentHit {
+  mediaId: string; // asset the hit belongs to (presumably a media-library item id — confirm)
+  frame: number; // shot start, in source frames
+  startSec: number; // range start, in source seconds
+  endSec: number; // range end, in source seconds
+  score: number; // ranking score; scale/direction not visible here — TODO confirm
+  /** True for still images (no time range → drag as a plain asset). */
+  isImage: boolean;
+}
+
+/** One spoken ("Spoken") transcript hit (mirror of Rust `SpokenHitDto`). */
+export interface SpokenHit {
+  mediaId: string; // asset the hit belongs to (presumably a media-library item id — confirm)
+  startSec: number; // segment start, in seconds (presumably source seconds — confirm)
+  endSec: number; // segment end, in seconds (presumably source seconds — confirm)
+  text: string; // text of the matched transcript segment
+  score: number; // ranking score; scale/direction not visible here — TODO confirm
+}
+
+/** One filename ("Files") match (mirror of Rust `FileHitDto`). */
+export interface FileHit {
+  mediaId: string; // asset whose name matched (presumably a media-library item id — confirm)
+  score: number; // ranking score; scale/direction not visible here — TODO confirm
+}
+
+/** The three-group query result: Moments (visual), Spoken (transcript), Files
+ *  (name). Each group is ranked independently and the groups are never blended
+ *  into one list (mirror of Rust `SearchResultsDto`). */
+export interface SearchResults {
+  moments: MomentHit[];
+  spoken: SpokenHit[];
+  files: FileHit[];
+}
+
 // MARK: - Media catalog (mirror of src-tauri MediaItemDto / MediaListDto)
 
 /** One media-library item as returned by `get_media` / `import_*`. `type` is the
diff --git a/web/src/store/editActions.test.ts b/web/src/store/editActions.test.ts
index 2cdbb6e..39f094b 100644
--- a/web/src/store/editActions.test.ts
+++ b/web/src/store/editActions.test.ts
@@ -173,8 +173,10 @@ vi.mock("../lib/api", () => ({
 import {
   addMediaToTimeline,
   addMediaToTimelineAt,
+  addMomentToTimelineAt,
   insertTrack,
   mediaDurationFrames,
+  momentDurationFrames,
   pasteClipsAtPlayhead,
   resolveMediaDropTrack,
   swapTracks,
@@ -417,3 +419,62 @@ describe("mediaDurationFrames", () => {
     expect(mediaDurationFrames(item, 30)).toBe(1);
   });
 });
+
+describe("momentDurationFrames", () => {
+  it("returns the range length in frames", () => {
+    expect(momentDurationFrames({ startSec: 3, endSec: 6 }, 30)).toBe(90); // 3s × 30fps
+  });
+
+  it("never returns less than one frame for a tiny range", () => {
+    expect(momentDurationFrames({ startSec: 3, endSec: 3.001 }, 30)).toBe(1); // 0.03 frames → 1
+  });
+});
+
+describe("addMomentToTimelineAt (trimmed source-range drop from a search hit)", () => {
+  beforeEach(() => {
+    srv.reset();
+    useProjectStore.getState().setMirror(EMPTY, 0);
+    useEditorUiStore.setState({ activeFrame: 0, currentFrame: 0, selectedClipIds: new Set() });
+  });
+
+  /** First video clip's [trimStart, duration, trimEnd]; [-1, -1, -1] if there is none. */
+  function firstVideoTrim(): [number, number, number] {
+    const tl = useProjectStore.getState().timeline;
+    const track = tl.tracks.find((t) => t.type === "video");
+    const c = track?.clips[0];
+    return c ? [c.trimStartFrame, c.durationFrames, c.trimEndFrame] : [-1, -1, -1];
+  }
+
+  it("places only the source range as a trimmed clip", async () => {
+    // 10s @ 30fps = 300 source frames. Range [3s,6s] → trimStart 90, duration 90,
+    // trimEnd 300-90-90 = 120. The clip lands at timeline frame 0.
+    const item: MediaItem = { id: "v", name: "v", type: "video", duration: 10, hasAudio: false };
+    await addMomentToTimelineAt(item, 0, null, { startSec: 3, endSec: 6 });
+    expect(visualClipStarts()).toEqual([0]);
+    expect(firstVideoTrim()).toEqual([90, 90, 120]);
+  });
+
+  it("clamps a range that runs past the source end", async () => {
+    // 5s = 150 frames. Range [4s, 9s] asks for 150 frames, but only 30 frames of
+    // source remain after trimStart 120 → duration clamps to 30, trimEnd 0.
+    const item: MediaItem = { id: "v", name: "v", type: "video", duration: 5, hasAudio: false };
+    await addMomentToTimelineAt(item, 0, null, { startSec: 4, endSec: 9 });
+    expect(firstVideoTrim()).toEqual([120, 30, 0]);
+  });
+
+  it("falls back to the whole asset for a still image (no range)", async () => {
+    // Images have no meaningful sub-range → placed whole (default 5s = 150 frames),
+    // with no trim on either side.
+    const item: MediaItem = { id: "i", name: "i", type: "image", duration: 0, hasAudio: false };
+    await addMomentToTimelineAt(item, 0, null, { startSec: 0, endSec: 0 });
+    expect(firstVideoTrim()).toEqual([0, 150, 0]);
+  });
+
+  it("lands the trimmed clip at the drop start frame", async () => {
+    const item: MediaItem = { id: "v", name: "v", type: "video", duration: 10, hasAudio: false };
+    await addMomentToTimelineAt(item, 45, null, { startSec: 1, endSec: 2 });
+    expect(visualClipStarts()).toEqual([45]);
+    // Range [1s,2s] → trimStart 30, duration 30, trimEnd 300-30-30 = 240.
+    expect(firstVideoTrim()).toEqual([30, 30, 240]);
+  });
+});
diff --git a/web/src/store/editActions.ts b/web/src/store/editActions.ts
index 9593f78..48fffe8 100644
--- a/web/src/store/editActions.ts
+++ b/web/src/store/editActions.ts
@@ -595,6 +595,37 @@ export function addMediaToTimelineAt(
   return enqueueMediaAdd(() => addMediaToTimelineAtInner(item, startFrame, preferredTrackIndex, insertTrackAt));
 }
 
+/** A sub-range of a source asset, in seconds, taken from a search "Moments" or
+ *  "Spoken" hit: only `[startSec, endSec)` of the asset lands on the timeline as a
+ *  trimmed clip, mirroring upstream's `assetDragString(forAssetId:segment:)`. */
+export interface SourceRange {
+  startSec: number; // inclusive start, in source seconds
+  endSec: number; // exclusive end, in source seconds
+}
+
+/** Timeline frames for a source `[startSec,endSec)` range at `fps`; at least one. */
+export function momentDurationFrames(range: SourceRange, fps: number): number {
+  const spanFrames = Math.round((range.endSec - range.startSec) * fps);
+  return Math.max(1, spanFrames);
+}
+
+/** Drop just `range` of `item` onto the timeline at `startFrame` as a trimmed
+ *  clip (the drag from a visual/spoken search hit). The work is queued through
+ *  `enqueueMediaAdd`. Track resolution matches a full-asset drop; the range then
+ *  sets the entry's trim/duration. A still image, a non-positive range, or media
+ *  with no duration falls back to the plain full-asset placement. */
+export function addMomentToTimelineAt(
+  item: MediaItem,
+  startFrame: number,
+  preferredTrackIndex: number | null,
+  range: SourceRange,
+  insertTrackAt?: number,
+): Promise<void> {
+  const placeMoment = () =>
+    addMomentToTimelineAtInner(item, startFrame, preferredTrackIndex, range, insertTrackAt);
+  return enqueueMediaAdd(placeMoment);
+}
+
 async function addMediaToTimelineInner(item: MediaItem): Promise<void> {
   let timeline = useProjectStore.getState().timeline;
   if (firstCompatibleTrackIndex(timeline, item.type) === null) {
@@ -656,6 +687,95 @@ async function addMediaToTimelineAtInner(
   if (isTauri) await forceRefresh();
 }
 
+/** Build the trimmed clip entry for a source `[startSec,endSec)` moment range on
+ *  `item`, resolving the target track like a full-asset drop. Returns null when
+ *  no compatible track exists (the caller then inserts one and retries). */
+function entryForMomentAt(
+  timeline: Timeline,
+  item: MediaItem,
+  startFrame: number,
+  preferredTrackIndex: number | null,
+  range: SourceRange,
+): ClipEntryReq | null {
+  const fps = timeline.fps;
+  const totalSource = mediaDurationFrames(item, fps);
+  // trimStart stops at the LAST source frame (not one past it) so the ≥1-frame clip
+  const trimStartFrame = Math.max(0, Math.min(totalSource - 1, Math.round(range.startSec * fps)));
+  // below always fits: trimStart + duration never exceed the source.
+  const rangeFrames = momentDurationFrames(range, fps);
+  const durationFrames = Math.max(1, Math.min(rangeFrames, totalSource - trimStartFrame));
+  const trimEndFrame = Math.max(0, totalSource - trimStartFrame - durationFrames);
+  const trackIndex = firstOpenCompatibleTrackIndex(
+    timeline,
+    item.type,
+    startFrame,
+    durationFrames,
+    preferredTrackIndex,
+  );
+  if (trackIndex === null) return null;
+  return {
+    mediaRef: item.id,
+    mediaType: item.type,
+    sourceClipType: item.type,
+    trackIndex,
+    startFrame: Math.max(0, startFrame),
+    durationFrames,
+    trimStartFrame,
+    trimEndFrame,
+    hasAudio: item.hasAudio,
+    addLinkedAudio: item.type === "video" && item.hasAudio,
+    transform: fitTransformForMedia(item.width, item.height, timeline.width, timeline.height),
+  };
+}
+
+async function addMomentToTimelineAtInner(
+  item: MediaItem,
+  startFrame: number,
+  preferredTrackIndex: number | null,
+  range: SourceRange,
+  insertTrackAt?: number,
+): Promise<void> {
+  // Still image, empty/inverted range, or zero-length media: there is no usable
+  // sub-range, so place the whole asset instead.
+  if (item.type === "image" || range.endSec - range.startSec <= 0 || item.duration <= 0) {
+    return addMediaToTimelineAtInner(item, startFrame, preferredTrackIndex, insertTrackAt);
+  }
+
+  const dropFrame = Math.max(0, startFrame);
+  let timeline = useProjectStore.getState().timeline;
+  // Insert a track, re-sync the mirror, and return the new track's index in the
+  // refreshed timeline (-1 when it cannot be identified).
+  const insertAndLocate = async (at?: number): Promise<number> => {
+    const inserted = await insertTrack(item.type === "audio" ? "audio" : "video", at);
+    await forceRefresh();
+    timeline = useProjectStore.getState().timeline;
+    const newTrackId = inserted?.affectedClipIds[0];
+    return newTrackId ? timeline.tracks.findIndex((t) => t.id === newTrackId) : -1;
+  };
+
+  if (insertTrackAt !== undefined) {
+    const located = await insertAndLocate(insertTrackAt);
+    if (located >= 0) preferredTrackIndex = located;
+  }
+  let entry = entryForMomentAt(timeline, item, dropFrame, preferredTrackIndex, range);
+  if (!entry) {
+    // No compatible open track: add one (at the preferred slot, if any) and retry.
+    const fallbackInsertAt = preferredTrackIndex ?? undefined;
+    const located = await insertAndLocate(fallbackInsertAt);
+    if (located >= 0) {
+      preferredTrackIndex = located;
+    } else if (fallbackInsertAt !== undefined) {
+      preferredTrackIndex = Math.max(0, Math.min(fallbackInsertAt, timeline.tracks.length - 1));
+    }
+    entry = entryForMomentAt(timeline, item, dropFrame, preferredTrackIndex, range);
+  }
+  if (!entry) return;
+  const added = await addClips([entry]);
+  if (added && added.affectedClipIds.length > 0) {
+    useEditorUiStore.getState().selectClips(new Set(added.affectedClipIds));
+  }
+  if (isTauri) await forceRefresh();
+}
+
 // MARK: - Text tool (Toolbar "T" button, SPEC §4)
 
 /** Default text clip duration: 3 seconds at the timeline's fps. */