diff --git a/crates/opentake-agent/src/mcp/dispatch.rs b/crates/opentake-agent/src/mcp/dispatch.rs index de38968..d3aef22 100644 --- a/crates/opentake-agent/src/mcp/dispatch.rs +++ b/crates/opentake-agent/src/mcp/dispatch.rs @@ -37,7 +37,7 @@ use serde_json::Value; use crate::mcp::core_handle::CoreHandle; use crate::mcp::gen_catalog; use crate::mcp::media_bridge::{ - frame_to_block, ImportSource, InspectResult, MediaBridge, TranscriptSource, + frame_to_block, ImportSource, InspectResult, MediaBridge, SearchCandidate, TranscriptSource, }; use crate::plugin::registry::PluginRegistry; use crate::signal::engine; @@ -208,19 +208,19 @@ impl Dispatcher { ToolName::SmartReframe => self.smart_reframe(args), ToolName::TightenSilences => self.tighten_silences(args, before), - // --- Render + import + transcript (wired to the injected MediaBridge) --- + // --- Render + import + transcript + search (wired to the injected MediaBridge) --- ToolName::InspectTimeline => self.inspect_timeline(args, before), ToolName::ImportMedia => self.import_media(args, manifest), ToolName::GetTranscript => self.get_transcript(args, before, manifest), ToolName::AddCaptions => self.add_captions(args, before, manifest), + ToolName::SearchMedia => self.search_media(args, manifest), // --- Not yet implementable in this phase (honest stubs) --- - // Media reads (inspect/search) still need the analysis backend; - // generation/upscale need the async GenClient + BYOK auth. - // Motion graphics (#34) now routes through the planned Motion Canvas - // plugin: render mp4 -> import media -> place clip. + // inspect_media still needs the analysis backend; generation/upscale + // need the async GenClient + BYOK auth. Motion graphics (#34) now + // routes through the planned Motion Canvas plugin: render mp4 -> + // import media -> place clip. ToolName::InspectMedia - | ToolName::SearchMedia | ToolName::GenerateVideo | ToolName::GenerateImage | ToolName::GenerateAudio @@ -405,6 +405,134 @@ impl Dispatcher { Ok(ToolResult::ok(outcome.message)) } + /// `search_media`: content search over the library — visual (SigLIP2 + /// semantic) + spoken (transcript keyword), ranked independently and never + /// blended. 1:1 port of `ToolExecutor+Search.searchMedia` + /// (`ToolExecutor+Search.swift:6-32`): validate `query`/`scope`/`limit` + + /// optional `mediaRef` restrict here, resolve the candidate set from the + /// manifest, run both searches behind the [`MediaBridge`], and shape the + /// upstream JSON envelope (`status`/`indexableAssets`/`indexedAssets`/ + /// `moments`/`spoken`). Scores are uncalibrated (ordering only). When the + /// visual index isn't ready, `moments` may be empty — the model is told via + /// `status` and the `indexableAssets`/`indexedAssets` counts, and Spoken + + /// Files-style name lookups still work. + fn search_media( + &self, + args: &Value, + manifest: &MediaManifest, + ) -> Result { + use serde_json::json; + let a: SearchMediaArgs = decode_tool_args(args, "")?; + let query = a.query.trim().to_string(); + if query.is_empty() { + return Ok(ToolResult::error("search_media: query is empty")); + } + // scope ∈ {visual, spoken, both}, default both (upstream). + let scope = a.scope.as_deref().unwrap_or("both"); + if !matches!(scope, "visual" | "spoken" | "both") { + return Ok(ToolResult::error(format!( + "search_media: scope must be visual, spoken, or both (got '{scope}')" + ))); + } + // limit default 10, clamped to 1..=50 (upstream `min(max(limit,1),50)`). + let limit = a.limit.unwrap_or(10).clamp(1, 50) as usize; + + // Optional `mediaRef` restricts the search to one existing asset. + let restrict: Option = match a.media_ref.as_deref() { + Some(ref_id) => { + let entry = manifest.entries.iter().find(|e| e.id == ref_id); + match entry { + Some(e) => Some(e.id.clone()), + None => { + return Ok(ToolResult::error(format!( + "search_media: media not found: {ref_id}" + ))); + } + } + } + None => None, + }; + + // Build the candidate set from the manifest (kind → visual/spoken). + use opentake_domain::ClipType; + let candidates: Vec = manifest + .entries + .iter() + .filter(|e| restrict.as_deref().is_none_or(|r| r == e.id)) + .map(|e| SearchCandidate { + media_ref: e.id.clone(), + is_visual: matches!(e.kind, ClipType::Video | ClipType::Image), + is_spoken: matches!(e.kind, ClipType::Video | ClipType::Audio), + }) + .collect(); + + let Some(bridge) = self.bridge.as_ref() else { + return Ok(ToolResult::error( + "search_media: search is not available in this build", + )); + }; + let result = bridge + .search_media(&candidates, &query, scope, limit) + .map_err(|e| ToolError::new(e.message))?; + + // Shape the upstream JSON. `name` per hit is looked up from the manifest. + let name_of = |media_ref: &str| -> String { + manifest + .entries + .iter() + .find(|e| e.id == media_ref) + .map(|e| e.name.clone()) + .unwrap_or_default() + }; + + let mut payload = serde_json::Map::new(); + if scope != "spoken" { + // Visual group: status + counts always present; moments when ready. + payload.insert("status".into(), json!(result.status.as_str())); + payload.insert("indexableAssets".into(), json!(result.indexable_assets)); + if let Some(indexed) = result.indexed_assets { + payload.insert("indexedAssets".into(), json!(indexed)); + } + let moments: Vec = result + .moments + .iter() + .map(|h| { + let mut m = serde_json::Map::new(); + m.insert("mediaRef".into(), json!(h.media_ref)); + m.insert("name".into(), json!(name_of(&h.media_ref))); + m.insert("score".into(), json!(h.score as f64)); + if h.is_image { + m.insert("type".into(), json!("image")); + } else { + m.insert("startSeconds".into(), json!(h.start_seconds)); + m.insert("endSeconds".into(), json!(h.end_seconds)); + } + Value::Object(m) + }) + .collect(); + payload.insert("moments".into(), json!(moments)); + } + if scope != "visual" { + let spoken: Vec = result + .spoken + .iter() + .map(|h| { + json!({ + "mediaRef": h.media_ref, + "name": name_of(&h.media_ref), + "startSeconds": h.start_seconds, + "endSeconds": h.end_seconds, + "text": h.text, + }) + }) + .collect(); + payload.insert("spoken".into(), json!(spoken)); + } + + let out = round_floats_3dp(Value::Object(payload)); + Ok(ToolResult::ok(out.to_string())) + } + /// `get_transcript`: the live timeline transcript in project frames. Walks /// every caption-eligible audio/video clip, transcribes each unique source /// once (cached, via the [`MediaBridge`]), maps each word through the clip's @@ -2343,6 +2471,9 @@ mod tests { use std::sync::Arc; use crate::mcp::core_handle::CoreHandle; + use crate::mcp::media_bridge::{ + SearchIndexState, SearchMediaResult, SearchSpokenHit, SearchVisualHit, + }; /// A faithful [`CoreHandle`] over a real in-memory [`AppCore`], seeded with a /// video track and one media asset so `add_clips` can run end to end. @@ -3358,8 +3489,15 @@ mod tests { /// Records the media_refs passed to the last `transcribe_sources` call, /// so tests can assert dedup. transcribe_calls: Mutex>>, + /// Canned `search_media` result; when `None` the trait default (disabled) + /// runs. Records the `(query, scope, limit, candidate ids)` of each call. + search_result: Mutex>, + search_calls: Mutex>, } + /// One recorded `search_media` call: `(query, scope, limit, candidate ids)`. + type SearchCall = (String, String, usize, Vec); + impl FakeBridge { fn with_transcript(self, media_ref: &str, t: TranscriptionResult) -> Self { self.transcripts @@ -3447,6 +3585,31 @@ mod tests { message: format!("Imported via {tag}."), }) } + + fn search_media( + &self, + candidates: &[SearchCandidate], + query: &str, + scope: &str, + limit: usize, + ) -> Result { + self.search_calls.lock().unwrap().push(( + query.to_string(), + scope.to_string(), + limit, + candidates.iter().map(|c| c.media_ref.clone()).collect(), + )); + if let Some(result) = self.search_result.lock().unwrap().clone() { + return Ok(result); + } + Ok(SearchMediaResult { + status: SearchIndexState::Disabled, + indexable_assets: 0, + indexed_assets: None, + moments: Vec::new(), + spoken: Vec::new(), + }) + } } /// A dispatcher whose timeline has a single 60-frame clip and a `FakeBridge` @@ -3719,6 +3882,223 @@ mod tests { ); } + // MARK: - search_media (visual + spoken content search via the MediaBridge) + + fn image_entry(id: &str, name: &str) -> MediaManifestEntry { + let mut e = entry(id, name); + e.kind = ClipType::Image; + e + } + + /// A dispatcher over a manifest with a video (`v`), audio (`a`), and image + /// (`i`) asset, plus a `FakeBridge` seeded with `result`. Returns both so + /// tests can assert the recorded call + JSON shape. + fn search_dispatcher(result: SearchMediaResult) -> (Dispatcher, Arc) { + let tl = Timeline::new(); + let mut m = MediaManifest::new(); + m.entries.push(entry("v", "Harbor Sunset")); + m.entries.push(audio_entry("a", "Interview")); + m.entries.push(image_entry("i", "Poster")); + let handle = Arc::new(StateHandle::new(tl, m)); + let bridge = Arc::new(FakeBridge::default()); + *bridge.search_result.lock().unwrap() = Some(result); + let d = Dispatcher::with_bridge( + handle, + Arc::new(RwLock::new(PluginRegistry::new())), + Some(bridge.clone() as Arc), + ); + (d, bridge) + } + + fn sample_search_result() -> SearchMediaResult { + SearchMediaResult { + status: SearchIndexState::Ready, + indexable_assets: 2, + indexed_assets: Some(2), + moments: vec![ + SearchVisualHit { + media_ref: "v".into(), + start_seconds: 3.0, + end_seconds: 6.0, + score: 0.82, + is_image: false, + }, + SearchVisualHit { + media_ref: "i".into(), + start_seconds: 0.0, + end_seconds: 0.0, + score: 0.5, + is_image: true, + }, + ], + spoken: vec![SearchSpokenHit { + media_ref: "a".into(), + start_seconds: 12.0, + end_seconds: 14.0, + text: "the budget plan".into(), + }], + } + } + + #[test] + fn search_media_shapes_upstream_json_with_both_groups() { + let (d, bridge) = search_dispatcher(sample_search_result()); + let r = d.dispatch( + "search_media", + serde_json::json!({ "query": "sunset harbor" }), + ); + assert!(!r.is_error, "{}", r.text_joined()); + let v: serde_json::Value = serde_json::from_str(&first_text(&r)).unwrap(); + + // Visual group: status + counts + moments. + assert_eq!(v["status"], "ready"); + assert_eq!(v["indexableAssets"], 2); + assert_eq!(v["indexedAssets"], 2); + let moments = v["moments"].as_array().unwrap(); + assert_eq!(moments.len(), 2); + // Video hit carries a source-second range + name; no `type`. + assert_eq!(moments[0]["mediaRef"], "v"); + assert_eq!(moments[0]["name"], "Harbor Sunset"); + assert_eq!(moments[0]["startSeconds"], 3.0); + assert_eq!(moments[0]["endSeconds"], 6.0); + assert!(moments[0].get("type").is_none()); + // Image hit is `type: image`, no range. + assert_eq!(moments[1]["mediaRef"], "i"); + assert_eq!(moments[1]["type"], "image"); + assert!(moments[1].get("startSeconds").is_none()); + + // Spoken group: mediaRef/name/range/text. + let spoken = v["spoken"].as_array().unwrap(); + assert_eq!(spoken.len(), 1); + assert_eq!(spoken[0]["mediaRef"], "a"); + assert_eq!(spoken[0]["name"], "Interview"); + assert_eq!(spoken[0]["text"], "the budget plan"); + + // Default scope=both, limit=10 forwarded; all three ids are candidates. + let calls = bridge.search_calls.lock().unwrap(); + assert_eq!(calls.len(), 1); + assert_eq!(calls[0].1, "both"); + assert_eq!(calls[0].2, 10); + assert_eq!(calls[0].3.len(), 3); + } + + #[test] + fn search_media_scope_visual_omits_spoken() { + let (d, _bridge) = search_dispatcher(sample_search_result()); + let r = d.dispatch( + "search_media", + serde_json::json!({ "query": "harbor", "scope": "visual" }), + ); + assert!(!r.is_error, "{}", r.text_joined()); + let v: serde_json::Value = serde_json::from_str(&first_text(&r)).unwrap(); + assert!(v.get("moments").is_some()); + assert!(v.get("spoken").is_none()); // upstream: visual scope omits spoken + assert!(v.get("status").is_some()); + } + + #[test] + fn search_media_scope_spoken_omits_visual_status() { + let (d, _bridge) = search_dispatcher(sample_search_result()); + let r = d.dispatch( + "search_media", + serde_json::json!({ "query": "budget", "scope": "spoken" }), + ); + assert!(!r.is_error, "{}", r.text_joined()); + let v: serde_json::Value = serde_json::from_str(&first_text(&r)).unwrap(); + assert!(v.get("spoken").is_some()); + // Spoken-only skips the visual group entirely (no status/moments). + assert!(v.get("status").is_none()); + assert!(v.get("moments").is_none()); + } + + #[test] + fn search_media_limit_is_clamped_1_to_50() { + let (d, bridge) = search_dispatcher(sample_search_result()); + // Over-max clamps to 50. + let _ = d.dispatch( + "search_media", + serde_json::json!({ "query": "x", "limit": 999 }), + ); + // Under-min clamps to 1. + let _ = d.dispatch( + "search_media", + serde_json::json!({ "query": "x", "limit": 0 }), + ); + let calls = bridge.search_calls.lock().unwrap(); + assert_eq!(calls[0].2, 50); + assert_eq!(calls[1].2, 1); + } + + #[test] + fn search_media_media_ref_restricts_candidates() { + let (d, bridge) = search_dispatcher(sample_search_result()); + let r = d.dispatch( + "search_media", + serde_json::json!({ "query": "x", "mediaRef": "v" }), + ); + assert!(!r.is_error, "{}", r.text_joined()); + let calls = bridge.search_calls.lock().unwrap(); + // Only the one restricted asset is a candidate. + assert_eq!(calls[0].3, vec!["v".to_string()]); + } + + #[test] + fn search_media_unknown_media_ref_errors() { + let (d, _bridge) = search_dispatcher(sample_search_result()); + let r = d.dispatch( + "search_media", + serde_json::json!({ "query": "x", "mediaRef": "nope" }), + ); + assert!(r.is_error); + assert!( + r.text_joined().contains("media not found"), + "{}", + r.text_joined() + ); + } + + #[test] + fn search_media_empty_query_errors() { + let (d, _bridge) = search_dispatcher(sample_search_result()); + let r = d.dispatch("search_media", serde_json::json!({ "query": " " })); + assert!(r.is_error); + assert!( + r.text_joined().contains("query is empty"), + "{}", + r.text_joined() + ); + } + + #[test] + fn search_media_invalid_scope_errors() { + let (d, _bridge) = search_dispatcher(sample_search_result()); + let r = d.dispatch( + "search_media", + serde_json::json!({ "query": "x", "scope": "sideways" }), + ); + assert!(r.is_error); + assert!( + r.text_joined().contains("scope must be"), + "{}", + r.text_joined() + ); + } + + #[test] + fn search_media_without_bridge_reports_unavailable() { + let mut m = MediaManifest::new(); + m.entries.push(entry("v", "Clip")); + let handle = Arc::new(StateHandle::new(Timeline::new(), m)); + let d = Dispatcher::new(handle, Arc::new(RwLock::new(PluginRegistry::new()))); + let r = d.dispatch("search_media", serde_json::json!({ "query": "x" })); + assert!(r.is_error); + assert!( + r.text_joined().contains("not available in this build"), + "{}", + r.text_joined() + ); + } + // MARK: - get_transcript (timeline transcript via the MediaBridge) fn word(text: &str, start: f64, end: f64) -> TranscriptionWord { diff --git a/crates/opentake-agent/src/mcp/media_bridge.rs b/crates/opentake-agent/src/mcp/media_bridge.rs index 3cc6316..8100877 100644 --- a/crates/opentake-agent/src/mcp/media_bridge.rs +++ b/crates/opentake-agent/src/mcp/media_bridge.rs @@ -144,6 +144,101 @@ pub struct TranscriptSourceResult { pub error: Option, } +/// One visual ("Moments") hit for `search_media` — a source-second range in one +/// asset, or a still image (no range). Source-second timings, ready to convert to +/// `trimStartFrame`/`trimEndFrame` (upstream `visualResults`' `moments` entries). +#[derive(Debug, Clone)] +pub struct SearchVisualHit { + /// Asset id (`mediaRef`). + pub media_ref: String, + /// Shot-start in source seconds (omitted for stills). + pub start_seconds: f64, + /// Shot-end in source seconds (omitted for stills). + pub end_seconds: f64, + /// Uncalibrated similarity score (ordering only). + pub score: f32, + /// True for still images: no time range → upstream sets `type: "image"`. + pub is_image: bool, +} + +/// One spoken ("Spoken") hit for `search_media`: a transcript segment matching +/// every query term (upstream `spokenResults` entries). +#[derive(Debug, Clone)] +pub struct SearchSpokenHit { + pub media_ref: String, + pub start_seconds: f64, + pub end_seconds: f64, + pub text: String, +} + +/// The visual index's state for the `search_media` `status` field, mirroring +/// upstream's `visualStatus` string enum (`ToolExecutor+Search.swift:91-100`). +/// The dispatcher serializes the exact upstream spelling. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SearchIndexState { + /// Model installed, everything (currently) indexed. + Ready, + /// Model installed, indexing still in progress. + Indexing, + /// Model not yet downloaded. + ModelNotInstalled, + /// Model download in flight. + DownloadingModel, + /// Model loading/preparing. + Preparing, + /// Visual search disabled (no backend / build without it). + Disabled, + /// Model load or download failed. + Failed, +} + +impl SearchIndexState { + /// The upstream string spelling for the `status` field. + pub fn as_str(self) -> &'static str { + match self { + SearchIndexState::Ready => "ready", + SearchIndexState::Indexing => "indexing", + SearchIndexState::ModelNotInstalled => "modelNotInstalled", + SearchIndexState::DownloadingModel => "downloadingModel", + SearchIndexState::Preparing => "preparing", + SearchIndexState::Disabled => "disabled", + SearchIndexState::Failed => "failed", + } + } +} + +/// One asset to search for [`MediaBridge::search_media`]: the dispatcher resolves +/// the candidate set (optionally restricted to one `mediaRef`) and hands these +/// down, since only the bridge can resolve ids to files + read the caches. +#[derive(Debug, Clone)] +pub struct SearchCandidate { + /// Asset id (`mediaRef`). + pub media_ref: String, + /// True for video/image (visual-searchable). + pub is_visual: bool, + /// True for video/audio (spoken-searchable). + pub is_spoken: bool, +} + +/// The full `search_media` result the bridge returns; the dispatcher shapes it +/// into the upstream JSON envelope (`status`/`indexableAssets`/`indexedAssets`/ +/// `moments`/`spoken`). Groups rank independently and are never blended. +#[derive(Debug, Clone)] +pub struct SearchMediaResult { + /// The visual index state for the `status` field. + pub status: SearchIndexState, + /// Count of visual assets in scope (upstream `indexableAssets`). + pub indexable_assets: usize, + /// How many of those already have a current on-disk index + /// (upstream `indexedAssets`); `None` when the model isn't loaded so the + /// count can't be computed (upstream omits the key then). + pub indexed_assets: Option, + /// Visual hits (empty when `scope == "spoken"` or the index isn't ready). + pub moments: Vec, + /// Spoken hits (empty when `scope == "visual"`; work regardless of status). + pub spoken: Vec, +} + /// The injected capability boundary for the render + import tools. `Send + Sync` /// so the [`Dispatcher`](super::dispatch::Dispatcher) can hold `Arc` across threads (matching [`CoreHandle`](super::core_handle)). @@ -191,6 +286,29 @@ pub trait MediaBridge: Send + Sync { "import_media: importing is not available in this build", )) } + + /// Search the media library by content: visual (SigLIP2 semantic) and spoken + /// (transcript keyword). `candidates` is the resolved, in-scope asset set + /// (already filtered to one `mediaRef` when the caller restricted it); + /// `scope` is `"visual"`/`"spoken"`/`"both"` and `limit` the per-group cap. + /// The two groups rank independently and are never blended (upstream). The + /// default reports `disabled` with no hits so a bridge-less build still + /// returns an honest, well-formed result the model can read. + fn search_media( + &self, + _candidates: &[SearchCandidate], + _query: &str, + _scope: &str, + _limit: usize, + ) -> Result { + Ok(SearchMediaResult { + status: SearchIndexState::Disabled, + indexable_assets: 0, + indexed_assets: None, + moments: Vec::new(), + spoken: Vec::new(), + }) + } } /// Turn one [`InspectedFrame`] into an MCP image [`Block`], base64-encoding the diff --git a/crates/opentake-media/src/search/config.rs b/crates/opentake-media/src/search/config.rs index fe9c1d3..2289320 100644 --- a/crates/opentake-media/src/search/config.rs +++ b/crates/opentake-media/src/search/config.rs @@ -20,6 +20,16 @@ pub const EMBEDDING_DIM: usize = 768; pub const IMAGE_SIZE: u32 = 256; pub const CONTEXT_LENGTH: usize = 64; +/// Base URL the ONNX model files are fetched from (`{base}/{file}`), mirroring +/// `WhisperModel.base_url`. Placeholder until the ONNX build is hosted (SPEC +/// T8.0): the download command constructs `{base}/image_encoder.onnx` etc., and +/// SHA-256-verifies each against [`manifest`]'s (currently placeholder) hashes, +/// so a real download only succeeds once both this URL and the manifest +/// hashes/bytes are filled in. The Hugging Face `resolve/main` raw-file endpoint +/// is the intended host (same shape as the whisper model URL). +pub const MODEL_DOWNLOAD_BASE_URL: &str = + "https://huggingface.co/opentake/siglip2-base-patch16-256-onnx/resolve/main"; + /// The [`EmbedderSpec`] for the configured SigLIP2 model. `normalized` defaults /// to `false` to match upstream's assumption that the exported model L2- /// normalizes internally (SPEC §0.8); flip it only if calibration proves the diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index a09434e..c928adb 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -30,11 +30,18 @@ opentake-core = { workspace = true } opentake-project = { workspace = true } opentake-ops = { workspace = true } opentake-domain = { workspace = true } -# Transcription is ON for the shipped app: `whisper-backend` compiles the -# whisper.cpp CPU backend (via cmake — preinstalled on GitHub runners, no CUDA), -# `model-download` pulls the ggml model over HTTPS with SHA-1 verification. Both -# stay optional at the opentake-media level (its own tests run without them). -opentake-media = { workspace = true, features = ["whisper-backend", "model-download"] } +# Transcription + visual semantic search are ON for the shipped app: +# `whisper-backend` compiles the whisper.cpp CPU backend (via cmake — +# preinstalled on GitHub runners, no CUDA); `ort-backend` links ONNX Runtime for +# the SigLIP2 dual-encoder — its `download-binaries` feature fetches a +# *statically* linked prebuilt onnxruntime at build time and bakes it into the +# binary (no runtime .so/.dylib to ship; same build-time-network shape as +# whisper.cpp / model-download); `model-download` pulls the model files over +# HTTPS with checksum verification. All three stay optional at the +# opentake-media level (its own default tests run offline, without them); +# ort-inference tests are feature-gated + runtime-skipped like the GPU/ffmpeg +# integration tests. +opentake-media = { workspace = true, features = ["whisper-backend", "ort-backend", "model-download"] } opentake-render = { workspace = true } opentake-gen = { workspace = true } opentake-agent = { workspace = true } diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 263aa40..323e7cc 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -17,6 +17,7 @@ mod library; mod mcp; mod media; mod render; +mod search; mod secret; mod transcribe; @@ -192,6 +193,11 @@ pub fn run() { transcribe::transcribe_media, transcribe::transcript_get, captions::generate_captions, + search::search_model_status, + search::download_search_model, + search::search_index_status, + search::search_index_start, + search::search_query, library::library_list, library::library_favorite, library::library_unfavorite, diff --git a/src-tauri/src/mcp.rs b/src-tauri/src/mcp.rs index 9e33545..e3add78 100644 --- a/src-tauri/src/mcp.rs +++ b/src-tauri/src/mcp.rs @@ -27,6 +27,7 @@ use base64::Engine as _; use opentake_agent::mcp::core_handle::{AppCoreHandle, CoreHandle}; use opentake_agent::mcp::media_bridge::{ BridgeError, ImportOutcome, ImportSource, InspectResult, InspectedFrame, MediaBridge, + SearchCandidate, SearchIndexState, SearchMediaResult, SearchSpokenHit, SearchVisualHit, TranscriptSource, TranscriptSourceResult, }; use opentake_agent::mcp::server; @@ -247,6 +248,97 @@ impl MediaBridge for TauriMediaBridge { )), } } + + fn search_media( + &self, + candidates: &[SearchCandidate], + query: &str, + scope: &str, + limit: usize, + ) -> Result { + // Resolve every candidate id to its source path from the live manifest. + // Missing (offline) files are kept — their index/transcript reads simply + // yield nothing, matching upstream (a missing file has no results, not an + // error). Unresolvable ids are dropped. + let manifest = self.core.media(); + let project_dir = self.core.project_dir(); + let resolver = opentake_domain::MediaResolver::new(&manifest, project_dir.as_deref()); + let mut visual_paths: Vec<(String, PathBuf)> = Vec::new(); + let mut spoken_paths: Vec<(String, PathBuf)> = Vec::new(); + for c in candidates { + let Some(path) = resolver.expected_path(&c.media_ref) else { + continue; + }; + if c.is_visual { + visual_paths.push((c.media_ref.clone(), path.clone())); + } + if c.is_spoken { + spoken_paths.push((c.media_ref.clone(), path)); + } + } + + let fps = self.core.get_timeline().timeline.fps; + let installed = crate::search::model_installed(&self.engine); + + // Visual group (skipped for scope == "spoken"). + let (status, indexable_assets, indexed_assets, moments) = if scope == "spoken" { + (SearchIndexState::Disabled, 0, None, Vec::new()) + } else { + let (indexable, indexed) = crate::search::visual_coverage(&self.engine, &visual_paths); + // Status mirrors upstream `visualStatus`: without the model it's + // `modelNotInstalled`; with it, `indexing` while any indexable asset + // is still un-indexed, else `ready`. (Download/preparing/failed are + // transient front-end states the panel owns; the tool reports the + // stable installed/ready/indexing view.) + let status = if !installed { + SearchIndexState::ModelNotInstalled + } else if indexable > 0 && indexed < indexable { + SearchIndexState::Indexing + } else { + SearchIndexState::Ready + }; + let moments: Vec = + crate::search::visual_hits_by_id(&self.engine, &visual_paths, query, fps, limit) + .into_iter() + .map(|h| SearchVisualHit { + media_ref: h.media_id, + start_seconds: h.start_sec, + end_seconds: h.end_sec, + score: h.score, + is_image: h.is_image, + }) + .collect(); + // `indexedAssets` is only meaningful when the model is loaded + // (upstream sets it only when an embedder spec exists). + let indexed_opt = if installed { Some(indexed) } else { None }; + (status, indexable, indexed_opt, moments) + }; + + // Spoken group (skipped for scope == "visual"). Works regardless of the + // visual index — keyword search over cached transcripts. + let spoken: Vec = if scope == "visual" { + Vec::new() + } else { + self.engine + .search_spoken(query, &spoken_paths, limit) + .into_iter() + .map(|h| SearchSpokenHit { + media_ref: h.asset_id, + start_seconds: h.start, + end_seconds: h.end, + text: h.text, + }) + .collect() + }; + + Ok(SearchMediaResult { + status, + indexable_assets, + indexed_assets, + moments, + spoken, + }) + } } impl TauriMediaBridge { diff --git a/src-tauri/src/search.rs b/src-tauri/src/search.rs new file mode 100644 index 0000000..8e2506c --- /dev/null +++ b/src-tauri/src/search.rs @@ -0,0 +1,839 @@ +//! Visual + spoken semantic search command surface. +//! +//! Wires the built-but-previously-unreachable SigLIP2 visual-search engine +//! (`opentake_media::search`) to the app, alongside the already-wired spoken +//! (transcript keyword) search. Upstream is `Search/SearchIndexCoordinator.swift` +//! (per-project indexing queue + query) and `MediaTab+Search.swift` (the three +//! result groups: Moments / Spoken / Files). OpenTake substitutes ONNX Runtime +//! for CoreML, so the SigLIP2 model is two explicit `.onnx` files the user +//! downloads once (mirroring the whisper flow in `transcribe.rs`). +//! +//! Commands (all camelCase DTOs, `web/src/lib/types.ts` contract — the repo's #1 +//! bug class — with serde round-trip tests): +//! - [`search_model_status`] — is the SigLIP2 model installed? (+ label / size). +//! - [`download_search_model`] — async download with `search://progress` events, +//! SHA-256 verified exactly as `search::model_download::install` provides. +//! - [`search_index_status`] — how much of the project's visual media is indexed. +//! - [`search_index_start`] — index every not-yet-indexed video/image asset +//! (sampled frames → SigLIP2 embeddings → `PALMEMB1` store), emitting +//! `search://index` progress events. Idempotent (already-current assets skip). +//! - [`search_query`] — run the three-group query: Moments (visual), Spoken +//! (transcript), Files (name match). Matches upstream's groups, caps, and order. +//! +//! The visual index / query path needs the ONNX Runtime backend (feature +//! `ort-backend`, ON for the shipped app). When the model isn't installed, the +//! visual groups degrade to empty and `search_query` still returns Spoken + Files +//! — so plain filename filtering keeps working with zero setup, exactly like the +//! upstream Files group. + +use std::path::PathBuf; + +use serde::{Deserialize, Serialize}; +use tauri::{AppHandle, Emitter, State}; + +use opentake_core::AppCore; +use opentake_domain::{ClipType, MediaResolver}; +use opentake_media::search::config as search_config; +use opentake_media::search::config::{RELATIVE_CUTOFF, SEARCH_LIMIT, VISUAL_MATCH_COSINE_FLOOR}; +use opentake_media::MediaEngine; + +use crate::media::MediaState; + +/// One visual ("Moments") hit projected to the front end. `frame` is the shot's +/// start in **source frames** (upstream drags `shotStart…shotEnd`; the panel +/// thumbnails at `shotStart`). `startSec`/`endSec` carry the full source-second +/// range so the UI can drag it onto the timeline as a trimmed clip. camelCase. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct MomentHitDto { + /// Asset id (the clip layer's `media_ref`). + pub media_id: String, + /// Shot-start frame in source-media frames (thumb + preview anchor). + pub frame: i64, + /// Shot-start in source seconds (drag range lower bound). + pub start_sec: f64, + /// Shot-end in source seconds (drag range upper bound). Equals `start_sec` + /// for stills (zero-length shot). + pub end_sec: f64, + /// Uncalibrated similarity score (ordering only — upstream note). + pub score: f32, + /// True for still images (no time range → drag as a plain asset). + pub is_image: bool, +} + +/// One spoken ("Spoken") hit: an asset's transcript segment matching every query +/// term. Keyword hits are unranked upstream; `score` is a fixed `1.0` so the DTO +/// shape is uniform (ordering within the group follows transcript order). +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct SpokenHitDto { + pub media_id: String, + pub start_sec: f64, + pub end_sec: f64, + pub text: String, + pub score: f32, +} + +/// One filename ("Files") match. `score` is a fixed `1.0` (name matches are +/// unranked; upstream sorts by the panel's sort mode, default insertion order). +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct FileHitDto { + pub media_id: String, + pub score: f32, +} + +/// The full three-group query result, mirroring upstream's Moments / Spoken / +/// Files sections (`MediaTab+Search.swift:12-33`). +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Default)] +#[serde(rename_all = "camelCase")] +pub struct SearchResultsDto { + pub moments: Vec, + pub spoken: Vec, + pub files: Vec, +} + +/// Whether the SigLIP2 model is installed, plus enough to prompt a download. +/// Mirrors `transcribe.rs`'s `ModelStatusDto`. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct SearchModelStatusDto { + /// True when both encoder files + tokenizer are present on disk. + pub installed: bool, + /// Human label for the model (`"siglip2-base-patch16-256"`). + pub model: String, + /// Approximate combined download size in bytes (image + text encoder + + /// tokenizer), for the prompt. `0` until the ONNX assets are hosted. + pub bytes: i64, +} + +/// Visual-index coverage for the project's indexable (video/image) assets. +/// Drives the panel's indexing affordance (upstream `SearchIndexCoordinator`'s +/// `batchTotal`/`batchCompleted`, surfaced here as a snapshot the UI polls or +/// receives via `search://index` events). +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Default)] +#[serde(rename_all = "camelCase")] +pub struct SearchIndexStatusDto { + /// The model must be installed before anything can be indexed. + pub model_installed: bool, + /// Count of video/image assets in the project (upstream `indexableAssets`). + pub indexable: usize, + /// How many of those already have a current on-disk embedding index. + pub indexed: usize, +} + +/// Progress payload for the `search://progress` (model download) event. +#[derive(Clone, Debug, Serialize, PartialEq)] +#[serde(rename_all = "camelCase")] +struct DownloadProgress { + fraction: f64, +} + +/// Progress payload for the `search://index` event: `completed`/`total` assets +/// plus the current asset's fraction (mirrors the coordinator's progress ring +/// math in `MediaTab+IndexStatus.swift`). +#[derive(Clone, Debug, Serialize, PartialEq)] +#[serde(rename_all = "camelCase")] +struct IndexProgress { + completed: usize, + total: usize, + fraction: f64, +} + +// MARK: - Pure helpers (testable without the ONNX backend) + +/// The combined model download size (image + text encoder + tokenizer). +fn model_bytes() -> i64 { + let m = search_config::manifest(); + m.image_encoder.bytes + m.text_encoder.bytes + m.tokenizer.bytes +} + +/// Convert source seconds to a source frame with upstream's **truncating** +/// `secondsToFrame` (`Int(s*fps)`, not rounding). `fps <= 0` falls back to 30. +fn seconds_to_frame(seconds: f64, fps: i32) -> i64 { + let fps = if fps > 0 { fps as f64 } else { 30.0 }; + (seconds.max(0.0) * fps) as i64 +} + +/// Name-substring match for the Files group — case-insensitive `contains`, the +/// zero-setup fallback (upstream `passesFilters`' `localizedCaseInsensitiveContains`). +/// Returns matches in manifest (insertion) order to mirror the default +/// `.dateAdded` sort. Never mutates the input. +fn file_matches(entries: &[(String, String)], query: &str) -> Vec { + let q = query.trim().to_lowercase(); + if q.is_empty() { + return Vec::new(); + } + entries + .iter() + .filter(|(_, name)| name.to_lowercase().contains(&q)) + .map(|(id, _)| FileHitDto { + media_id: id.clone(), + score: 1.0, + }) + .collect() +} + +/// Project a spoken (transcript keyword) hit into its DTO. Score is a fixed +/// `1.0` (keyword matches are unranked; ordering is transcript order). +fn spoken_dto(h: &opentake_media::SpokenHit) -> SpokenHitDto { + SpokenHitDto { + media_id: h.asset_id.clone(), + start_sec: h.start, + end_sec: h.end, + text: h.text.clone(), + score: 1.0, + } +} + +/// Project a visual rank `Hit` into its Moments DTO. `is_image` is true when the +/// shot is zero-length (`shot_start == shot_end`, upstream's still-image row). +fn moment_dto(h: &opentake_media::Hit, fps: i32) -> MomentHitDto { + let is_image = h.shot_end <= h.shot_start; + MomentHitDto { + media_id: h.asset_id.clone(), + frame: seconds_to_frame(h.shot_start, fps), + start_sec: h.shot_start, + end_sec: h.shot_end, + score: h.score, + is_image, + } +} + +/// One indexable/searchable asset resolved from the live manifest: id, absolute +/// source path, and kind (drives visual vs. spoken candidacy). +struct ResolvedAsset { + id: String, + name: String, + path: PathBuf, + kind: ClipType, +} + +/// Resolve every manifest asset to `(id, name, path, kind)`, dropping any whose +/// path can't be resolved. Offline (missing) files are kept — indexing/search +/// skip them at read time, matching upstream (a missing file simply yields no +/// index rather than dropping the asset). +fn resolve_assets(core: &AppCore) -> Vec { + let manifest = core.media(); + let project_dir = core.project_dir(); + let resolver = MediaResolver::new(&manifest, project_dir.as_deref()); + manifest + .entries + .iter() + .filter_map(|e| { + let path = resolver.expected_path(&e.id)?; + Some(ResolvedAsset { + id: e.id.clone(), + name: e.name.clone(), + path, + kind: e.kind, + }) + }) + .collect() +} + +/// A visual asset is a video or image (upstream `type == .video || .image`). +fn is_visual(kind: ClipType) -> bool { + matches!(kind, ClipType::Video | ClipType::Image) +} + +/// A spoken-searchable asset is a video or audio (upstream candidate filter in +/// `scheduleMomentSearch` / `spokenResults`). +fn is_spoken(kind: ClipType) -> bool { + matches!(kind, ClipType::Video | ClipType::Audio) +} + +// MARK: - Commands + +/// `search_model_status`: report whether the SigLIP2 ONNX model is installed. +/// Never downloads. The panel calls this to decide whether to show the +/// "Smart search" download affordance (upstream `MediaTab+IndexStatus.swift`). +#[tauri::command] +pub fn search_model_status(media: State<'_, MediaState>) -> SearchModelStatusDto { + let models_dir = media.engine().models_dir(); + let manifest = search_config::manifest(); + SearchModelStatusDto { + installed: opentake_media::search::model_download::installed(models_dir, &manifest) + .is_some(), + model: manifest.model.clone(), + bytes: model_bytes(), + } +} + +/// `download_search_model`: fetch the SigLIP2 ONNX assets (idempotent), emit +/// `search://progress` events as bytes arrive, and SHA-256-verify each file +/// before installing — exactly the machinery `search::model_download::install` +/// provides. Async (network-bound) so it never blocks the UI. Returns the +/// installed status on success. +#[tauri::command] +pub async fn download_search_model( + app: AppHandle, + media: State<'_, MediaState>, +) -> Result { + let models_dir = media.engine().models_dir().to_path_buf(); + let manifest = search_config::manifest(); + let base_url = search_config::MODEL_DOWNLOAD_BASE_URL; + let on_progress = |fraction: f64| { + let _ = app.emit("search://progress", DownloadProgress { fraction }); + }; + opentake_media::search::model_download::install(&models_dir, &manifest, base_url, on_progress) + .await + .map_err(|e| e.to_string())?; + Ok(SearchModelStatusDto { + installed: true, + model: manifest.model.clone(), + bytes: model_bytes(), + }) +} + +/// `search_index_status`: snapshot how much of the project's indexable (video/ +/// image) media already has a current on-disk embedding index. Never indexes. +/// The panel uses it to decide whether to offer "index now" and to show the +/// progress ring's denominator. +#[tauri::command] +pub fn search_index_status( + core: State<'_, AppCore>, + media: State<'_, MediaState>, +) -> SearchIndexStatusDto { + let engine = media.engine(); + let models_dir = engine.models_dir(); + let manifest = search_config::manifest(); + let model_installed = + opentake_media::search::model_download::installed(models_dir, &manifest).is_some(); + let spec = search_config::embedder_spec(); + let assets = resolve_assets(&core); + let visual: Vec<&ResolvedAsset> = assets.iter().filter(|a| is_visual(a.kind)).collect(); + let indexed = visual + .iter() + .filter(|a| !opentake_media::search::needs_index(engine.cache_root(), &a.path, &spec)) + .count(); + SearchIndexStatusDto { + model_installed, + indexable: visual.len(), + indexed, + } +} + +/// `search_index_start`: index every not-yet-current video/image asset in the +/// project (sampled frames → SigLIP2 embeddings → `PALMEMB1` store), emitting +/// `search://index` progress as each asset completes. Idempotent — already-current +/// assets are skipped by the indexer. Errors if the model isn't installed +/// (guiding the UI to `download_search_model`). Runs the CPU/GPU-bound inference +/// on Tauri's worker thread (the command is sync, so Tauri dispatches it off the +/// UI thread, matching `transcribe_media`). The ONNX backend is always enabled in +/// the shipped app (`opentake-media`'s `ort-backend` feature), so this calls the +/// SigLIP2 embedder directly, mirroring how `transcribe.rs` calls whisper. +#[tauri::command] +pub fn search_index_start( + app: AppHandle, + core: State<'_, AppCore>, + media: State<'_, MediaState>, +) -> Result { + let engine = media.engine(); + let embedder = load_embedder(engine)?; + let assets = resolve_assets(&core); + index_assets(app, engine, &assets, &embedder)?; + // Return a fresh status snapshot so the UI settles on the final counts. + Ok(index_status_snapshot(engine, &assets)) +} + +/// `search_query`: run the three-group content query — Moments (visual, when the +/// model is installed), Spoken (transcript keyword), Files (name match). Matches +/// upstream's groups, caps, and order (`MediaTab+Search.swift`). Visual is +/// best-effort: with no installed model (or an all-unindexed project) `moments` +/// is empty and Spoken + Files still return — so plain filename filtering is the +/// zero-setup fallback. Never errors on a missing model (an empty query returns +/// empty groups). +#[tauri::command] +pub fn search_query( + core: State<'_, AppCore>, + media: State<'_, MediaState>, + query: String, +) -> SearchResultsDto { + let trimmed = query.trim().to_string(); + if trimmed.is_empty() { + return SearchResultsDto::default(); + } + let engine = media.engine(); + let assets = resolve_assets(&core); + let fps = core.get_timeline().timeline.fps; + + // Files: name-substring over every asset (the zero-setup fallback). + let name_entries: Vec<(String, String)> = assets + .iter() + .map(|a| (a.id.clone(), a.name.clone())) + .collect(); + let files = file_matches(&name_entries, &trimmed); + + // Spoken: keyword over cached transcripts of video/audio assets. + let spoken_candidates: Vec<(String, PathBuf)> = assets + .iter() + .filter(|a| is_spoken(a.kind)) + .map(|a| (a.id.clone(), a.path.clone())) + .collect(); + let spoken: Vec = engine + .search_spoken(&trimmed, &spoken_candidates, SEARCH_LIMIT) + .iter() + .map(spoken_dto) + .collect(); + + // Moments: visual rank over on-disk embedding indexes (needs the model). + let moments = search_visual(engine, &assets, &trimmed, fps); + + SearchResultsDto { + moments, + spoken, + files, + } +} + +/// Visual query for the panel: rank the project's visual assets, capped at the +/// panel default [`SEARCH_LIMIT`]. Delegates to [`visual_hits_by_id`]. +fn search_visual( + engine: &MediaEngine, + assets: &[ResolvedAsset], + query: &str, + fps: i32, +) -> Vec { + let id_paths: Vec<(String, PathBuf)> = assets + .iter() + .filter(|a| is_visual(a.kind)) + .map(|a| (a.id.clone(), a.path.clone())) + .collect(); + visual_hits_by_id(engine, &id_paths, query, fps, SEARCH_LIMIT) +} + +/// Rank `query` against the on-disk embedding indexes of the given visual assets +/// (`(id, path)` pairs), returning up to `limit` Moments hits. Loads the +/// installed model, encodes the text query, loads each asset's `.embed` index +/// (skipping missing/stale), and ranks best-per-shot with the `min_score` floor +/// then `limit` + relative cutoff — the exact upstream order. Empty when the +/// model isn't installed, the query can't encode, or nothing is indexed. Shared +/// by the panel query and the `search_media` MCP bridge so both rank identically. +pub(crate) fn visual_hits_by_id( + engine: &MediaEngine, + id_paths: &[(String, PathBuf)], + query: &str, + fps: i32, + limit: usize, +) -> Vec { + use opentake_media::search::embed_store; + + let Ok(embedder) = load_embedder(engine) else { + return Vec::new(); + }; + // Encode the text query once; a failure yields no visual hits. + let Ok(vector) = opentake_media::search::Embedder::encode_text(&embedder, query) else { + return Vec::new(); + }; + + // Load each asset's current index (skip missing/stale silently). + let mut indexes: Vec<(String, embed_store::AssetIndex)> = Vec::new(); + for (id, path) in id_paths { + let Some(key) = embed_store::key(path) else { + continue; + }; + if let Ok(index) = embed_store::load(engine.cache_root(), &key) { + indexes.push((id.clone(), index)); + } + } + if indexes.is_empty() { + return Vec::new(); + } + + opentake_media::search_visual_ranked( + &vector, + &indexes, + limit, + RELATIVE_CUTOFF, + Some(VISUAL_MATCH_COSINE_FLOOR), + ) + .iter() + .map(|h| moment_dto(h, fps)) + .collect() +} + +/// Compute the visual-index coverage for a set of visual asset `(id, path)` +/// pairs: `(indexable, indexed)`. Shared by the `search_media` bridge for its +/// `indexableAssets`/`indexedAssets` fields. `indexed` counts assets whose +/// on-disk embedding index is current for the configured model. +pub(crate) fn visual_coverage( + engine: &MediaEngine, + id_paths: &[(String, PathBuf)], +) -> (usize, usize) { + let spec = search_config::embedder_spec(); + let indexed = id_paths + .iter() + .filter(|(_, path)| !opentake_media::search::needs_index(engine.cache_root(), path, &spec)) + .count(); + (id_paths.len(), indexed) +} + +/// True when the SigLIP2 model is installed. Shared with the `search_media` +/// bridge to pick its `status` string. +pub(crate) fn model_installed(engine: &MediaEngine) -> bool { + let manifest = search_config::manifest(); + opentake_media::search::model_download::installed(engine.models_dir(), &manifest).is_some() +} + +// MARK: - ort-backed indexing internals + +/// Load the installed SigLIP2 embedder, or a structured "model not installed" +/// error the UI turns into a download prompt. Mirrors `transcribe.rs`'s +/// `load_backend`. +fn load_embedder(engine: &MediaEngine) -> Result { + let models_dir = engine.models_dir(); + let manifest = search_config::manifest(); + let installed = opentake_media::search::model_download::installed(models_dir, &manifest) + .ok_or_else(|| { + format!( + "visual search model not installed — download '{}' first", + manifest.model + ) + })?; + let tokenizer_json = installed.tokenizer_folder.join("tokenizer.json"); + opentake_media::search::OrtEmbedder::new( + &installed.image_encoder, + &installed.text_encoder, + &tokenizer_json, + installed.spec, + ) + .map_err(|e| e.to_string()) +} + +/// Index every not-yet-current video/image asset, emitting a `search://index` +/// event as each completes. The single-worker sequential loop mirrors the +/// coordinator's `ensureWorker` queue (`SearchIndexCoordinator.swift:139-160`) — +/// one asset at a time, in manifest order — kept simple here (Tauri already runs +/// the command off the UI thread; a background queue is a later refinement). +fn index_assets( + app: AppHandle, + engine: &MediaEngine, + assets: &[ResolvedAsset], + embedder: &opentake_media::search::OrtEmbedder, +) -> Result<(), String> { + use opentake_media::search::Embedder; + use opentake_media::search::{ + index_image, index_video, needs_index, CancelToken, SamplerOptions, + }; + + let spec = Embedder::spec(embedder).clone(); + let cache_root = engine.cache_root(); + let cancel = CancelToken::new(); + let opts = SamplerOptions::default(); + + // Only assets that actually need work (idempotent), preserving order. + let pending: Vec<&ResolvedAsset> = assets + .iter() + .filter(|a| is_visual(a.kind) && needs_index(cache_root, &a.path, &spec)) + .collect(); + let total = pending.len(); + if total == 0 { + let _ = app.emit( + "search://index", + IndexProgress { + completed: 0, + total: 0, + fraction: 1.0, + }, + ); + return Ok(()); + } + + for (i, a) in pending.iter().enumerate() { + // Per-asset progress: forward the sampler's fraction into the batch. + let base = i; + let on_progress = |frac: f64| { + let _ = app.emit( + "search://index", + IndexProgress { + completed: base, + total, + fraction: (base as f64 + frac.clamp(0.0, 1.0)) / total as f64, + }, + ); + }; + + // A per-asset failure (offline file, decode error) is skipped — one bad + // clip must not abort the batch (upstream `failedIds.insert` + continue). + let result = match a.kind { + ClipType::Image => match engine.image_thumbnail(&a.path) { + // Reuse the decoded thumbnail as the still's frame; a full-res + // decode is unnecessary for a single squash-resized embedding. + Ok(frame) => index_image(cache_root, &a.path, &frame, embedder, &cancel), + Err(e) => Err(e), + }, + ClipType::Video => { + // Probe the source for its true duration/dimensions so the sampler + // walks the whole clip (the manifest duration may be stale). + let (duration, width, height) = match engine.probe(&a.path) { + Ok(p) => (p.duration_secs, p.width.unwrap_or(0), p.height.unwrap_or(0)), + Err(e) => { + eprintln!("[search] probe failed {}: {e}", a.path.display()); + emit_completed(&app, i + 1, total); + continue; + } + }; + index_video( + cache_root, + &a.path, + duration, + width, + height, + embedder, + &opts, + &cancel, + Some(&on_progress), + ) + } + _ => Ok(()), + }; + if let Err(e) = result { + eprintln!("[search] index failed {}: {e}", a.path.display()); + } + emit_completed(&app, i + 1, total); + } + Ok(()) +} + +/// Emit a batch-completed progress tick (`completed`/`total`, fraction settled). +fn emit_completed(app: &AppHandle, completed: usize, total: usize) { + let fraction = if total > 0 { + completed as f64 / total as f64 + } else { + 1.0 + }; + let _ = app.emit( + "search://index", + IndexProgress { + completed, + total, + fraction, + }, + ); +} + +/// A post-index status snapshot (model installed + indexable/indexed counts). +fn index_status_snapshot(engine: &MediaEngine, assets: &[ResolvedAsset]) -> SearchIndexStatusDto { + let manifest = search_config::manifest(); + let model_installed = + opentake_media::search::model_download::installed(engine.models_dir(), &manifest).is_some(); + let spec = search_config::embedder_spec(); + let visual: Vec<&ResolvedAsset> = assets.iter().filter(|a| is_visual(a.kind)).collect(); + let indexed = visual + .iter() + .filter(|a| !opentake_media::search::needs_index(engine.cache_root(), &a.path, &spec)) + .count(); + SearchIndexStatusDto { + model_installed, + indexable: visual.len(), + indexed, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // --- pure DTO / merge / cap logic (no ort, no ffmpeg) --- + + #[test] + fn seconds_to_frame_truncates_like_upstream() { + // Int(s*fps) truncation, not rounding: 1.99s @ 30fps → 59, not 60. + assert_eq!(seconds_to_frame(1.99, 30), 59); + assert_eq!(seconds_to_frame(2.0, 30), 60); + assert_eq!(seconds_to_frame(0.0, 30), 0); + // Non-positive fps falls back to 30. + assert_eq!(seconds_to_frame(1.0, 0), 30); + assert_eq!(seconds_to_frame(-5.0, 30), 0); // negative clamps to 0 + } + + #[test] + fn file_matches_is_case_insensitive_substring_in_order() { + let entries = vec![ + ("a".into(), "Sunset Beach.mp4".into()), + ("b".into(), "harbor.mov".into()), + ("c".into(), "SUNSET timelapse.mp4".into()), + ]; + let hits = file_matches(&entries, "sunset"); + // Both "Sunset" assets match, in manifest order; "harbor" doesn't. + assert_eq!(hits.len(), 2); + assert_eq!(hits[0].media_id, "a"); + assert_eq!(hits[1].media_id, "c"); + assert!(hits.iter().all(|h| h.score == 1.0)); + } + + #[test] + fn file_matches_empty_query_is_empty() { + let entries = vec![("a".into(), "x.mp4".into())]; + assert!(file_matches(&entries, "").is_empty()); + assert!(file_matches(&entries, " ").is_empty()); + } + + #[test] + fn spoken_dto_projects_fields_with_fixed_score() { + let h = opentake_media::SpokenHit { + asset_id: "a1".into(), + start: 1.5, + end: 2.5, + text: "the budget plan".into(), + }; + let dto = spoken_dto(&h); + assert_eq!(dto.media_id, "a1"); + assert_eq!(dto.start_sec, 1.5); + assert_eq!(dto.end_sec, 2.5); + assert_eq!(dto.text, "the budget plan"); + assert_eq!(dto.score, 1.0); + } + + #[test] + fn moment_dto_marks_zero_length_shot_as_image() { + // A still: shot_start == shot_end → is_image true, no meaningful range. + let still = opentake_media::Hit { + asset_id: "img".into(), + time: 0.0, + shot_start: 0.0, + shot_end: 0.0, + score: 0.9, + }; + let d = moment_dto(&still, 30); + assert!(d.is_image); + assert_eq!(d.frame, 0); + + // A video shot: range present, frame = trunc(shot_start*fps). + let vid = opentake_media::Hit { + asset_id: "vid".into(), + time: 3.2, + shot_start: 3.0, + shot_end: 6.0, + score: 0.8, + }; + let d = moment_dto(&vid, 30); + assert!(!d.is_image); + assert_eq!(d.frame, 90); + assert_eq!(d.start_sec, 3.0); + assert_eq!(d.end_sec, 6.0); + } + + #[test] + fn is_visual_and_is_spoken_partition_kinds_like_upstream() { + assert!(is_visual(ClipType::Video)); + assert!(is_visual(ClipType::Image)); + assert!(!is_visual(ClipType::Audio)); + assert!(is_spoken(ClipType::Video)); + assert!(is_spoken(ClipType::Audio)); + assert!(!is_spoken(ClipType::Image)); // images have nothing spoken + } + + // --- DTO serde round-trips (camelCase wire contract) --- + + #[test] + fn moment_hit_dto_is_camel_case_and_round_trips() { + let dto = MomentHitDto { + media_id: "m1".into(), + frame: 90, + start_sec: 3.0, + end_sec: 6.0, + score: 0.8, + is_image: false, + }; + let json = serde_json::to_string(&dto).unwrap(); + assert!(json.contains("\"mediaId\":\"m1\"")); + assert!(json.contains("\"startSec\":3.0")); + assert!(json.contains("\"endSec\":6.0")); + assert!(json.contains("\"isImage\":false")); + let back: MomentHitDto = serde_json::from_str(&json).unwrap(); + assert_eq!(dto, back); + } + + #[test] + fn spoken_hit_dto_camel_case_round_trips() { + let dto = SpokenHitDto { + media_id: "m1".into(), + start_sec: 1.0, + end_sec: 2.0, + text: "hello".into(), + score: 1.0, + }; + let json = serde_json::to_string(&dto).unwrap(); + assert!(json.contains("\"mediaId\":\"m1\"")); + assert!(json.contains("\"startSec\":1.0")); + let back: SpokenHitDto = serde_json::from_str(&json).unwrap(); + assert_eq!(dto, back); + } + + #[test] + fn search_results_dto_round_trips_all_groups() { + let dto = SearchResultsDto { + moments: vec![MomentHitDto { + media_id: "v".into(), + frame: 0, + start_sec: 0.0, + end_sec: 1.0, + score: 0.7, + is_image: false, + }], + spoken: vec![SpokenHitDto { + media_id: "a".into(), + start_sec: 0.0, + end_sec: 1.0, + text: "x".into(), + score: 1.0, + }], + files: vec![FileHitDto { + media_id: "f".into(), + score: 1.0, + }], + }; + let json = serde_json::to_string(&dto).unwrap(); + assert!(json.contains("\"moments\":")); + assert!(json.contains("\"spoken\":")); + assert!(json.contains("\"files\":")); + let back: SearchResultsDto = serde_json::from_str(&json).unwrap(); + assert_eq!(dto, back); + } + + #[test] + fn search_model_status_dto_camel_case() { + let dto = SearchModelStatusDto { + installed: false, + model: "siglip2-base-patch16-256".into(), + bytes: 0, + }; + let json = serde_json::to_string(&dto).unwrap(); + assert!(json.contains("\"installed\":false")); + assert!(json.contains("\"model\":\"siglip2-base-patch16-256\"")); + let back: SearchModelStatusDto = serde_json::from_str(&json).unwrap(); + assert_eq!(dto, back); + } + + #[test] + fn search_index_status_dto_camel_case_round_trips() { + let dto = SearchIndexStatusDto { + model_installed: true, + indexable: 5, + indexed: 2, + }; + let json = serde_json::to_string(&dto).unwrap(); + assert!(json.contains("\"modelInstalled\":true")); + assert!(json.contains("\"indexable\":5")); + assert!(json.contains("\"indexed\":2")); + let back: SearchIndexStatusDto = serde_json::from_str(&json).unwrap(); + assert_eq!(dto, back); + } + + #[test] + fn download_and_index_progress_are_camel_case() { + let d = DownloadProgress { fraction: 0.5 }; + assert_eq!(serde_json::to_string(&d).unwrap(), "{\"fraction\":0.5}"); + let ip = IndexProgress { + completed: 1, + total: 4, + fraction: 0.25, + }; + let json = serde_json::to_string(&ip).unwrap(); + assert!(json.contains("\"completed\":1")); + assert!(json.contains("\"total\":4")); + assert!(json.contains("\"fraction\":0.25")); + } +} diff --git a/web/src/components/media/MediaPanel.tsx b/web/src/components/media/MediaPanel.tsx index 398fd8a..1d11a04 100644 --- a/web/src/components/media/MediaPanel.tsx +++ b/web/src/components/media/MediaPanel.tsx @@ -47,6 +47,7 @@ import { saveDialog } from "../../lib/dialog"; import type { MediaFolder, MediaItem } from "../../lib/types"; import { MediaTabBar, MediaSubTabBar } from "./MediaTabBar"; import { CaptionsTab } from "./CaptionsTab"; +import { MediaSearchResults } from "./MediaSearch"; import { useFavoritesStore, useIsFavorite } from "./favorites"; /** MIME-ish type used on dataTransfer when dragging a media item to the timeline. */ @@ -292,7 +293,17 @@ function MediaTab({ kind }: { kind: MediaTabKind }) { )} - {isEmpty ? ( + {query !== "" ? ( + // Smart search: three result groups (Moments / Spoken / Files) + the + // index-status affordance. `filteredItems` is the name-matched Files group + // (already scoped to the current main/subtab). Moments/Spoken come from + // the backend query; they degrade to empty with no model, leaving Files. + i.type === "video" || i.type === "image")} + /> + ) : isEmpty ? ( ) : ( 0 ? bytes / (1024 * 1024) : FALLBACK_MODEL_MB; + return `${Math.round(mb)} MB`; +} + +/** The visual-index lifecycle the affordance renders. */ +type IndexPhase = + | { kind: "hidden" } + | { kind: "needsModel" } + | { kind: "downloading"; fraction: number } + | { kind: "readyToIndex" } + | { kind: "indexing"; done: number; total: number; fraction: number } + | { kind: "failed" }; + +/** + * The full search view: index-status affordance + the three result groups. + * `nameMatches` is the caller's already name-filtered items (the Files group). + */ +export function MediaSearchResults({ + query, + nameMatches, + hasIndexableAssets, +}: { + query: string; + nameMatches: MediaItem[]; + hasIndexableAssets: boolean; +}) { + const t = useT(); + const [results, setResults] = useState({ moments: [], spoken: [], files: [] }); + const phase = useSearchIndexPhase(hasIndexableAssets); + + // Debounced backend query for Moments + Spoken. Files come from `nameMatches`. + const reqId = useRef(0); + useEffect(() => { + const q = query.trim(); + if (q === "") { + setResults({ moments: [], spoken: [], files: [] }); + return; + } + const id = ++reqId.current; + const handle = window.setTimeout(() => { + void searchQueryApi(q).then((r) => { + // Ignore a stale response (a newer query superseded this one). + if (id === reqId.current) setResults(r); + }); + }, SEARCH_DEBOUNCE_MS); + return () => window.clearTimeout(handle); + }, [query]); + + const { moments, spoken } = results; + const isEmpty = moments.length === 0 && spoken.length === 0 && nameMatches.length === 0; + + return ( +
+ + + {moments.length > 0 && ( + + + {moments.map((hit, i) => ( + + ))} + + + )} + + {spoken.length > 0 && ( + +
+ {spoken.map((hit, i) => ( + + ))} +
+
+ )} + + {nameMatches.length > 0 && ( + + + {nameMatches.map((item) => ( + + ))} + + + )} + + {isEmpty && ( +
+ {t("search.noMatches", { query: query.trim() })} +
+ )} +
+ ); +} + +/** Poll model + index status and subscribe to progress, deriving the affordance + * phase. Mirrors upstream `searchIndexStatus`'s state machine. */ +function useSearchIndexPhase(hasIndexableAssets: boolean): IndexPhase { + const [phase, setPhase] = useState({ kind: "hidden" }); + const mediaCount = useMediaStore((s) => s.items.length); + + // Re-evaluate whenever the library changes (new assets → maybe need indexing). + const refresh = useCallback(async () => { + const status = await searchIndexStatus(); + if (!status.modelInstalled) { + setPhase(hasIndexableAssets ? { kind: "needsModel" } : { kind: "hidden" }); + return; + } + if (status.indexable > 0 && status.indexed < status.indexable) { + setPhase({ kind: "readyToIndex" }); + } else { + setPhase({ kind: "hidden" }); + } + }, [hasIndexableAssets]); + + useEffect(() => { + let cancelled = false; + void refresh().catch(() => { + if (!cancelled) setPhase({ kind: "hidden" }); + }); + return () => { + cancelled = true; + }; + }, [refresh, mediaCount]); + + // Live download + indexing progress events keep the ring moving. + useEffect(() => { + let offDownload = () => {}; + let offIndex = () => {}; + void onSearchModelProgress((fraction) => { + setPhase((p) => + p.kind === "downloading" || p.kind === "needsModel" ? { kind: "downloading", fraction } : p, + ); + }).then((off) => (offDownload = off)); + void onSearchIndexProgress(({ completed, total, fraction }) => { + if (total === 0) { + void refresh(); + return; + } + setPhase({ kind: "indexing", done: completed, total, fraction }); + // On the final tick, settle back to the resting state. + if (completed >= total) void refresh(); + }).then((off) => (offIndex = off)); + return () => { + offDownload(); + offIndex(); + }; + }, [refresh]); + + // Expose setters through a module ref so the button handlers can drive it. + phaseSetterRef.current = setPhase; + return phase; +} + +/** Lets the affordance's buttons flip the phase optimistically before the async + * command's first progress event lands (module ref — avoids prop threading). */ +const phaseSetterRef: { current: ((p: IndexPhase) => void) | null } = { current: null }; + +/** The status affordance: a download/enable button (no model) or a progress ring + * (downloading / indexing). Hidden when nothing needs attention (upstream + * `MediaTab+IndexStatus.swift`). */ +function SearchIndexAffordance({ phase }: { phase: IndexPhase }) { + const t = useT(); + + const onDownload = useCallback(() => { + phaseSetterRef.current?.({ kind: "downloading", fraction: 0 }); + void downloadSearchModel() + .then(() => phaseSetterRef.current?.({ kind: "readyToIndex" })) + .catch(() => phaseSetterRef.current?.({ kind: "failed" })); + }, []); + + const onIndex = useCallback(() => { + phaseSetterRef.current?.({ kind: "indexing", done: 0, total: 1, fraction: 0 }); + void searchIndexStart() + .then(() => phaseSetterRef.current?.({ kind: "hidden" })) + .catch(() => phaseSetterRef.current?.({ kind: "hidden" })); + }, []); + + if (phase.kind === "hidden") return null; + + const barStyle: React.CSSProperties = { + display: "flex", + alignItems: "center", + gap: "var(--space-xs)", + padding: "var(--space-xs) var(--space-sm)", + margin: "var(--space-xs) var(--space-sm) 0", + borderRadius: "var(--radius-sm)", + background: "var(--bg-raised)", + border: "var(--bw-thin) solid var(--border-subtle)", + fontSize: "var(--fs-xs)", + color: "var(--text-secondary)", + }; + + if (phase.kind === "needsModel") { + return ( + + ); + } + if (phase.kind === "readyToIndex") { + return ( + + ); + } + if (phase.kind === "failed") { + return ( + + ); + } + // downloading | indexing → progress ring + label. + const fraction = phase.fraction; + const label = + phase.kind === "downloading" + ? t("search.downloading", { percent: Math.round(phase.fraction * 100) }) + : t("search.indexing", { done: Math.min(phase.done + 1, phase.total), total: phase.total }); + return ( +
+ + {label} +
+ ); +} + +/** A small SVG progress ring (upstream `progressRing`). */ +function ProgressRing({ value }: { value: number }) { + const v = Math.max(0.03, Math.min(1, value)); + const size = 14; + const stroke = 2; + const r = (size - stroke) / 2; + const c = 2 * Math.PI * r; + return ( + + + + + ); +} + +/** A collapsible-looking group header + body (upstream `momentHeader`). */ +function Group({ + icon, + label, + count, + children, +}: { + icon: typeof Film; + label: string; + count: number; + children: React.ReactNode; +}) { + return ( +
+
+ + {label} + + {count} + +
+ {children} +
+ ); +} + +/** The adaptive grid the Moments + Files groups use (upstream `resultsGrid`). */ +function ResultsGrid({ children }: { children: React.ReactNode }) { + return ( +
+ {children} +
+ ); +} + +/** Async frame thumbnail for a search hit at a specific source-second time. */ +function HitThumbnail({ mediaId, timeSec, alt }: { mediaId: string; timeSec: number; alt: string }) { + const [path, setPath] = useState(null); + useEffect(() => { + let cancelled = false; + void generateThumbnail(mediaId, { timeSecs: timeSec, includeSprite: false }).then((r) => { + if (!cancelled) setPath(r?.thumbnailPath ?? null); + }); + return () => { + cancelled = true; + }; + }, [mediaId, timeSec]); + const src = assetUrl(path); + return ( +
+ {src ? ( + {alt} + ) : ( + + )} +
+ ); +} + +/** Look up a media item by id (for the name + drag payload). */ +function useMediaItem(mediaId: string): MediaItem | undefined { + return useMediaStore((s) => s.items.find((m) => m.id === mediaId)); +} + +/** A visual "Moments" card: frame thumb + name + timecode range, draggable to the + * timeline as a trimmed source-range clip (upstream `momentCard`). */ +function MomentCard({ hit }: { hit: MomentHit }) { + const t = useT(); + const item = useMediaItem(hit.mediaId); + const fps = useProjectStore((s) => s.timeline.fps); + const setPreviewMedia = useEditorUiStore((s) => s.setPreviewMedia); + if (!item) return null; + + const onDragStart = (e: React.DragEvent) => { + e.dataTransfer.setData(MEDIA_DND_TYPE, item.id); + e.dataTransfer.effectAllowed = "copy"; + setDraggingMedia(item); + // Stills drag as the whole asset (no meaningful range). + if (!hit.isImage) setDraggingMomentRange({ startSec: hit.startSec, endSec: hit.endSec }); + else setDraggingMomentRange(null); + }; + const onDragEnd = () => { + setDraggingMedia(null); + setDraggingMomentRange(null); + }; + + const startFrames = Math.round(hit.startSec * fps); + const endFrames = Math.round(hit.endSec * fps); + + return ( +
setPreviewMedia(item.id)} + title={t("search.dragToTimeline")} + style={{ display: "flex", flexDirection: "column", gap: 3, cursor: "grab" }} + > + + + {item.name} + + {!hit.isImage && ( + + {formatTimecode(startFrames, fps)}–{formatTimecode(endFrames, fps)} + + )} +
+ ); +} + +/** A "Spoken" transcript row: thumb + text + name·timecode, draggable as a + * trimmed range (upstream `spokenRow`). */ +function SpokenRow({ hit }: { hit: SpokenHit }) { + const t = useT(); + const item = useMediaItem(hit.mediaId); + const fps = useProjectStore((s) => s.timeline.fps); + const setPreviewMedia = useEditorUiStore((s) => s.setPreviewMedia); + if (!item) return null; + + const onDragStart = (e: React.DragEvent) => { + e.dataTransfer.setData(MEDIA_DND_TYPE, item.id); + e.dataTransfer.effectAllowed = "copy"; + setDraggingMedia(item); + setDraggingMomentRange({ startSec: hit.startSec, endSec: hit.endSec }); + }; + const onDragEnd = () => { + setDraggingMedia(null); + setDraggingMomentRange(null); + }; + + return ( +
setPreviewMedia(item.id)} + title={t("search.dragToTimeline")} + style={{ + display: "flex", + gap: "var(--space-sm)", + cursor: "grab", + alignItems: "flex-start", + }} + > +
+ +
+
+ + {hit.text} + + + {item.name} · {formatTimecode(Math.round(hit.startSec * fps), fps)} + +
+
+ ); +} + +/** A "Files" name-match card: thumb + name, draggable as the whole asset (the + * pre-existing behavior; upstream `fileCard`). */ +function FileCard({ item }: { item: MediaItem }) { + const setPreviewMedia = useEditorUiStore((s) => s.setPreviewMedia); + const thumb = item.missing ? null : assetUrl(item.thumbnail); + + const onDragStart = (e: React.DragEvent) => { + e.dataTransfer.setData(MEDIA_DND_TYPE, item.id); + e.dataTransfer.effectAllowed = "copy"; + setDraggingMedia(item); + setDraggingMomentRange(null); // whole asset + }; + const onDragEnd = () => setDraggingMedia(null); + + return ( +
setPreviewMedia(item.id)} + onDoubleClick={() => void addMediaToTimeline(item)} + title={item.name} + style={{ display: "flex", flexDirection: "column", gap: 3, cursor: "grab" }} + > +
+ {thumb ? ( + {item.name} + ) : ( + + )} +
+ + {item.name} + +
+ ); +} diff --git a/web/src/components/timeline/TimelineContainer.tsx b/web/src/components/timeline/TimelineContainer.tsx index 22156c9..6e81ed2 100644 --- a/web/src/components/timeline/TimelineContainer.tsx +++ b/web/src/components/timeline/TimelineContainer.tsx @@ -38,6 +38,7 @@ import { ClipContextMenu } from "./ClipContextMenu"; import { SwapMediaPicker } from "./SwapMediaPicker"; import { MEDIA_DND_TYPE } from "../media/MediaPanel"; import { getDraggingMedia, setDraggingMedia } from "../../lib/mediaDragState"; +import { getDraggingMomentRange, setDraggingMomentRange } from "../../lib/momentDragState"; import { maybeSnapFeedback } from "../../lib/haptic"; import { useProjectStore } from "../../store/projectStore"; import { useEditorUiStore } from "../../store/uiStore"; @@ -1353,7 +1354,13 @@ export function TimelineContainer() { const item = getDraggingMedia(); if (!item) return; const { docX, docY } = toDoc(e); - const durationFrames = edit.mediaDurationFrames(item, timeline.fps); + // A search "Moments"/"Spoken" hit drags a trimmed source range: size the + // ghost to that range (unless it's a still, which places the whole asset). + const momentRange = getDraggingMomentRange(); + const durationFrames = + momentRange && item.type !== "image" && item.duration > 0 + ? edit.momentDurationFrames(momentRange, timeline.fps) + : edit.mediaDurationFrames(item, timeline.fps); const rawStart = frameAt(docX, zoomScale); // Snap the start OR end edge to a clip edge / playhead (multi-probe, sticky // — same engine as a clip move), so the ghost clicks onto neighbours. @@ -1418,11 +1425,14 @@ export function TimelineContainer() { e.stopPropagation(); const id = e.dataTransfer.getData(MEDIA_DND_TYPE); const item = useMediaStore.getState().items.find((m) => m.id === id); + // A search-hit drag carries a source-second range → place a trimmed clip. + const momentRange = getDraggingMomentRange(); // Land exactly where the ghost showed: reuse the resolved plan from the // last dragover (drop is always preceded by a dragover at the same point). const plan = mediaGhostRef.current; clearMediaGhost(); setDraggingMedia(null); + setDraggingMomentRange(null); // Dropping onto the timeline is an HTML5 `drop` (no pointerdown), so the // media-preview→timeline switch in TimelineRegion's onPointerDownCapture // never fires. Clear the selected media here so the preview shows the @@ -1433,7 +1443,17 @@ export function TimelineContainer() { if (plan) { const preferredTrackIndex = plan.newTrackIndex !== null ? null : plan.trackIndex; const insertTrackAt = plan.newTrackIndex !== null ? plan.newTrackIndex : undefined; - void edit.addMediaToTimelineAt(item, plan.startFrame, preferredTrackIndex, insertTrackAt); + if (momentRange) { + void edit.addMomentToTimelineAt( + item, + plan.startFrame, + preferredTrackIndex, + momentRange, + insertTrackAt, + ); + } else { + void edit.addMediaToTimelineAt(item, plan.startFrame, preferredTrackIndex, insertTrackAt); + } return; } // Fallback (no prior ghost, e.g. a foreign drag): resolve from the point. @@ -1442,7 +1462,11 @@ export function TimelineContainer() { const target = dropTargetAt(timeline, docY, trackHeights); const preferredTrackIndex = target.kind === "existing" ? target.trackIndex : null; const insertTrackAt = target.kind === "newTrack" ? target.index : undefined; - void edit.addMediaToTimelineAt(item, startFrame, preferredTrackIndex, insertTrackAt); + if (momentRange) { + void edit.addMomentToTimelineAt(item, startFrame, preferredTrackIndex, momentRange, insertTrackAt); + } else { + void edit.addMediaToTimelineAt(item, startFrame, preferredTrackIndex, insertTrackAt); + } }, [toDoc, zoomScale, timeline, trackHeights, clearMediaGhost], ); diff --git a/web/src/i18n/dict.ts b/web/src/i18n/dict.ts index 7635544..8feb9f8 100644 --- a/web/src/i18n/dict.ts +++ b/web/src/i18n/dict.ts @@ -156,6 +156,24 @@ const zh: Dict = { "media.offline": "媒体离线", "media.relink": "重新链接", + // 智能搜索(视觉语义 + 口播 + 文件名,对应上游 MediaTab+Search / IndexStatus) + "search.group.moments": "画面", + "search.group.spoken": "口播", + "search.group.files": "文件", + "search.smartSearch": "智能搜索", + "search.smartSearchHint": "下载一个 {size} 的本地模型,即可按画面内容搜索媒体。", + "search.downloading": "下载中 {percent}%", + "search.downloadingHint": "正在下载驱动视觉搜索的本地模型。", + "search.preparing": "准备中…", + "search.indexing": "分析中 {done}/{total}", + "search.indexingHint": "正在分析媒体以支持搜索。", + "search.index": "建立索引", + "search.indexHint": "分析本项目的视频/图片,即可按画面内容搜索。", + "search.retry": "重试", + "search.retryHint": "视觉搜索模型下载失败。请检查网络后重试。", + "search.noMatches": "没有匹配 “{query}” 的结果", + "search.dragToTimeline": "拖到时间线以添加此片段", + // 字幕标签(自动转写 + 生成字幕,对应上游 CaptionTab) "captions.source": "来源", "captions.sourceHelp": "有选中片段时用选中片段,否则用全部可转写音频。选择某条轨道可限定范围。", @@ -627,6 +645,24 @@ const en: Dict = { "media.offline": "Media Offline", "media.relink": "Relink", + // Smart search (visual semantic + spoken + filename, upstream MediaTab+Search / IndexStatus) + "search.group.moments": "Moments", + "search.group.spoken": "Spoken", + "search.group.files": "Files", + "search.smartSearch": "Smart search", + "search.smartSearchHint": "Downloads a {size} on-device model so you can search media by what's on screen.", + "search.downloading": "Downloading {percent}%", + "search.downloadingHint": "Downloading the on-device model that powers visual search.", + "search.preparing": "Preparing…", + "search.indexing": "Indexing {done}/{total}", + "search.indexingHint": "Analyzing media so you can search it.", + "search.index": "Build index", + "search.indexHint": "Analyze this project's video/images so you can search by what's on screen.", + "search.retry": "Retry", + "search.retryHint": "Visual search model download failed. Check your connection and try again.", + "search.noMatches": "No matches for “{query}”", + "search.dragToTimeline": "Drag to the timeline to add this segment", + // Captions tab (auto-transcribe + generate captions, upstream CaptionTab) "captions.source": "Source", "captions.sourceHelp": "Uses selected clips when available, otherwise all captionable audio. Choose a track to limit captions.", diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index 2ab8eb7..f0ae86d 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -16,6 +16,9 @@ import type { GenerateCaptionsResult, MediaList, ModelStatus, + SearchIndexStatus, + SearchModelStatus, + SearchResults, SecretStatus, TimelineSnapshot, Transcript, @@ -425,6 +428,87 @@ export async function generateCaptions( throw new Error("caption generation requires the desktop app (whisper)"); } +// MARK: - Semantic search (SigLIP2 visual model + index + query, search-wiring) + +/** Whether the SigLIP2 visual-search model is installed. Never downloads. The + * media panel calls this to decide whether to show the "Smart search" download + * affordance. Outside Tauri there is no backend, so report "not installed". */ +export async function searchModelStatus(): Promise { + await ensureTauri(); + if (invokeImpl) return invokeImpl("search_model_status"); + return { installed: false, model: "", bytes: 0 }; +} + +/** Download the SigLIP2 model (idempotent), emitting `search://progress` events + * as bytes arrive, SHA-256-verified. Rejects outside Tauri (no backend). */ +export async function downloadSearchModel(): Promise { + await ensureTauri(); + if (invokeImpl) return invokeImpl("download_search_model"); + throw new Error("search model download requires the desktop app"); +} + +/** Subscribe to search-model-download progress (`fraction` in 0..=1). No-op + * outside Tauri. */ +export async function onSearchModelProgress( + handler: (fraction: number) => void, +): Promise<() => void> { + await ensureTauri(); + if (!listenImpl) return () => {}; + return listenImpl("search://progress", (e) => { + const p = e.payload as { fraction?: number } | undefined; + if (p && typeof p.fraction === "number") handler(p.fraction); + }); +} + +/** Snapshot how much of the project's video/image media is indexed. Never + * indexes. Outside Tauri report an empty/uninstalled state. */ +export async function searchIndexStatus(): Promise { + await ensureTauri(); + if (invokeImpl) return invokeImpl("search_index_status"); + return { modelInstalled: false, indexable: 0, indexed: 0 }; +} + +/** Index every not-yet-current video/image asset (sampled frames → SigLIP2 + * embeddings), emitting `search://index` progress. Idempotent. Rejects outside + * Tauri or when the model isn't installed. */ +export async function searchIndexStart(): Promise { + await ensureTauri(); + if (invokeImpl) return invokeImpl("search_index_start"); + throw new Error("visual indexing requires the desktop app"); +} + +/** Subscribe to indexing progress: `completed`/`total` assets + overall + * `fraction` (0..=1). No-op outside Tauri. */ +export async function onSearchIndexProgress( + handler: (progress: { completed: number; total: number; fraction: number }) => void, +): Promise<() => void> { + await ensureTauri(); + if (!listenImpl) return () => {}; + return listenImpl("search://index", (e) => { + const p = e.payload as + | { completed?: number; total?: number; fraction?: number } + | undefined; + if ( + p && + typeof p.completed === "number" && + typeof p.total === "number" && + typeof p.fraction === "number" + ) { + handler({ completed: p.completed, total: p.total, fraction: p.fraction }); + } + }); +} + +/** Run the three-group content query — Moments (visual), Spoken (transcript), + * Files (name). Visual is best-effort (empty without a model); Spoken + Files + * always work, so plain filename filtering is the zero-setup fallback. Outside + * Tauri returns empty groups (the panel falls back to its in-memory name filter). */ +export async function searchQuery(query: string): Promise { + await ensureTauri(); + if (invokeImpl) return invokeImpl("search_query", { query }); + return { moments: [], spoken: [], files: [] }; +} + /** * Relink an offline asset to a newly chosen file, KEEPING its id so every clip * that references it recovers in place (the fix for "lost media stays red after diff --git a/web/src/lib/momentDragState.ts b/web/src/lib/momentDragState.ts new file mode 100644 index 0000000..0110937 --- /dev/null +++ b/web/src/lib/momentDragState.ts @@ -0,0 +1,27 @@ +/** + * Shared drag state for "search Moments/Spoken hit → timeline" drags. A search + * hit drags onto the timeline as a *trimmed* source-range clip (only the shot / + * spoken segment lands), mirroring upstream's `assetDragString(forAssetId: + * segment:)`. The hit still uses {@link MEDIA_DND_TYPE} so the existing timeline + * drop machinery (ghost sizing, track resolution) works unchanged; this module + * stashes the source-second range the drop reads to place a trimmed clip instead + * of the whole asset. Module-level (not a store) so reads/writes never re-render. + * + * Cleared whenever the gesture ends (drop or a plain media-card drag starting). + */ + +import type { SourceRange } from "../store/editActions"; + +let range: SourceRange | null = null; + +/** Record the source-second range being dragged from a search hit (or clear + * with `null`). A still image (no range) simply never sets this. */ +export function setDraggingMomentRange(next: SourceRange | null): void { + range = next; +} + +/** The source-second range of the search hit currently dragged, or `null` when + * the active drag is a plain full-asset drag (or none). */ +export function getDraggingMomentRange(): SourceRange | null { + return range; +} diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index 468f726..1b80e3a 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -452,6 +452,66 @@ export interface GenerateCaptionsResult { captionCount: number; } +// MARK: - Semantic search (mirror of src-tauri search.rs DTOs) + +/** Whether the SigLIP2 visual-search model is installed, plus enough to prompt a + * one-time download (mirror of Rust `SearchModelStatusDto`). */ +export interface SearchModelStatus { + installed: boolean; + /** Model identity, e.g. "siglip2-base-patch16-256". */ + model: string; + /** Approximate combined download size in bytes (image + text encoder + tokenizer). */ + bytes: number; +} + +/** Visual-index coverage for the project's video/image assets (mirror of Rust + * `SearchIndexStatusDto`). Drives the panel's "index now" affordance + progress. */ +export interface SearchIndexStatus { + /** The model must be installed before anything can be indexed. */ + modelInstalled: boolean; + /** Count of video/image assets in the project. */ + indexable: number; + /** How many already have a current on-disk embedding index. */ + indexed: number; +} + +/** One visual ("Moments") hit. `frame` is the shot-start in **source frames** + * (thumb + preview anchor); `startSec`/`endSec` are the source-second range used + * to drag a trimmed clip onto the timeline (mirror of Rust `MomentHitDto`). */ +export interface MomentHit { + mediaId: string; + frame: number; + startSec: number; + endSec: number; + score: number; + /** True for still images (no time range → drag as a plain asset). */ + isImage: boolean; +} + +/** One spoken ("Spoken") transcript hit (mirror of Rust `SpokenHitDto`). */ +export interface SpokenHit { + mediaId: string; + startSec: number; + endSec: number; + text: string; + score: number; +} + +/** One filename ("Files") match (mirror of Rust `FileHitDto`). */ +export interface FileHit { + mediaId: string; + score: number; +} + +/** The three-group query result: Moments (visual), Spoken (transcript), Files + * (name), ranked independently and never blended (mirror of Rust + * `SearchResultsDto`). */ +export interface SearchResults { + moments: MomentHit[]; + spoken: SpokenHit[]; + files: FileHit[]; +} + // MARK: - Media catalog (mirror of src-tauri MediaItemDto / MediaListDto) /** One media-library item as returned by `get_media` / `import_*`. `type` is the diff --git a/web/src/store/editActions.test.ts b/web/src/store/editActions.test.ts index 2cdbb6e..39f094b 100644 --- a/web/src/store/editActions.test.ts +++ b/web/src/store/editActions.test.ts @@ -173,8 +173,10 @@ vi.mock("../lib/api", () => ({ import { addMediaToTimeline, addMediaToTimelineAt, + addMomentToTimelineAt, insertTrack, mediaDurationFrames, + momentDurationFrames, pasteClipsAtPlayhead, resolveMediaDropTrack, swapTracks, @@ -417,3 +419,62 @@ describe("mediaDurationFrames", () => { expect(mediaDurationFrames(item, 30)).toBe(1); }); }); + +describe("momentDurationFrames", () => { + it("returns the range length in frames", () => { + expect(momentDurationFrames({ startSec: 3, endSec: 6 }, 30)).toBe(90); + }); + + it("never returns less than one frame for a tiny range", () => { + expect(momentDurationFrames({ startSec: 3, endSec: 3.001 }, 30)).toBe(1); + }); +}); + +describe("addMomentToTimelineAt (trimmed source-range drop from a search hit)", () => { + beforeEach(() => { + srv.reset(); + useProjectStore.getState().setMirror(EMPTY, 0); + useEditorUiStore.setState({ activeFrame: 0, currentFrame: 0, selectedClipIds: new Set() }); + }); + + /** The first video clip's [trimStart, duration, trimEnd] after a placement. */ + function firstVideoTrim(): [number, number, number] { + const tl = useProjectStore.getState().timeline; + const track = tl.tracks.find((t) => t.type === "video"); + const c = track?.clips[0]; + return c ? [c.trimStartFrame, c.durationFrames, c.trimEndFrame] : [-1, -1, -1]; + } + + it("places only the source range as a trimmed clip", async () => { + // 10s @ 30fps = 300 source frames. Range [3s,6s] → trimStart 90, duration 90, + // trimEnd 300-90-90 = 120. Lands at timeline frame 0. + const item: MediaItem = { id: "v", name: "v", type: "video", duration: 10, hasAudio: false }; + await addMomentToTimelineAt(item, 0, null, { startSec: 3, endSec: 6 }); + expect(visualClipStarts()).toEqual([0]); + expect(firstVideoTrim()).toEqual([90, 90, 120]); + }); + + it("clamps a range that runs past the source end", async () => { + // 5s = 150 frames. Range [4s, 9s] would want duration 150 but only 30 frames + // of source remain after trimStart 120 → duration clamps to 30, trimEnd 0. + const item: MediaItem = { id: "v", name: "v", type: "video", duration: 5, hasAudio: false }; + await addMomentToTimelineAt(item, 0, null, { startSec: 4, endSec: 9 }); + expect(firstVideoTrim()).toEqual([120, 30, 0]); + }); + + it("falls back to the whole asset for a still image (no range)", async () => { + // Images have no meaningful sub-range → placed full (default 5s = 150 frames), + // untrimmed. + const item: MediaItem = { id: "i", name: "i", type: "image", duration: 0, hasAudio: false }; + await addMomentToTimelineAt(item, 0, null, { startSec: 0, endSec: 0 }); + expect(firstVideoTrim()).toEqual([0, 150, 0]); + }); + + it("lands the trimmed clip at the drop start frame", async () => { + const item: MediaItem = { id: "v", name: "v", type: "video", duration: 10, hasAudio: false }; + await addMomentToTimelineAt(item, 45, null, { startSec: 1, endSec: 2 }); + expect(visualClipStarts()).toEqual([45]); + // 1s..2s → trimStart 30, duration 30, trimEnd 300-30-30 = 240. + expect(firstVideoTrim()).toEqual([30, 30, 240]); + }); +}); diff --git a/web/src/store/editActions.ts b/web/src/store/editActions.ts index 9593f78..48fffe8 100644 --- a/web/src/store/editActions.ts +++ b/web/src/store/editActions.ts @@ -595,6 +595,37 @@ export function addMediaToTimelineAt( return enqueueMediaAdd(() => addMediaToTimelineAtInner(item, startFrame, preferredTrackIndex, insertTrackAt)); } +/** A source-media sub-range (seconds) to place from a search "Moments"/"Spoken" + * hit: only `[startSec, endSec)` of the asset lands on the timeline as a trimmed + * clip, mirroring upstream's `assetDragString(forAssetId:segment:)`. */ +export interface SourceRange { + startSec: number; + endSec: number; +} + +/** Frames a moment clip occupies on the timeline for a source `[startSec,endSec)` + * range: the range length in frames, clamped to at least one frame. */ +export function momentDurationFrames(range: SourceRange, fps: number): number { + return Math.max(1, Math.round((range.endSec - range.startSec) * fps)); +} + +/** Place only `range` of `item` on the timeline at `startFrame` — a trimmed clip + * (drag from a visual/spoken search hit). Reuses the same track resolution as a + * full-asset drop, then overrides the entry's trim/duration from the range. + * A still image (or a range that covers the whole/none of the source) falls back + * to the plain full-asset placement. */ +export function addMomentToTimelineAt( + item: MediaItem, + startFrame: number, + preferredTrackIndex: number | null, + range: SourceRange, + insertTrackAt?: number, +): Promise { + return enqueueMediaAdd(() => + addMomentToTimelineAtInner(item, startFrame, preferredTrackIndex, range, insertTrackAt), + ); +} + async function addMediaToTimelineInner(item: MediaItem): Promise { let timeline = useProjectStore.getState().timeline; if (firstCompatibleTrackIndex(timeline, item.type) === null) { @@ -656,6 +687,95 @@ async function addMediaToTimelineAtInner( if (isTauri) await forceRefresh(); } +/** Build the trimmed clip entry for a source `[startSec,endSec)` moment range on + * `item`, resolving the target track like a full-asset drop. Returns null when + * no compatible track exists (the caller then inserts one and retries). */ +function entryForMomentAt( + timeline: Timeline, + item: MediaItem, + startFrame: number, + preferredTrackIndex: number | null, + range: SourceRange, +): ClipEntryReq | null { + const fps = timeline.fps; + const totalSource = mediaDurationFrames(item, fps); + const trimStartFrame = Math.max(0, Math.min(totalSource, Math.round(range.startSec * fps))); + const rangeFrames = momentDurationFrames(range, fps); + // Clamp the visible span so trimStart + duration never exceed the source. + const durationFrames = Math.max(1, Math.min(rangeFrames, totalSource - trimStartFrame)); + const trimEndFrame = Math.max(0, totalSource - trimStartFrame - durationFrames); + const trackIndex = firstOpenCompatibleTrackIndex( + timeline, + item.type, + startFrame, + durationFrames, + preferredTrackIndex, + ); + if (trackIndex === null) return null; + return { + mediaRef: item.id, + mediaType: item.type, + sourceClipType: item.type, + trackIndex, + startFrame: Math.max(0, startFrame), + durationFrames, + trimStartFrame, + trimEndFrame, + hasAudio: item.hasAudio, + addLinkedAudio: item.type === "video" && item.hasAudio, + transform: fitTransformForMedia(item.width, item.height, timeline.width, timeline.height), + }; +} + +async function addMomentToTimelineAtInner( + item: MediaItem, + startFrame: number, + preferredTrackIndex: number | null, + range: SourceRange, + insertTrackAt?: number, +): Promise { + // Stills (or a degenerate range) have no meaningful sub-range → full asset. + const spanSec = range.endSec - range.startSec; + if (item.type === "image" || spanSec <= 0 || item.duration <= 0) { + return addMediaToTimelineAtInner(item, startFrame, preferredTrackIndex, insertTrackAt); + } + + let timeline = useProjectStore.getState().timeline; + if (insertTrackAt !== undefined) { + const res = await insertTrack(item.type === "audio" ? "audio" : "video", insertTrackAt); + await forceRefresh(); + timeline = useProjectStore.getState().timeline; + const insertedTrackId = res?.affectedClipIds[0]; + const insertedIndex = insertedTrackId + ? timeline.tracks.findIndex((track) => track.id === insertedTrackId) + : -1; + if (insertedIndex >= 0) preferredTrackIndex = insertedIndex; + } + let entry = entryForMomentAt(timeline, item, Math.max(0, startFrame), preferredTrackIndex, range); + if (!entry) { + const fallbackInsertAt = preferredTrackIndex ?? undefined; + const res = await insertTrack(item.type === "audio" ? "audio" : "video", fallbackInsertAt); + await forceRefresh(); + timeline = useProjectStore.getState().timeline; + const insertedTrackId = res?.affectedClipIds[0]; + const insertedIndex = insertedTrackId + ? timeline.tracks.findIndex((track) => track.id === insertedTrackId) + : -1; + if (insertedIndex >= 0) { + preferredTrackIndex = insertedIndex; + } else if (fallbackInsertAt !== undefined) { + preferredTrackIndex = Math.max(0, Math.min(fallbackInsertAt, timeline.tracks.length - 1)); + } + entry = entryForMomentAt(timeline, item, Math.max(0, startFrame), preferredTrackIndex, range); + } + if (!entry) return; + const res = await addClips([entry]); + if (res && res.affectedClipIds.length > 0) { + useEditorUiStore.getState().selectClips(new Set(res.affectedClipIds)); + } + if (isTauri) await forceRefresh(); +} + // MARK: - Text tool (Toolbar "T" button, SPEC §4) /** Default text clip duration: 3 seconds at the timeline's fps. */