Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
394 changes: 387 additions & 7 deletions crates/opentake-agent/src/mcp/dispatch.rs

Large diffs are not rendered by default.

118 changes: 118 additions & 0 deletions crates/opentake-agent/src/mcp/media_bridge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,101 @@ pub struct TranscriptSourceResult {
pub error: Option<String>,
}

/// One visual ("Moments") hit for `search_media` — a source-second range in one
/// asset, or a still image (no range). Source-second timings, ready to convert to
/// `trimStartFrame`/`trimEndFrame` (upstream `visualResults`' `moments` entries).
///
/// `start_seconds`/`end_seconds` are plain `f64`s, so they are always present on
/// this struct, stills included; check `is_image` before treating them as a range.
#[derive(Debug, Clone)]
pub struct SearchVisualHit {
/// Asset id (`mediaRef`).
pub media_ref: String,
/// Shot-start in source seconds. Carries no meaning when `is_image` is true.
/// NOTE(review): presumably the dispatcher leaves it out of the JSON for
/// stills — confirm against the dispatcher.
pub start_seconds: f64,
/// Shot-end in source seconds. Carries no meaning when `is_image` is true
/// (same caveat as `start_seconds`).
pub end_seconds: f64,
/// Uncalibrated similarity score (ordering only).
pub score: f32,
/// True for still images: no time range → upstream sets `type: "image"`.
pub is_image: bool,
}

/// One spoken ("Spoken") hit for `search_media`: a transcript segment matching
/// every query term (upstream `spokenResults` entries).
#[derive(Debug, Clone)]
pub struct SearchSpokenHit {
/// Asset id (`mediaRef`) of the asset the segment belongs to.
pub media_ref: String,
/// Segment start, in seconds.
/// NOTE(review): presumably source seconds, like the visual hits — confirm.
pub start_seconds: f64,
/// Segment end, in seconds (same time base as `start_seconds`).
pub end_seconds: f64,
/// Text of the matching transcript segment.
pub text: String,
}

/// State of the visual index, reported through the `search_media` `status`
/// field. Mirrors upstream's `visualStatus` string enum
/// (`ToolExecutor+Search.swift:91-100`); [`SearchIndexState::as_str`] yields the
/// exact upstream spelling for the dispatcher to serialize.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SearchIndexState {
    /// The model is installed and everything is (currently) indexed.
    Ready,
    /// The model is installed but indexing has not finished yet.
    Indexing,
    /// The model has not been downloaded yet.
    ModelNotInstalled,
    /// A model download is in flight.
    DownloadingModel,
    /// The model is being loaded/prepared.
    Preparing,
    /// Visual search is disabled (no backend / build without it).
    Disabled,
    /// Loading or downloading the model failed.
    Failed,
}

impl SearchIndexState {
    /// Returns the upstream wire spelling of this state, as used for the
    /// `status` field.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::Ready => "ready",
            Self::Indexing => "indexing",
            Self::ModelNotInstalled => "modelNotInstalled",
            Self::DownloadingModel => "downloadingModel",
            Self::Preparing => "preparing",
            Self::Disabled => "disabled",
            Self::Failed => "failed",
        }
    }
}

/// One asset to search for [`MediaBridge::search_media`]: the dispatcher resolves
/// the candidate set (optionally restricted to one `mediaRef`) and hands these
/// down, since only the bridge can resolve ids to files + read the caches.
///
/// The two flags are independent: per their descriptions a video is both
/// visual- and spoken-searchable, so both may be `true` on one candidate.
#[derive(Debug, Clone)]
pub struct SearchCandidate {
/// Asset id (`mediaRef`).
pub media_ref: String,
/// True for video/image (visual-searchable).
pub is_visual: bool,
/// True for video/audio (spoken-searchable).
pub is_spoken: bool,
}

/// The full `search_media` result the bridge returns; the dispatcher shapes it
/// into the upstream JSON envelope (`status`/`indexableAssets`/`indexedAssets`/
/// `moments`/`spoken`). Groups rank independently and are never blended.
///
/// `status`, `indexable_assets` and `indexed_assets` describe the visual index
/// only; the spoken group does not depend on them.
#[derive(Debug, Clone)]
pub struct SearchMediaResult {
/// The visual index state for the `status` field (see [`SearchIndexState`]).
pub status: SearchIndexState,
/// Count of visual assets in scope (upstream `indexableAssets`).
pub indexable_assets: usize,
/// How many of those already have a current on-disk index
/// (upstream `indexedAssets`); `None` when the model isn't loaded so the
/// count can't be computed (upstream omits the key then).
pub indexed_assets: Option<usize>,
/// Visual hits (empty when `scope == "spoken"` or the index isn't ready).
pub moments: Vec<SearchVisualHit>,
/// Spoken hits (empty when `scope == "visual"`; work regardless of status).
pub spoken: Vec<SearchSpokenHit>,
}

/// The injected capability boundary for the render + import tools. `Send + Sync`
/// so the [`Dispatcher`](super::dispatch::Dispatcher) can hold `Arc<dyn
/// MediaBridge>` across threads (matching [`CoreHandle`](super::core_handle)).
Expand Down Expand Up @@ -191,6 +286,29 @@ pub trait MediaBridge: Send + Sync {
"import_media: importing is not available in this build",
))
}

/// Content search over the media library, in two independent groups: visual
/// (SigLIP2 semantic) and spoken (transcript keyword). Upstream never blends
/// the two rankings.
///
/// * `candidates` — the resolved, in-scope asset set (already narrowed to one
///   `mediaRef` when the caller restricted the search).
/// * `scope` — one of `"visual"`, `"spoken"` or `"both"`.
/// * `limit` — the cap applied to each group separately.
///
/// This default implementation ignores every argument and reports `disabled`
/// with no hits, so a bridge-less build still returns an honest, well-formed
/// result the model can read.
fn search_media(
    &self,
    _candidates: &[SearchCandidate],
    _query: &str,
    _scope: &str,
    _limit: usize,
) -> Result<SearchMediaResult, BridgeError> {
    // No backend behind this default: nothing is indexable and both hit
    // lists stay empty.
    let no_backend = SearchMediaResult {
        status: SearchIndexState::Disabled,
        indexable_assets: 0,
        indexed_assets: None,
        moments: vec![],
        spoken: vec![],
    };
    Ok(no_backend)
}
}

/// Turn one [`InspectedFrame`] into an MCP image [`Block`], base64-encoding the
Expand Down
10 changes: 10 additions & 0 deletions crates/opentake-media/src/search/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,16 @@ pub const EMBEDDING_DIM: usize = 768;
pub const IMAGE_SIZE: u32 = 256;
pub const CONTEXT_LENGTH: usize = 64;

/// Base URL the SigLIP2 ONNX model files are fetched from.
///
/// Individual files are addressed as `{base}/{file}` (e.g.
/// `{base}/image_encoder.onnx`), mirroring `WhisperModel.base_url`. This is a
/// placeholder until the ONNX build is hosted (SPEC T8.0): the download command
/// SHA-256-verifies each file against [`manifest`]'s (currently placeholder)
/// hashes, so a real download only succeeds once both this URL and the manifest
/// hashes/bytes are filled in. The Hugging Face `resolve/main` raw-file endpoint
/// is the intended host (same shape as the whisper model URL).
pub const MODEL_DOWNLOAD_BASE_URL: &str =
"https://huggingface.co/opentake/siglip2-base-patch16-256-onnx/resolve/main";

/// The [`EmbedderSpec`] for the configured SigLIP2 model. `normalized` defaults
/// to `false` to match upstream's assumption that the exported model L2-
/// normalizes internally (SPEC §0.8); flip it only if calibration proves the
Expand Down
17 changes: 12 additions & 5 deletions src-tauri/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,18 @@ opentake-core = { workspace = true }
opentake-project = { workspace = true }
opentake-ops = { workspace = true }
opentake-domain = { workspace = true }
# Transcription is ON for the shipped app: `whisper-backend` compiles the
# whisper.cpp CPU backend (via cmake — preinstalled on GitHub runners, no CUDA),
# `model-download` pulls the ggml model over HTTPS with SHA-1 verification. Both
# stay optional at the opentake-media level (its own tests run without them).
opentake-media = { workspace = true, features = ["whisper-backend", "model-download"] }
# Transcription + visual semantic search are ON for the shipped app:
# `whisper-backend` compiles the whisper.cpp CPU backend (via cmake —
# preinstalled on GitHub runners, no CUDA); `ort-backend` links ONNX Runtime for
# the SigLIP2 dual-encoder — its `download-binaries` feature fetches a
# *statically* linked prebuilt onnxruntime at build time and bakes it into the
# binary (no runtime .so/.dylib to ship; same build-time-network shape as
# whisper.cpp / model-download); `model-download` pulls the model files over
# HTTPS with checksum verification. All three stay optional at the
# opentake-media level (its own default tests run offline, without them);
# ort-inference tests are feature-gated + runtime-skipped like the GPU/ffmpeg
# integration tests.
opentake-media = { workspace = true, features = ["whisper-backend", "ort-backend", "model-download"] }
opentake-render = { workspace = true }
opentake-gen = { workspace = true }
opentake-agent = { workspace = true }
Expand Down
6 changes: 6 additions & 0 deletions src-tauri/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ mod library;
mod mcp;
mod media;
mod render;
mod search;
mod secret;
mod transcribe;

Expand Down Expand Up @@ -192,6 +193,11 @@ pub fn run() {
transcribe::transcribe_media,
transcribe::transcript_get,
captions::generate_captions,
search::search_model_status,
search::download_search_model,
search::search_index_status,
search::search_index_start,
search::search_query,
library::library_list,
library::library_favorite,
library::library_unfavorite,
Expand Down
92 changes: 92 additions & 0 deletions src-tauri/src/mcp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use base64::Engine as _;
use opentake_agent::mcp::core_handle::{AppCoreHandle, CoreHandle};
use opentake_agent::mcp::media_bridge::{
BridgeError, ImportOutcome, ImportSource, InspectResult, InspectedFrame, MediaBridge,
SearchCandidate, SearchIndexState, SearchMediaResult, SearchSpokenHit, SearchVisualHit,
TranscriptSource, TranscriptSourceResult,
};
use opentake_agent::mcp::server;
Expand Down Expand Up @@ -247,6 +248,97 @@ impl MediaBridge for TauriMediaBridge {
)),
}
}

/// Serve `search_media` from the live project: map each candidate id to its
/// expected source path, then query the visual and spoken groups separately
/// (as selected by `scope`) and return them side by side.
fn search_media(
    &self,
    candidates: &[SearchCandidate],
    query: &str,
    scope: &str,
    limit: usize,
) -> Result<SearchMediaResult, BridgeError> {
    // Any scope other than the two single-group values searches both groups.
    let wants_visual = scope != "spoken";
    let wants_spoken = scope != "visual";

    // Map candidate ids to source paths through the live manifest. Ids the
    // resolver cannot map are skipped; files that are merely missing on disk
    // (offline) stay in, because their index/transcript reads just come back
    // empty — upstream treats a missing file as "no results", not an error.
    let manifest = self.core.media();
    let project_dir = self.core.project_dir();
    let resolver = opentake_domain::MediaResolver::new(&manifest, project_dir.as_deref());
    let mut visual_paths: Vec<(String, PathBuf)> = Vec::new();
    let mut spoken_paths: Vec<(String, PathBuf)> = Vec::new();
    for candidate in candidates {
        if let Some(path) = resolver.expected_path(&candidate.media_ref) {
            if candidate.is_visual {
                visual_paths.push((candidate.media_ref.clone(), path.clone()));
            }
            if candidate.is_spoken {
                spoken_paths.push((candidate.media_ref.clone(), path));
            }
        }
    }

    let fps = self.core.get_timeline().timeline.fps;
    let installed = crate::search::model_installed(&self.engine);

    // Visual group; a spoken-only search reports `Disabled` with zero counts.
    let (status, indexable_assets, indexed_assets, moments) = if wants_visual {
        let (indexable, indexed) = crate::search::visual_coverage(&self.engine, &visual_paths);

        // Mirrors upstream `visualStatus`: no model → `modelNotInstalled`;
        // model present but some indexable asset still lacks an index →
        // `indexing`; otherwise `ready`. The transient download / preparing /
        // failed states belong to the front-end panel, so the tool only ever
        // reports this stable view.
        let still_indexing = indexable > 0 && indexed < indexable;
        let status = if installed && still_indexing {
            SearchIndexState::Indexing
        } else if installed {
            SearchIndexState::Ready
        } else {
            SearchIndexState::ModelNotInstalled
        };

        let hits =
            crate::search::visual_hits_by_id(&self.engine, &visual_paths, query, fps, limit);
        let moments: Vec<SearchVisualHit> = hits
            .into_iter()
            .map(|hit| SearchVisualHit {
                media_ref: hit.media_id,
                start_seconds: hit.start_sec,
                end_seconds: hit.end_sec,
                score: hit.score,
                is_image: hit.is_image,
            })
            .collect();

        // The indexed count is only reported while the model is installed
        // (upstream sets `indexedAssets` only when an embedder spec exists).
        (status, indexable, installed.then_some(indexed), moments)
    } else {
        (SearchIndexState::Disabled, 0, None, Vec::new())
    };

    // Spoken group: keyword search over cached transcripts, independent of the
    // visual index state. Skipped entirely for a visual-only search.
    let spoken: Vec<SearchSpokenHit> = if wants_spoken {
        let segments = self.engine.search_spoken(query, &spoken_paths, limit);
        segments
            .into_iter()
            .map(|segment| SearchSpokenHit {
                media_ref: segment.asset_id,
                start_seconds: segment.start,
                end_seconds: segment.end,
                text: segment.text,
            })
            .collect()
    } else {
        Vec::new()
    };

    Ok(SearchMediaResult {
        status,
        indexable_assets,
        indexed_assets,
        moments,
        spoken,
    })
}
}

impl TauriMediaBridge {
Expand Down
Loading
Loading