From 9d88ad823cb458af3ceb2ae78ce5a1d6657b2412 Mon Sep 17 00:00:00 2001 From: xlgzsgf <51521689+hiqiancheng@users.noreply.github.com> Date: Wed, 3 Jun 2026 10:34:05 +0800 Subject: [PATCH 1/3] feat(agent-service): add native computer use tools Related to #111 --- .../src-tauri/src/commands/built_in_tools.rs | 29 + apps/desktop/src-tauri/src/commands/mod.rs | 3 + .../src/core/built_in_tools/computer.rs | 1710 +++++++++++++++++ .../src-tauri/src/core/built_in_tools/mod.rs | 8 +- .../src/core/built_in_tools/types.rs | 346 ++++ apps/desktop/src-tauri/src/lib.rs | 3 +- apps/desktop/src-tauri/src/testing/mod.rs | 2 + .../src-tauri/tests/computer_commands.rs | 243 +++ .../src/database/artifacts/runtime/seed.sql | 36 + .../AgentService/execution/executor.ts | 55 +- .../services/BuiltInToolService/registry.ts | 2 + .../tools/computer/constants.ts | 397 ++++ .../tools/computer/helper.ts | 247 +++ .../tools/computer/index.ts | 223 +++ .../src/services/BuiltInToolService/types.ts | 5 +- .../services/NativeService/builtInTools.ts | 20 +- .../src/services/NativeService/index.ts | 29 + .../src/services/NativeService/types.ts | 234 +++ .../execution/tool-round-scheduling.test.ts | 289 +++ .../tools/computer/index.test.ts | 367 ++++ .../tests/services/native-service.test.ts | 139 ++ docs/computer-use.md | 137 ++ 22 files changed, 4507 insertions(+), 17 deletions(-) create mode 100644 apps/desktop/src-tauri/src/core/built_in_tools/computer.rs create mode 100644 apps/desktop/src-tauri/tests/computer_commands.rs create mode 100644 apps/desktop/src/services/BuiltInToolService/tools/computer/constants.ts create mode 100644 apps/desktop/src/services/BuiltInToolService/tools/computer/helper.ts create mode 100644 apps/desktop/src/services/BuiltInToolService/tools/computer/index.ts create mode 100644 apps/desktop/tests/services/AgentService/execution/tool-round-scheduling.test.ts create mode 100644 apps/desktop/tests/services/BuiltInToolService/tools/computer/index.test.ts create mode 100644 
docs/computer-use.md diff --git a/apps/desktop/src-tauri/src/commands/built_in_tools.rs b/apps/desktop/src-tauri/src/commands/built_in_tools.rs index 1826370d..aeddf0c3 100644 --- a/apps/desktop/src-tauri/src/commands/built_in_tools.rs +++ b/apps/desktop/src-tauri/src/commands/built_in_tools.rs @@ -4,6 +4,8 @@ use crate::core::built_in_tools::{ self, BashExecutionRegistry, BuiltInBashExecutionRequest, BuiltInBashExecutionResponse, + ComputerActionRequest, ComputerActionResponse, ComputerObservationRequest, + ComputerObservationResponse, ComputerRuntime, ComputerSessionRequest, ComputerSessionResponse, }; use tauri::State; @@ -30,3 +32,30 @@ pub fn built_in_tools_cancel_bash( ) -> Result { Ok(registry.cancel(&execution_id)) } + +/// Start or refresh a native computer-use session. +#[tauri::command] +pub fn built_in_tools_computer_session( + request: ComputerSessionRequest, + runtime: State<'_, ComputerRuntime>, +) -> Result { + built_in_tools::computer_session(request, runtime.inner()) +} + +/// Observe the native desktop surface for a computer-use session. +#[tauri::command] +pub fn built_in_tools_computer_observe( + request: ComputerObservationRequest, + runtime: State<'_, ComputerRuntime>, +) -> Result { + built_in_tools::computer_observe(request, runtime.inner()) +} + +/// Execute one bounded computer-use action and return an auditable receipt. 
+#[tauri::command] +pub fn built_in_tools_computer_act( + request: ComputerActionRequest, + runtime: State<'_, ComputerRuntime>, +) -> Result { + built_in_tools::computer_act(request, runtime.inner()) +} diff --git a/apps/desktop/src-tauri/src/commands/mod.rs b/apps/desktop/src-tauri/src/commands/mod.rs index 987d6465..a138dec5 100644 --- a/apps/desktop/src-tauri/src/commands/mod.rs +++ b/apps/desktop/src-tauri/src/commands/mod.rs @@ -59,6 +59,9 @@ pub fn invoke_handler( paths::get_runtime_info, built_in_tools::built_in_tools_execute_bash, built_in_tools::built_in_tools_cancel_bash, + built_in_tools::built_in_tools_computer_session, + built_in_tools::built_in_tools_computer_observe, + built_in_tools::built_in_tools_computer_act, mcp::mcp_connect_server, mcp::mcp_disconnect_server, mcp::mcp_list_tools, diff --git a/apps/desktop/src-tauri/src/core/built_in_tools/computer.rs b/apps/desktop/src-tauri/src/core/built_in_tools/computer.rs new file mode 100644 index 00000000..3cc6d4ae --- /dev/null +++ b/apps/desktop/src-tauri/src/core/built_in_tools/computer.rs @@ -0,0 +1,1710 @@ +// Copyright (c) 2026. Qian Cheng. Licensed under GPL v3 + +//! Native computer-use runtime. +//! +//! This module intentionally owns the policy and receipt boundary for desktop +//! actions. External CUA or vision providers may later add target candidates, +//! but they should not bypass this runtime's validation and audit shape. 
+ +use std::{ + collections::{HashMap, HashSet}, + sync::Mutex, + time::{Duration, Instant}, +}; + +use super::types::{ + ComputerActionOperation, ComputerActionRequest, ComputerActionResponse, ComputerActionStatus, + ComputerBackgroundCapability, ComputerBounds, ComputerCapabilitySnapshot, + ComputerDisplaySnapshot, ComputerElementSnapshot, ComputerExecutionMode, + ComputerGroundingCapability, ComputerLane, ComputerObservationInclude, ComputerObservationMode, + ComputerObservationRequest, ComputerObservationResponse, ComputerObservationTree, + ComputerResolvedTarget, ComputerRoute, ComputerScreenshotSnapshot, ComputerSessionRequest, + ComputerSessionResponse, ComputerSessionStatus, ComputerTarget, ComputerWindowSnapshot, +}; + +const PROVIDER_CUA: &str = "cua"; +const PROVIDER_OMNIPARSER: &str = "omniparser"; +const PROVIDER_UI_TARS: &str = "ui_tars"; + +#[derive(Debug, Clone)] +struct ComputerSessionState { + target: ComputerTarget, + capabilities: ComputerCapabilitySnapshot, + observed_native_ids: HashSet, + created_at: Instant, +} + +#[derive(Default)] +struct ComputerUseRuntimeState { + sessions: HashMap, + next_observation_id: u64, + next_action_id: u64, +} + +/// Runtime state for native computer-use calls. 
+pub struct ComputerUseRuntime { + state: Mutex, + action_lock: Mutex<()>, +} + +impl ComputerUseRuntime { + pub fn new() -> Self { + Self { + state: Mutex::new(ComputerUseRuntimeState::default()), + action_lock: Mutex::new(()), + } + } + + pub fn start_session( + &self, + request: ComputerSessionRequest, + ) -> Result { + validate_non_empty("sessionId", &request.session_id)?; + validate_non_empty("reason", &request.reason)?; + + let capabilities = platform_capabilities(); + let status = if capabilities + .routes + .iter() + .any(|route| route == &ComputerRoute::Unsupported) + { + ComputerSessionStatus::Unsupported + } else { + ComputerSessionStatus::Ready + }; + let mut warnings = Vec::new(); + if status == ComputerSessionStatus::Unsupported { + warnings.push("Computer use is not implemented on this platform yet.".to_string()); + } + if request.provider_hints.iter().any(|hint| { + matches!( + hint.as_str(), + PROVIDER_CUA | PROVIDER_OMNIPARSER | PROVIDER_UI_TARS + ) + }) { + warnings.push( + "External grounding providers are adapter hooks only and are not bundled." 
+ .to_string(), + ); + } + + let mut state = self + .state + .lock() + .map_err(|_| "ComputerUseRuntime state lock poisoned".to_string())?; + state.sessions.insert( + request.session_id.clone(), + ComputerSessionState { + observed_native_ids: HashSet::new(), + target: request.target.clone(), + capabilities: capabilities.clone(), + created_at: Instant::now(), + }, + ); + + Ok(ComputerSessionResponse { + session_id: request.session_id, + status, + capabilities, + target: request.target, + warnings, + }) + } + + pub fn observe( + &self, + request: ComputerObservationRequest, + ) -> Result { + validate_non_empty("sessionId", &request.session_id)?; + validate_non_empty("reason", &request.reason)?; + + let (session_target, session_age_warning, observation_id) = { + let mut state = self + .state + .lock() + .map_err(|_| "ComputerUseRuntime state lock poisoned".to_string())?; + let (target, warning) = { + let session = state.sessions.get(&request.session_id).ok_or_else(|| { + format!("computer session '{}' was not found", request.session_id) + })?; + let warning = if session.created_at.elapsed() > Duration::from_secs(10 * 60) { + Some( + "Computer session is older than 10 minutes; refresh if grounding looks stale.", + ) + } else { + None + }; + + (session.target.clone(), warning.map(str::to_string)) + }; + state.next_observation_id += 1; + ( + target, + warning, + format!("obs-{}", state.next_observation_id), + ) + }; + + let target = merge_target(session_target, request.target); + let displays = observe_displays(); + let windows = observe_windows(); + let include_tree = request.include.contains(&ComputerObservationInclude::Tree) + || matches!( + &request.mode, + ComputerObservationMode::Tree | ComputerObservationMode::TreeAndScreenshot + ); + let include_screenshot = request + .include + .contains(&ComputerObservationInclude::Screenshot) + || matches!( + &request.mode, + ComputerObservationMode::Screenshot | ComputerObservationMode::TreeAndScreenshot + ); + let tree = 
include_tree.then(|| build_tree(&windows)); + let screenshot = include_screenshot.then(|| build_screenshot_placeholder(&displays)); + let observed_native_ids = collect_observation_native_ids(&windows, tree.as_ref()); + let mut warnings = Vec::new(); + if let Some(warning) = session_age_warning { + warnings.push(warning); + } + if include_screenshot { + warnings.push( + "Screenshot payload is represented as metadata; image materialization is adapter-scoped." + .to_string(), + ); + } + + { + let mut state = self + .state + .lock() + .map_err(|_| "ComputerUseRuntime state lock poisoned".to_string())?; + if let Some(session) = state.sessions.get_mut(&request.session_id) { + session.observed_native_ids.extend(observed_native_ids); + } + } + + Ok(ComputerObservationResponse { + observation_id, + session_id: request.session_id, + platform: platform_name().to_string(), + target, + displays, + windows, + tree, + screenshot, + warnings, + }) + } + + pub fn act(&self, request: ComputerActionRequest) -> Result { + validate_non_empty("sessionId", &request.session_id)?; + validate_non_empty("reason", &request.reason)?; + validate_operation(&request.operation, request.value.as_deref())?; + validate_target_shape(&request.target)?; + + let _guard = self + .action_lock + .lock() + .map_err(|_| "ComputerUseRuntime action lock poisoned".to_string())?; + + let (capabilities, observed_native_ids) = { + let state = self + .state + .lock() + .map_err(|_| "ComputerUseRuntime state lock poisoned".to_string())?; + state + .sessions + .get(&request.session_id) + .map(|session| { + ( + session.capabilities.clone(), + session.observed_native_ids.clone(), + ) + }) + .ok_or_else(|| format!("computer session '{}' was not found", request.session_id))? 
+ }; + + let route = resolve_route(&request)?; + let mut normalized_target = NormalizedTarget::from(&request.target); + let lane = resolve_lane(&normalized_target); + let background_safe = request.execution_mode == ComputerExecutionMode::Background + && route == ComputerRoute::Win32Message; + + if route == ComputerRoute::Win32Message + && request.execution_mode != ComputerExecutionMode::Background + { + return Ok(self.action_response( + request, + route, + lane, + background_safe, + false, + false, + normalized_target.resolved(), + ComputerActionStatus::Blocked, + vec!["route 'win32.message' requires background execution".to_string()], + )?); + } + if route == ComputerRoute::Win32Message && !request.options.allow_background { + return Ok(self.action_response( + request, + route, + lane, + background_safe, + false, + false, + normalized_target.resolved(), + ComputerActionStatus::Blocked, + vec!["route 'win32.message' requires allowBackground=true".to_string()], + )?); + } + if route == ComputerRoute::Win32SendInput + && request.execution_mode == ComputerExecutionMode::Background + { + return Ok(self.action_response( + request, + route, + lane, + background_safe, + false, + false, + normalized_target.resolved(), + ComputerActionStatus::Blocked, + vec!["route 'win32.send_input' cannot execute in background mode".to_string()], + )?); + } + if route == ComputerRoute::ScreenCapture { + return Ok(self.action_response( + request, + route, + lane, + background_safe, + false, + false, + normalized_target.resolved(), + ComputerActionStatus::Blocked, + vec!["route 'screen.capture' cannot execute computer actions".to_string()], + )?); + } + if route == ComputerRoute::Unsupported { + return Ok(self.action_response( + request, + route, + lane, + background_safe, + false, + false, + normalized_target.resolved(), + ComputerActionStatus::Unsupported, + vec!["computer action route is unsupported on this platform".to_string()], + )?); + } + if !capabilities + .routes + .iter() + 
.any(|candidate| candidate == &route) + { + return Ok(self.action_response( + request, + route.clone(), + lane, + background_safe, + false, + false, + normalized_target.resolved(), + ComputerActionStatus::Unsupported, + vec![format!( + "route '{}' is not available for this session", + route_label(&route) + )], + )?); + } + + if request.execution_mode == ComputerExecutionMode::Background + && normalized_target.has_coordinates() + { + return Ok(self.action_response( + request, + route, + lane, + background_safe, + false, + false, + normalized_target.resolved(), + ComputerActionStatus::Blocked, + vec!["coordinate targets cannot be executed in background mode".to_string()], + )?); + } + if request.execution_mode == ComputerExecutionMode::Background + && !normalized_target.has_native_reference() + { + return Ok(self.action_response( + request, + route, + lane, + background_safe, + false, + false, + normalized_target.resolved(), + ComputerActionStatus::Blocked, + vec![ + "background execution requires a native windowId or elementId target" + .to_string(), + ], + )?); + } + if normalized_target.has_native_reference() + && !normalized_target + .native_references() + .iter() + .any(|native_id| observed_native_ids.contains(*native_id)) + { + return Ok(self.action_response( + request, + route, + lane, + background_safe, + false, + false, + normalized_target.resolved(), + ComputerActionStatus::Blocked, + vec!["native target was not observed in this computer session".to_string()], + )?); + } + if normalized_target.has_native_reference() { + match resolve_native_target_coordinates(&normalized_target) { + Ok(resolved_target) => normalized_target = resolved_target, + Err(error) => { + return Ok(self.action_response( + request, + route, + lane, + background_safe, + false, + false, + normalized_target.resolved(), + ComputerActionStatus::Blocked, + vec![error], + )?); + } + } + } + + if lane == ComputerLane::Unsupported && request.operation != ComputerActionOperation::Wait { + return 
Ok(self.action_response( + request, + route, + lane, + background_safe, + false, + false, + normalized_target.resolved(), + ComputerActionStatus::Blocked, + vec!["computer action target could not be resolved".to_string()], + )?); + } + + let dry_run = request.options.dry_run; + let mut warnings = Vec::new(); + let mut cursor_moved = false; + let mut foreground_changed = false; + let mut status = ComputerActionStatus::Success; + + if dry_run { + warnings.push("dryRun=true: action was validated but not executed.".to_string()); + } else if request.operation == ComputerActionOperation::Wait { + std::thread::sleep(Duration::from_millis(request.timeout_ms.min(2_000))); + } else if let Err(error) = execute_native_action(&request, &normalized_target, &route) { + status = ComputerActionStatus::Error; + warnings.push(format!( + "Failed to execute computer action '{}': {error}", + operation_label(&request.operation) + )); + } else { + cursor_moved = + route == ComputerRoute::Win32SendInput && is_pointer_operation(&request.operation); + foreground_changed = request.execution_mode == ComputerExecutionMode::Foreground + && route == ComputerRoute::Win32SendInput; + if route == ComputerRoute::Win32Message { + warnings.push( + "win32.message queued input messages; target handling is not confirmed." 
+ .to_string(), + ); + } + } + + self.action_response( + request, + route, + lane, + background_safe, + cursor_moved, + foreground_changed, + normalized_target.resolved(), + status, + warnings, + ) + } + + fn next_action_id(&self) -> Result { + let mut state = self + .state + .lock() + .map_err(|_| "ComputerUseRuntime state lock poisoned".to_string())?; + state.next_action_id += 1; + Ok(format!("act-{}", state.next_action_id)) + } + + fn action_response( + &self, + request: ComputerActionRequest, + route: ComputerRoute, + lane: ComputerLane, + background_safe: bool, + cursor_moved: bool, + foreground_changed: bool, + target_resolved: ComputerResolvedTarget, + status: ComputerActionStatus, + warnings: Vec, + ) -> Result { + let post_action_observation = if status == ComputerActionStatus::Success + && request.options.post_action_observe + && !request.options.dry_run + { + Some(self.build_post_action_observation(&request.session_id)?) + } else { + None + }; + + Ok(ComputerActionResponse { + action_id: self.next_action_id()?, + session_id: request.session_id, + operation: request.operation, + route, + lane, + background_safe, + cursor_moved, + foreground_changed, + target_resolved, + status, + warnings, + post_action_observation, + }) + } + + fn build_post_action_observation( + &self, + session_id: &str, + ) -> Result { + let (session_target, session_age_warning, observation_id) = { + let mut state = self + .state + .lock() + .map_err(|_| "ComputerUseRuntime state lock poisoned".to_string())?; + let (target, warning) = { + let session = state + .sessions + .get(session_id) + .ok_or_else(|| format!("computer session '{session_id}' was not found"))?; + let warning = if session.created_at.elapsed() > Duration::from_secs(10 * 60) { + Some( + "Computer session is older than 10 minutes; refresh if grounding looks stale.", + ) + } else { + None + }; + + (session.target.clone(), warning.map(str::to_string)) + }; + state.next_observation_id += 1; + ( + target, + warning, + 
format!("obs-{}", state.next_observation_id), + ) + }; + + let displays = observe_displays(); + let windows = observe_windows(); + let tree = Some(build_tree(&windows)); + let observed_native_ids = collect_observation_native_ids(&windows, tree.as_ref()); + let mut warnings = Vec::new(); + if let Some(warning) = session_age_warning { + warnings.push(warning); + } + + { + let mut state = self + .state + .lock() + .map_err(|_| "ComputerUseRuntime state lock poisoned".to_string())?; + if let Some(session) = state.sessions.get_mut(session_id) { + session.observed_native_ids.extend(observed_native_ids); + } + } + + Ok(ComputerObservationResponse { + observation_id, + session_id: session_id.to_string(), + platform: platform_name().to_string(), + target: session_target, + displays, + windows, + tree, + screenshot: None, + warnings, + }) + } +} + +impl Default for ComputerUseRuntime { + fn default() -> Self { + Self::new() + } +} + +pub type ComputerRuntime = ComputerUseRuntime; + +pub fn computer_session( + request: ComputerSessionRequest, + runtime: &ComputerRuntime, +) -> Result { + runtime.start_session(request) +} + +pub fn computer_observe( + request: ComputerObservationRequest, + runtime: &ComputerRuntime, +) -> Result { + runtime.observe(request) +} + +pub fn computer_act( + request: ComputerActionRequest, + runtime: &ComputerRuntime, +) -> Result { + runtime.act(request) +} + +fn validate_non_empty(label: &str, value: &str) -> Result<(), String> { + if value.trim().is_empty() { + return Err(format!("{label} cannot be empty")); + } + Ok(()) +} + +fn validate_operation( + operation: &ComputerActionOperation, + value: Option<&str>, +) -> Result<(), String> { + match operation { + ComputerActionOperation::Click + | ComputerActionOperation::DoubleClick + | ComputerActionOperation::RightClick + | ComputerActionOperation::Move + | ComputerActionOperation::Drag + | ComputerActionOperation::Scroll + | ComputerActionOperation::Wait => Ok(()), + 
ComputerActionOperation::TypeText if value.is_some_and(|text| !text.is_empty()) => Ok(()), + ComputerActionOperation::TypeText => Err("value is required for type_text".to_string()), + ComputerActionOperation::PressKey | ComputerActionOperation::Hotkey + if value.is_some_and(|text| !text.is_empty()) => + { + Ok(()) + } + ComputerActionOperation::PressKey | ComputerActionOperation::Hotkey => Err(format!( + "value is required for {}", + operation_label(operation) + )), + } +} + +fn validate_target_shape(target: &ComputerTarget) -> Result<(), String> { + if target.x.is_some() != target.y.is_some() { + return Err("target.x and target.y must be provided together".to_string()); + } + if target.width.is_some() != target.height.is_some() { + return Err("target.width and target.height must be provided together".to_string()); + } + if let Some(coordinates) = target.coordinates.as_ref() { + if coordinates.width.is_some() != coordinates.height.is_some() { + return Err( + "target.coordinates.width and target.coordinates.height must be provided together" + .to_string(), + ); + } + } + Ok(()) +} + +fn resolve_route(request: &ComputerActionRequest) -> Result { + if request.route_hint != ComputerRoute::Auto { + return Ok(request.route_hint.clone()); + } + if request.execution_mode == ComputerExecutionMode::Background { + return Ok(ComputerRoute::Win32Message); + } + Ok(ComputerRoute::Win32SendInput) +} + +fn route_label(route: &ComputerRoute) -> &'static str { + match route { + ComputerRoute::Auto => "auto", + ComputerRoute::Win32SendInput => "win32.send_input", + ComputerRoute::Win32Message => "win32.message", + ComputerRoute::ScreenCapture => "screen.capture", + ComputerRoute::Unsupported => "unsupported", + } +} + +fn operation_label(operation: &ComputerActionOperation) -> &'static str { + match operation { + ComputerActionOperation::Click => "click", + ComputerActionOperation::DoubleClick => "double_click", + ComputerActionOperation::RightClick => "right_click", + 
ComputerActionOperation::Move => "move", + ComputerActionOperation::Drag => "drag", + ComputerActionOperation::Scroll => "scroll", + ComputerActionOperation::TypeText => "type_text", + ComputerActionOperation::PressKey => "press_key", + ComputerActionOperation::Hotkey => "hotkey", + ComputerActionOperation::Wait => "wait", + } +} + +fn is_pointer_operation(operation: &ComputerActionOperation) -> bool { + matches!( + operation, + ComputerActionOperation::Click + | ComputerActionOperation::DoubleClick + | ComputerActionOperation::RightClick + | ComputerActionOperation::Move + | ComputerActionOperation::Drag + ) +} + +fn resolve_lane(target: &NormalizedTarget) -> ComputerLane { + if target.has_native_reference() { + ComputerLane::NativeTree + } else if target.has_coordinates() { + ComputerLane::VisionFallback + } else { + ComputerLane::Unsupported + } +} + +fn merge_target(session_target: ComputerTarget, request_target: ComputerTarget) -> ComputerTarget { + if request_target.scope.is_none() + && request_target.label.is_none() + && request_target.window_id.is_none() + && request_target.element_id.is_none() + && request_target.x.is_none() + && request_target.coordinates.is_none() + && request_target.window.is_none() + && request_target.element.is_none() + { + return session_target; + } + request_target +} + +#[derive(Debug, Clone)] +struct NormalizedTarget { + x: Option, + y: Option, + element_id: Option, + window_id: Option, +} + +impl NormalizedTarget { + fn from(target: &ComputerTarget) -> Self { + Self { + x: target + .x + .or_else(|| target.coordinates.as_ref().map(|coords| coords.x)), + y: target + .y + .or_else(|| target.coordinates.as_ref().map(|coords| coords.y)), + element_id: target.element_id.clone().or_else(|| { + target + .element + .as_ref() + .and_then(|element| element.id.clone()) + }), + window_id: target + .window_id + .clone() + .or_else(|| target.window.as_ref().and_then(|window| window.id.clone())), + } + } + + fn has_coordinates(&self) -> bool { + 
self.x.is_some() && self.y.is_some() + } + + fn has_native_reference(&self) -> bool { + self.element_id.is_some() || self.window_id.is_some() + } + + fn native_references(&self) -> Vec<&str> { + [self.element_id.as_deref(), self.window_id.as_deref()] + .into_iter() + .flatten() + .collect() + } + + fn window_message_target_id(&self) -> Option<&str> { + self.window_id.as_deref().or(self.element_id.as_deref()) + } + + fn resolved(&self) -> ComputerResolvedTarget { + ComputerResolvedTarget { + x: self.x, + y: self.y, + element_id: self.element_id.clone(), + window_id: self.window_id.clone(), + confidence: if self.has_native_reference() || self.has_coordinates() { + 1.0 + } else { + 0.0 + }, + } + } +} + +fn collect_observation_native_ids( + windows: &[ComputerWindowSnapshot], + tree: Option<&ComputerObservationTree>, +) -> HashSet { + let mut native_ids: HashSet = windows + .iter() + .map(|window| window.element_id.clone()) + .collect(); + + if let Some(tree) = tree { + for element in &tree.elements { + collect_element_native_ids(element, &mut native_ids); + } + } + + native_ids +} + +fn collect_element_native_ids(element: &ComputerElementSnapshot, native_ids: &mut HashSet) { + native_ids.insert(element.element_id.clone()); + for child in &element.children { + collect_element_native_ids(child, native_ids); + } +} + +fn resolve_native_target_coordinates( + target: &NormalizedTarget, +) -> Result { + let Some(native_id) = target.window_message_target_id() else { + return Ok(target.clone()); + }; + + let (x, y) = resolve_native_window_center(native_id)?; + let mut resolved = target.clone(); + if resolved.x.is_none() { + resolved.x = Some(x); + } + if resolved.y.is_none() { + resolved.y = Some(y); + } + Ok(resolved) +} + +#[cfg(target_os = "windows")] +fn resolve_native_window_center(native_id: &str) -> Result<(i32, i32), String> { + use windows::Win32::{ + Foundation::RECT, + UI::WindowsAndMessaging::{GetWindowRect, IsWindow}, + }; + + let hwnd = 
parse_window_id(native_id) + .ok_or_else(|| "native window target is no longer valid".to_string())?; + unsafe { + if !IsWindow(hwnd).as_bool() { + return Err("native window target is no longer valid".to_string()); + } + + let mut rect = RECT::default(); + GetWindowRect(hwnd, &mut rect) + .map_err(|_| "native window target is no longer valid".to_string())?; + if rect.right <= rect.left || rect.bottom <= rect.top { + return Err("native window target is no longer valid".to_string()); + } + + Ok(((rect.left + rect.right) / 2, (rect.top + rect.bottom) / 2)) + } +} + +#[cfg(not(target_os = "windows"))] +fn resolve_native_window_center(_native_id: &str) -> Result<(i32, i32), String> { + Err("native window target is no longer valid".to_string()) +} + +fn platform_capabilities() -> ComputerCapabilitySnapshot { + #[cfg(target_os = "windows")] + { + ComputerCapabilitySnapshot { + platform: "windows".to_string(), + lanes: vec![ComputerLane::NativeTree, ComputerLane::VisionFallback], + routes: vec![ + ComputerRoute::Win32SendInput, + ComputerRoute::Win32Message, + ComputerRoute::ScreenCapture, + ], + background: ComputerBackgroundCapability { + supported: true, + routes: vec![ComputerRoute::Win32Message], + limitations: vec![ + "Background actions require native window or element targets.".to_string(), + "Coordinate-only targets are foreground-only.".to_string(), + ], + }, + grounding: ComputerGroundingCapability { + tree: true, + screenshot: true, + click_prediction: false, + external_providers: vec![ + PROVIDER_CUA.to_string(), + PROVIDER_OMNIPARSER.to_string(), + PROVIDER_UI_TARS.to_string(), + ], + }, + } + } + + #[cfg(not(target_os = "windows"))] + { + ComputerCapabilitySnapshot { + platform: platform_name().to_string(), + lanes: vec![ComputerLane::Unsupported], + routes: vec![ComputerRoute::Unsupported], + background: ComputerBackgroundCapability { + supported: false, + routes: Vec::new(), + limitations: vec![ + "Native computer use is implemented for Windows 
first.".to_string() + ], + }, + grounding: ComputerGroundingCapability { + tree: false, + screenshot: false, + click_prediction: false, + external_providers: vec![ + PROVIDER_CUA.to_string(), + PROVIDER_OMNIPARSER.to_string(), + PROVIDER_UI_TARS.to_string(), + ], + }, + } + } +} + +fn platform_name() -> &'static str { + #[cfg(target_os = "windows")] + { + "windows" + } + #[cfg(target_os = "macos")] + { + "macos" + } + #[cfg(target_os = "linux")] + { + "linux" + } + #[cfg(not(any(target_os = "windows", target_os = "macos", target_os = "linux")))] + { + "unknown" + } +} + +fn observe_displays() -> Vec { + #[cfg(target_os = "windows")] + { + use windows::Win32::UI::WindowsAndMessaging::{ + GetSystemMetrics, SM_CXVIRTUALSCREEN, SM_CYVIRTUALSCREEN, SM_XVIRTUALSCREEN, + SM_YVIRTUALSCREEN, + }; + + unsafe { + return vec![ComputerDisplaySnapshot { + id: "display-0".to_string(), + x: GetSystemMetrics(SM_XVIRTUALSCREEN), + y: GetSystemMetrics(SM_YVIRTUALSCREEN), + width: GetSystemMetrics(SM_CXVIRTUALSCREEN), + height: GetSystemMetrics(SM_CYVIRTUALSCREEN), + scale_factor: 1.0, + primary: true, + }]; + } + } + + #[cfg(not(target_os = "windows"))] + { + Vec::new() + } +} + +fn observe_windows() -> Vec { + #[cfg(target_os = "windows")] + { + windows_foreground_window() + .map(|window| vec![window]) + .unwrap_or_default() + } + + #[cfg(not(target_os = "windows"))] + { + Vec::new() + } +} + +fn build_tree(windows: &[ComputerWindowSnapshot]) -> ComputerObservationTree { + ComputerObservationTree { + lane: if cfg!(target_os = "windows") { + ComputerLane::NativeTree + } else { + ComputerLane::Unsupported + }, + elements: windows + .iter() + .map(|window| ComputerElementSnapshot { + element_id: window.element_id.clone(), + role: "window".to_string(), + name: window.title.clone(), + bounds: Some(window.bounds.clone()), + states: if window.focused { + vec!["focused".to_string()] + } else { + Vec::new() + }, + value: None, + children: window_child_elements(&window.element_id), + }) + 
.collect(), + } +} + +fn build_screenshot_placeholder( + displays: &[ComputerDisplaySnapshot], +) -> ComputerScreenshotSnapshot { + let (width, height) = displays + .first() + .map(|display| (display.width.max(0), display.height.max(0))) + .unwrap_or((0, 0)); + ComputerScreenshotSnapshot { + format: "png".to_string(), + width, + height, + data_base64: None, + path: None, + } +} + +#[cfg(target_os = "windows")] +fn window_child_elements(window_id: &str) -> Vec { + use std::ffi::c_void; + + use windows::Win32::{ + Foundation::{BOOL, HWND, LPARAM}, + UI::WindowsAndMessaging::EnumChildWindows, + }; + + let Some(parent_hwnd) = parse_window_id(window_id) else { + return Vec::new(); + }; + + let mut elements = Vec::new(); + { + let mut callback = |hwnd| { + if elements.len() >= 64 { + return false; + } + if let Some(element) = window_element_snapshot(hwnd) { + elements.push(element); + } + true + }; + let mut trait_obj: &mut dyn FnMut(HWND) -> bool = &mut callback; + let closure_pointer_pointer: *mut c_void = unsafe { std::mem::transmute(&mut trait_obj) }; + let lparam = LPARAM(closure_pointer_pointer as _); + + unsafe extern "system" fn enumerate_callback(hwnd: HWND, lparam: LPARAM) -> BOOL { + let closure = &mut *(lparam.0 as *mut c_void as *mut &mut dyn FnMut(HWND) -> bool); + closure(hwnd).into() + } + + let _ = unsafe { EnumChildWindows(parent_hwnd, Some(enumerate_callback), lparam) }; + } + + elements +} + +#[cfg(not(target_os = "windows"))] +fn window_child_elements(_window_id: &str) -> Vec { + Vec::new() +} + +#[cfg(target_os = "windows")] +fn windows_foreground_window() -> Option { + use windows::Win32::{ + Foundation::RECT, + UI::WindowsAndMessaging::{GetForegroundWindow, GetWindowRect, IsWindowVisible}, + }; + + unsafe { + let hwnd = GetForegroundWindow(); + if hwnd.0.is_null() { + return None; + } + + let mut rect = RECT::default(); + let _ = GetWindowRect(hwnd, &mut rect); + let title = window_text(hwnd); + Some(ComputerWindowSnapshot { + element_id: 
hwnd_element_id(hwnd), + title, + process_name: None, + bounds: ComputerBounds { + x: rect.left, + y: rect.top, + width: (rect.right - rect.left).max(0), + height: (rect.bottom - rect.top).max(0), + }, + focused: true, + visible: IsWindowVisible(hwnd).as_bool(), + native: true, + }) + } +} + +#[cfg(target_os = "windows")] +fn window_element_snapshot( + hwnd: windows::Win32::Foundation::HWND, +) -> Option { + use windows::Win32::{ + Foundation::RECT, + UI::WindowsAndMessaging::{GetWindowRect, IsWindowVisible}, + }; + + unsafe { + if !IsWindowVisible(hwnd).as_bool() { + return None; + } + + let mut rect = RECT::default(); + let _ = GetWindowRect(hwnd, &mut rect); + let class_name = window_class_name(hwnd); + let title = window_text(hwnd); + let name = if title.is_empty() { + class_name.clone() + } else { + title + }; + + Some(ComputerElementSnapshot { + element_id: hwnd_element_id(hwnd), + role: role_from_class_name(&class_name).to_string(), + name, + bounds: Some(ComputerBounds { + x: rect.left, + y: rect.top, + width: (rect.right - rect.left).max(0), + height: (rect.bottom - rect.top).max(0), + }), + states: vec!["visible".to_string()], + value: None, + children: Vec::new(), + }) + } +} + +#[cfg(target_os = "windows")] +fn window_text(hwnd: windows::Win32::Foundation::HWND) -> String { + use windows::Win32::UI::WindowsAndMessaging::{GetWindowTextLengthW, GetWindowTextW}; + + unsafe { + let title_len = GetWindowTextLengthW(hwnd); + let mut title_buffer = vec![0u16; title_len.max(0) as usize + 1]; + let copied = if title_buffer.is_empty() { + 0 + } else { + GetWindowTextW(hwnd, &mut title_buffer) + }; + String::from_utf16_lossy(&title_buffer[..copied.max(0) as usize]) + } +} + +#[cfg(target_os = "windows")] +fn window_class_name(hwnd: windows::Win32::Foundation::HWND) -> String { + use windows::Win32::UI::WindowsAndMessaging::GetClassNameW; + + unsafe { + let mut class_buffer = vec![0u16; 256]; + let copied = GetClassNameW(hwnd, &mut class_buffer); + 
String::from_utf16_lossy(&class_buffer[..copied.max(0) as usize]) + } +} + +#[cfg(target_os = "windows")] +fn hwnd_element_id(hwnd: windows::Win32::Foundation::HWND) -> String { + format!("window:{:x}", hwnd.0 as usize) +} + +#[cfg(target_os = "windows")] +fn role_from_class_name(class_name: &str) -> &'static str { + let normalized = class_name.to_ascii_lowercase(); + if normalized.contains("button") { + "button" + } else if normalized.contains("edit") || normalized.contains("textbox") { + "textbox" + } else if normalized.contains("combo") { + "combobox" + } else if normalized.contains("list") { + "list" + } else if normalized.contains("menu") { + "menu" + } else { + "control" + } +} + +fn execute_native_action( + request: &ComputerActionRequest, + target: &NormalizedTarget, + route: &ComputerRoute, +) -> Result<(), String> { + #[cfg(not(target_os = "windows"))] + { + let _ = (request, target, route); + return Err("native computer action execution is only available on Windows".to_string()); + } + + #[cfg(target_os = "windows")] + { + match route { + ComputerRoute::Win32SendInput => execute_send_input_action(request, target), + ComputerRoute::Win32Message => execute_window_message_action(request, target), + _ => Err(format!( + "route '{}' cannot execute computer actions", + route_label(route) + )), + } + } +} + +#[cfg(target_os = "windows")] +fn execute_send_input_action( + request: &ComputerActionRequest, + target: &NormalizedTarget, +) -> Result<(), String> { + use windows::Win32::UI::Input::KeyboardAndMouse::{ + INPUT, INPUT_0, INPUT_MOUSE, MOUSEEVENTF_LEFTDOWN, MOUSEEVENTF_LEFTUP, MOUSEEVENTF_MOVE, + MOUSEEVENTF_RIGHTDOWN, MOUSEEVENTF_RIGHTUP, MOUSEINPUT, + }; + + if request.operation == ComputerActionOperation::Wait { + return Ok(()); + } + if !matches!( + &request.operation, + ComputerActionOperation::Click + | ComputerActionOperation::DoubleClick + | ComputerActionOperation::RightClick + | ComputerActionOperation::Move + ) { + return Err(format!( + "operation 
'{}' is validated but not yet implemented by SendInput", + operation_label(&request.operation) + )); + } + + let (x, y) = match (target.x, target.y) { + (Some(x), Some(y)) => (x, y), + _ => return Err("SendInput pointer actions require x/y coordinates".to_string()), + }; + unsafe { + windows::Win32::UI::WindowsAndMessaging::SetCursorPos(x, y) + .map_err(|error| format!("failed to position cursor: {error}"))?; + if request.operation == ComputerActionOperation::Move { + let input = INPUT { + r#type: INPUT_MOUSE, + Anonymous: INPUT_0 { + mi: MOUSEINPUT { + dx: 0, + dy: 0, + mouseData: 0, + dwFlags: MOUSEEVENTF_MOVE, + time: 0, + dwExtraInfo: 0, + }, + }, + }; + send_input_checked(&[input], "move")?; + return Ok(()); + } + + let (down, up) = if request.operation == ComputerActionOperation::RightClick { + (MOUSEEVENTF_RIGHTDOWN, MOUSEEVENTF_RIGHTUP) + } else { + (MOUSEEVENTF_LEFTDOWN, MOUSEEVENTF_LEFTUP) + }; + let pair = [mouse_input(down), mouse_input(up)]; + send_input_checked(&pair, "pointer click")?; + if request.operation == ComputerActionOperation::DoubleClick { + send_input_checked(&pair, "second pointer click")?; + } + } + Ok(()) +} + +#[cfg(target_os = "windows")] +fn send_input_checked( + inputs: &[windows::Win32::UI::Input::KeyboardAndMouse::INPUT], + label: &str, +) -> Result<(), String> { + use windows::Win32::UI::Input::KeyboardAndMouse::{SendInput, INPUT}; + + let sent = unsafe { SendInput(inputs, std::mem::size_of::() as i32) }; + if sent == inputs.len() as u32 { + return Ok(()); + } + + Err(format!( + "SendInput submitted {sent}/{} events for {label}", + inputs.len() + )) +} + +#[cfg(target_os = "windows")] +fn mouse_input( + flags: windows::Win32::UI::Input::KeyboardAndMouse::MOUSE_EVENT_FLAGS, +) -> windows::Win32::UI::Input::KeyboardAndMouse::INPUT { + use windows::Win32::UI::Input::KeyboardAndMouse::{INPUT, INPUT_0, INPUT_MOUSE, MOUSEINPUT}; + INPUT { + r#type: INPUT_MOUSE, + Anonymous: INPUT_0 { + mi: MOUSEINPUT { + dx: 0, + dy: 0, + mouseData: 0, 
+ dwFlags: flags, + time: 0, + dwExtraInfo: 0, + }, + }, + } +} + +#[cfg(target_os = "windows")] +fn execute_window_message_action( + request: &ComputerActionRequest, + target: &NormalizedTarget, +) -> Result<(), String> { + if request.operation == ComputerActionOperation::Wait { + return Ok(()); + } + if !matches!( + &request.operation, + ComputerActionOperation::Click + | ComputerActionOperation::DoubleClick + | ComputerActionOperation::RightClick + ) { + return Err(format!( + "operation '{}' is not background-safe through win32.message", + operation_label(&request.operation) + )); + } + let hwnd = target + .window_id + .as_deref() + .or(target.element_id.as_deref()) + .and_then(parse_window_id) + .ok_or_else(|| "win32.message requires a native window id".to_string())?; + + use windows::Win32::{ + Foundation::{POINT, WPARAM}, + Graphics::Gdi::ScreenToClient, + UI::WindowsAndMessaging::{ + PostMessageW, WM_LBUTTONDOWN, WM_LBUTTONUP, WM_RBUTTONDOWN, WM_RBUTTONUP, + }, + }; + unsafe { + let (down, up) = if request.operation == ComputerActionOperation::RightClick { + (WM_RBUTTONDOWN, WM_RBUTTONUP) + } else { + (WM_LBUTTONDOWN, WM_LBUTTONUP) + }; + let mut point = POINT { + x: target.x.unwrap_or_default(), + y: target.y.unwrap_or_default(), + }; + if !ScreenToClient(hwnd, &mut point).as_bool() { + return Err("failed to resolve target point in client coordinates".to_string()); + } + let lparam = mouse_lparam(point.x, point.y)?; + + PostMessageW(hwnd, down, WPARAM(0), lparam) + .map_err(|error| format!("failed to post mouse down: {error}"))?; + PostMessageW(hwnd, up, WPARAM(0), lparam) + .map_err(|error| format!("failed to post mouse up: {error}"))?; + if request.operation == ComputerActionOperation::DoubleClick { + PostMessageW(hwnd, down, WPARAM(0), lparam) + .map_err(|error| format!("failed to post second mouse down: {error}"))?; + PostMessageW(hwnd, up, WPARAM(0), lparam) + .map_err(|error| format!("failed to post second mouse up: {error}"))?; + } + } + Ok(()) +} + 
+#[cfg(target_os = "windows")] +fn mouse_lparam(x: i32, y: i32) -> Result { + if !(i16::MIN as i32..=i16::MAX as i32).contains(&x) + || !(i16::MIN as i32..=i16::MAX as i32).contains(&y) + { + return Err("client coordinates exceed win32 message range".to_string()); + } + + let packed = ((y as u16 as u32) << 16) | (x as u16 as u32); + Ok(windows::Win32::Foundation::LPARAM(packed as isize)) +} + +#[cfg(target_os = "windows")] +fn parse_window_id(value: &str) -> Option { + let raw = value.strip_prefix("window:").unwrap_or(value); + let parsed = usize::from_str_radix(raw, 16).ok()?; + Some(windows::Win32::Foundation::HWND(parsed as _)) +} + +#[cfg(test)] +mod tests { + use super::super::types::{ComputerActionOptions, ComputerCapability}; + use super::*; + + fn target() -> ComputerTarget { + ComputerTarget { + scope: Some("foreground".to_string()), + label: Some("Focused window".to_string()), + ..Default::default() + } + } + + fn session_request() -> ComputerSessionRequest { + ComputerSessionRequest { + session_id: "session-1".to_string(), + target: target(), + capabilities: vec![ComputerCapability::NativeTree], + provider_hints: Vec::new(), + reason: "test".to_string(), + timeout_ms: 8000, + } + } + + #[test] + fn start_session_rejects_blank_session_id() { + let runtime = ComputerUseRuntime::new(); + let mut request = session_request(); + request.session_id = " ".to_string(); + + assert_eq!( + runtime.start_session(request).unwrap_err(), + "sessionId cannot be empty" + ); + } + + #[test] + fn dry_run_coordinate_click_returns_send_input_receipt() { + let runtime = ComputerUseRuntime::new(); + runtime.start_session(session_request()).unwrap(); + + let response = runtime + .act(ComputerActionRequest { + session_id: "session-1".to_string(), + operation: ComputerActionOperation::Click, + target: ComputerTarget { + scope: Some("screen".to_string()), + x: Some(10), + y: Some(20), + ..Default::default() + }, + value: None, + execution_mode: ComputerExecutionMode::Foreground, + 
reason: "test click".to_string(), + route_hint: ComputerRoute::Auto, + timeout_ms: 8000, + options: ComputerActionOptions { + allow_background: false, + dry_run: true, + post_action_observe: false, + }, + }) + .unwrap(); + + assert_eq!(response.route, ComputerRoute::Win32SendInput); + assert_eq!(response.lane, ComputerLane::VisionFallback); + assert_eq!(response.status, ComputerActionStatus::Success); + assert_eq!(response.target_resolved.x, Some(10)); + } + + #[test] + fn screen_capture_route_cannot_execute_actions() { + let runtime = ComputerUseRuntime::new(); + runtime.start_session(session_request()).unwrap(); + + let response = runtime + .act(ComputerActionRequest { + session_id: "session-1".to_string(), + operation: ComputerActionOperation::Click, + target: ComputerTarget { + element_id: Some("window:100".to_string()), + ..Default::default() + }, + value: None, + execution_mode: ComputerExecutionMode::Foreground, + reason: "test route".to_string(), + route_hint: ComputerRoute::ScreenCapture, + timeout_ms: 8000, + options: ComputerActionOptions { + allow_background: false, + dry_run: true, + post_action_observe: false, + }, + }) + .unwrap(); + + assert_eq!(response.route, ComputerRoute::ScreenCapture); + assert_eq!(response.status, ComputerActionStatus::Blocked); + assert_eq!( + response.warnings, + vec!["route 'screen.capture' cannot execute computer actions"] + ); + } + + #[test] + fn window_message_route_requires_background_execution() { + let runtime = ComputerUseRuntime::new(); + runtime.start_session(session_request()).unwrap(); + + let response = runtime + .act(ComputerActionRequest { + session_id: "session-1".to_string(), + operation: ComputerActionOperation::Click, + target: ComputerTarget { + x: Some(10), + y: Some(20), + ..Default::default() + }, + value: None, + execution_mode: ComputerExecutionMode::Foreground, + reason: "test foreground route hint".to_string(), + route_hint: ComputerRoute::Win32Message, + timeout_ms: 8000, + options: 
ComputerActionOptions { + allow_background: false, + dry_run: true, + post_action_observe: false, + }, + }) + .unwrap(); + + assert_eq!(response.route, ComputerRoute::Win32Message); + assert_eq!(response.status, ComputerActionStatus::Blocked); + assert_eq!( + response.warnings, + vec!["route 'win32.message' requires background execution"] + ); + } + + #[test] + fn background_coordinate_actions_are_rejected() { + let runtime = ComputerUseRuntime::new(); + runtime.start_session(session_request()).unwrap(); + + let response = runtime + .act(ComputerActionRequest { + session_id: "session-1".to_string(), + operation: ComputerActionOperation::Click, + target: ComputerTarget { + x: Some(10), + y: Some(20), + ..Default::default() + }, + value: None, + execution_mode: ComputerExecutionMode::Background, + reason: "test background".to_string(), + route_hint: ComputerRoute::Auto, + timeout_ms: 8000, + options: ComputerActionOptions { + allow_background: true, + dry_run: true, + post_action_observe: false, + }, + }) + .unwrap(); + + assert_eq!(response.route, ComputerRoute::Win32Message); + assert_eq!(response.status, ComputerActionStatus::Blocked); + assert_eq!( + response.warnings, + vec!["coordinate targets cannot be executed in background mode"] + ); + } + + #[test] + fn background_window_actions_require_observed_native_targets() { + let runtime = ComputerUseRuntime::new(); + runtime.start_session(session_request()).unwrap(); + + let response = runtime + .act(ComputerActionRequest { + session_id: "session-1".to_string(), + operation: ComputerActionOperation::Click, + target: ComputerTarget { + element_id: Some("window:100".to_string()), + ..Default::default() + }, + value: None, + execution_mode: ComputerExecutionMode::Background, + reason: "test fabricated background target".to_string(), + route_hint: ComputerRoute::Auto, + timeout_ms: 8000, + options: ComputerActionOptions { + allow_background: true, + dry_run: true, + post_action_observe: false, + }, + }) + .unwrap(); + 
+ assert_eq!(response.route, ComputerRoute::Win32Message); + assert_eq!(response.status, ComputerActionStatus::Blocked); + assert_eq!( + response.warnings, + vec!["native target was not observed in this computer session"] + ); + } + + #[test] + fn background_window_actions_do_not_trust_session_target_native_ids() { + let runtime = ComputerUseRuntime::new(); + let mut request = session_request(); + request.target.element_id = Some("window:0".to_string()); + runtime.start_session(request).unwrap(); + + let response = runtime + .act(ComputerActionRequest { + session_id: "session-1".to_string(), + operation: ComputerActionOperation::Click, + target: ComputerTarget { + element_id: Some("window:0".to_string()), + ..Default::default() + }, + value: None, + execution_mode: ComputerExecutionMode::Background, + reason: "test stale background target".to_string(), + route_hint: ComputerRoute::Auto, + timeout_ms: 8000, + options: ComputerActionOptions { + allow_background: true, + dry_run: true, + post_action_observe: false, + }, + }) + .unwrap(); + + assert_eq!(response.route, ComputerRoute::Win32Message); + assert_eq!(response.status, ComputerActionStatus::Blocked); + assert_eq!( + response.warnings, + vec!["native target was not observed in this computer session"] + ); + } + + #[test] + fn background_window_actions_reject_stale_observed_native_targets() { + let runtime = ComputerUseRuntime::new(); + runtime.start_session(session_request()).unwrap(); + { + let mut state = runtime.state.lock().unwrap(); + state + .sessions + .get_mut("session-1") + .unwrap() + .observed_native_ids + .insert("window:0".to_string()); + } + + let response = runtime + .act(ComputerActionRequest { + session_id: "session-1".to_string(), + operation: ComputerActionOperation::Click, + target: ComputerTarget { + element_id: Some("window:0".to_string()), + ..Default::default() + }, + value: None, + execution_mode: ComputerExecutionMode::Background, + reason: "test stale background target".to_string(), + 
route_hint: ComputerRoute::Auto, + timeout_ms: 8000, + options: ComputerActionOptions { + allow_background: true, + dry_run: true, + post_action_observe: false, + }, + }) + .unwrap(); + + assert_eq!(response.route, ComputerRoute::Win32Message); + assert_eq!(response.status, ComputerActionStatus::Blocked); + assert_eq!( + response.warnings, + vec!["native window target is no longer valid"] + ); + } + + #[cfg(target_os = "windows")] + #[test] + fn mouse_lparam_packs_client_coordinates() { + let lparam = mouse_lparam(12, 34).unwrap(); + + assert_eq!(lparam.0 as usize, 0x0022_000c); + } + + #[test] + fn post_action_observe_returns_follow_up_observation_for_successful_actions() { + let runtime = ComputerUseRuntime::new(); + runtime.start_session(session_request()).unwrap(); + + let response = runtime + .act(ComputerActionRequest { + session_id: "session-1".to_string(), + operation: ComputerActionOperation::Wait, + target: ComputerTarget { + scope: Some("foreground".to_string()), + ..Default::default() + }, + value: None, + execution_mode: ComputerExecutionMode::Foreground, + reason: "test post-action observation".to_string(), + route_hint: ComputerRoute::Auto, + timeout_ms: 1, + options: ComputerActionOptions { + allow_background: false, + dry_run: false, + post_action_observe: true, + }, + }) + .unwrap(); + + assert_eq!(response.status, ComputerActionStatus::Success); + assert!(response.post_action_observation.is_some()); + } +} diff --git a/apps/desktop/src-tauri/src/core/built_in_tools/mod.rs b/apps/desktop/src-tauri/src/core/built_in_tools/mod.rs index 37e8dba0..86f71bd6 100644 --- a/apps/desktop/src-tauri/src/core/built_in_tools/mod.rs +++ b/apps/desktop/src-tauri/src/core/built_in_tools/mod.rs @@ -3,11 +3,17 @@ //! 
内置工具原生能力。 mod bash; +mod computer; #[cfg(target_os = "windows")] mod process_utils; mod registry; mod types; pub use bash::execute_bash; +pub use computer::{computer_act, computer_observe, computer_session, ComputerRuntime}; pub use registry::{BashExecutionRegistry, BuiltInProcessExecutionRegistry}; -pub use types::{BuiltInBashExecutionRequest, BuiltInBashExecutionResponse}; +pub use types::{ + BuiltInBashExecutionRequest, BuiltInBashExecutionResponse, ComputerActionRequest, + ComputerActionResponse, ComputerObservationRequest, ComputerObservationResponse, + ComputerSessionRequest, ComputerSessionResponse, +}; diff --git a/apps/desktop/src-tauri/src/core/built_in_tools/types.rs b/apps/desktop/src-tauri/src/core/built_in_tools/types.rs index 8159b1ea..084ba450 100644 --- a/apps/desktop/src-tauri/src/core/built_in_tools/types.rs +++ b/apps/desktop/src-tauri/src/core/built_in_tools/types.rs @@ -54,3 +54,349 @@ pub struct BuiltInBashExecutionResponse { #[serde(default)] pub compressed: bool, } + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputerCapability { + NativeTree, + Screenshot, + BackgroundActions, + VisionFallback, + BrowserDom, + ExternalProvider, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputerObservationMode { + Tree, + Screenshot, + TreeAndScreenshot, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputerObservationInclude { + Displays, + Windows, + Tree, + Screenshot, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputerExecutionMode { + Foreground, + Background, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputerRoute { + Auto, + #[serde(rename = "win32.send_input")] + Win32SendInput, + #[serde(rename = 
"win32.message")] + Win32Message, + #[serde(rename = "screen.capture")] + ScreenCapture, + Unsupported, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputerActionOperation { + Click, + DoubleClick, + RightClick, + Move, + Drag, + Scroll, + TypeText, + PressKey, + Hotkey, + Wait, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputerLane { + NativeTree, + VisionFallback, + BrowserDom, + ExternalProvider, + Unsupported, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputerSessionStatus { + Ready, + Unsupported, + Error, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputerActionStatus { + Success, + Unsupported, + Blocked, + Error, +} + +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerBounds { + pub x: i32, + pub y: i32, + pub width: i32, + pub height: i32, +} + +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerWindowTarget { + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub title: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub process_name: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerElementTarget { + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub role: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerCoordinateTarget { + pub 
x: i32, + pub y: i32, + #[serde(skip_serializing_if = "Option::is_none")] + pub width: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub height: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub display_id: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerTarget { + #[serde(skip_serializing_if = "Option::is_none")] + pub scope: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub window: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub element: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub coordinates: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub label: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub window_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub element_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub display_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub x: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub y: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub width: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub height: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ComputerSessionRequest { + pub session_id: String, + pub target: ComputerTarget, + #[serde(default)] + pub capabilities: Vec, + #[serde(default)] + pub provider_hints: Vec, + pub reason: String, + pub timeout_ms: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerCapabilitySnapshot { + pub platform: String, + pub lanes: Vec, + pub routes: Vec, + pub background: ComputerBackgroundCapability, + pub grounding: ComputerGroundingCapability, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] 
+pub struct ComputerBackgroundCapability { + pub supported: bool, + pub routes: Vec, + pub limitations: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerGroundingCapability { + pub tree: bool, + pub screenshot: bool, + pub click_prediction: bool, + pub external_providers: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerSessionResponse { + pub session_id: String, + pub status: ComputerSessionStatus, + pub capabilities: ComputerCapabilitySnapshot, + pub target: ComputerTarget, + pub warnings: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ComputerObservationRequest { + pub session_id: String, + pub mode: ComputerObservationMode, + pub target: ComputerTarget, + #[serde(default)] + pub include: Vec, + pub reason: String, + pub timeout_ms: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerDisplaySnapshot { + pub id: String, + pub x: i32, + pub y: i32, + pub width: i32, + pub height: i32, + pub scale_factor: f64, + pub primary: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerWindowSnapshot { + pub element_id: String, + pub title: String, + pub process_name: Option, + pub bounds: ComputerBounds, + pub focused: bool, + pub visible: bool, + pub native: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerElementSnapshot { + pub element_id: String, + pub role: String, + pub name: String, + pub bounds: Option, + pub states: Vec, + pub value: Option, + pub children: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerObservationTree { + pub lane: ComputerLane, + pub 
elements: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerScreenshotSnapshot { + pub format: String, + pub width: i32, + pub height: i32, + pub data_base64: Option, + pub path: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerObservationResponse { + pub observation_id: String, + pub session_id: String, + pub platform: String, + pub target: ComputerTarget, + pub displays: Vec, + pub windows: Vec, + pub tree: Option, + pub screenshot: Option, + pub warnings: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +#[serde(rename_all = "camelCase")] +pub struct ComputerActionOptions { + #[serde(default)] + pub allow_background: bool, + #[serde(default)] + pub dry_run: bool, + #[serde(default)] + pub post_action_observe: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ComputerActionRequest { + pub session_id: String, + pub operation: ComputerActionOperation, + pub target: ComputerTarget, + pub value: Option, + pub execution_mode: ComputerExecutionMode, + pub reason: String, + pub route_hint: ComputerRoute, + pub timeout_ms: u64, + #[serde(default)] + pub options: ComputerActionOptions, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerResolvedTarget { + pub x: Option, + pub y: Option, + pub element_id: Option, + pub window_id: Option, + pub confidence: f64, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ComputerActionResponse { + pub action_id: String, + pub session_id: String, + pub operation: ComputerActionOperation, + pub route: ComputerRoute, + pub lane: ComputerLane, + pub background_safe: bool, + pub cursor_moved: bool, + pub foreground_changed: bool, + pub target_resolved: ComputerResolvedTarget, + pub 
status: ComputerActionStatus, + pub warnings: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub post_action_observation: Option, +} diff --git a/apps/desktop/src-tauri/src/lib.rs b/apps/desktop/src-tauri/src/lib.rs index abc5a880..43023aa0 100644 --- a/apps/desktop/src-tauri/src/lib.rs +++ b/apps/desktop/src-tauri/src/lib.rs @@ -5,7 +5,7 @@ mod core; #[doc(hidden)] pub mod testing; -use core::built_in_tools::BuiltInProcessExecutionRegistry; +use core::built_in_tools::{BuiltInProcessExecutionRegistry, ComputerRuntime}; use core::database::DatabaseRuntime; use core::mcp::McpClientManager; use core::setup; @@ -110,6 +110,7 @@ pub fn run() { .manage(core::window::status_reminder::SessionStatusReminderNotificationRuntime::new()) .manage(core::window::tray::TrayStatusRuntime::new()) .manage(BuiltInProcessExecutionRegistry::new()) + .manage(ComputerRuntime::new()) .manage(McpClientManager::new()) .manage(core::updater::AppUpdaterState::default()) .on_window_event(|window, event| { diff --git a/apps/desktop/src-tauri/src/testing/mod.rs b/apps/desktop/src-tauri/src/testing/mod.rs index a8bb3f28..703f6fb8 100644 --- a/apps/desktop/src-tauri/src/testing/mod.rs +++ b/apps/desktop/src-tauri/src/testing/mod.rs @@ -8,6 +8,7 @@ use tauri::{ use crate::{ commands, core::{ + built_in_tools::ComputerRuntime, database::DatabaseRuntime, updater::AppUpdaterState, window::{ @@ -30,6 +31,7 @@ pub fn test_builder() -> Builder { .manage(SearchSurfaceRuntime::new()) .manage(SessionStatusReminderNotificationRuntime::for_tests()) .manage(TrayStatusRuntime::new()) + .manage(ComputerRuntime::new()) .manage(AppUpdaterState::default()) } diff --git a/apps/desktop/src-tauri/tests/computer_commands.rs b/apps/desktop/src-tauri/tests/computer_commands.rs new file mode 100644 index 00000000..0b89288c --- /dev/null +++ b/apps/desktop/src-tauri/tests/computer_commands.rs @@ -0,0 +1,243 @@ +mod common; + +use common::{build_test_app, invoke_command_err, invoke_command_ok, TestAppOptions}; 
+use serde_json::{json, Value}; + +fn foreground_target() -> serde_json::Value { + json!({ + "scope": "foreground", + "label": "Focused window" + }) +} + +#[test] +fn computer_session_requires_non_empty_session_id_and_reports_capabilities() { + let test_app = build_test_app(TestAppOptions::default()).expect("test app"); + + let error = invoke_command_err( + &test_app.main_webview, + "built_in_tools_computer_session", + json!({ + "request": { + "sessionId": " ", + "target": foreground_target(), + "capabilities": ["native_tree", "screenshot", "background_actions"], + "providerHints": [], + "reason": "start desktop grounding", + "timeoutMs": 8000 + } + }), + ); + assert_eq!(error, json!("sessionId cannot be empty")); + + let response: Value = invoke_command_ok( + &test_app.main_webview, + "built_in_tools_computer_session", + json!({ + "request": { + "sessionId": "session-call-1", + "target": foreground_target(), + "capabilities": ["native_tree", "screenshot", "background_actions"], + "providerHints": [], + "reason": "start desktop grounding", + "timeoutMs": 8000 + } + }), + ); + + assert_eq!(response["sessionId"], json!("session-call-1")); + assert!(matches!( + response["status"].as_str(), + Some("ready" | "unsupported") + )); + assert!(response["capabilities"]["platform"].is_string()); + assert!(response["capabilities"]["lanes"].is_array()); + assert!(response["capabilities"]["routes"].is_array()); + assert_eq!(response["target"], foreground_target()); +} + +#[test] +fn computer_observe_returns_platform_and_requested_snapshot_shapes() { + let test_app = build_test_app(TestAppOptions::default()).expect("test app"); + + let _: Value = invoke_command_ok( + &test_app.main_webview, + "built_in_tools_computer_session", + json!({ + "request": { + "sessionId": "session-call-1", + "target": foreground_target(), + "capabilities": ["native_tree", "screenshot"], + "providerHints": [], + "reason": "start desktop grounding", + "timeoutMs": 8000 + } + }), + ); + + let response: 
Value = invoke_command_ok( + &test_app.main_webview, + "built_in_tools_computer_observe", + json!({ + "request": { + "sessionId": "session-call-1", + "mode": "tree_and_screenshot", + "target": foreground_target(), + "include": ["displays", "windows", "tree", "screenshot"], + "reason": "ground next action", + "timeoutMs": 8000 + } + }), + ); + + assert!(response["observationId"] + .as_str() + .is_some_and(|id| id.starts_with("obs-"))); + assert_eq!(response["sessionId"], json!("session-call-1")); + assert!(response["platform"].is_string()); + assert!(response["displays"].is_array()); + assert!(response["windows"].is_array()); + assert!(response["warnings"].is_array()); + assert!(response.get("tree").is_some()); + assert!(response.get("screenshot").is_some()); +} + +#[test] +fn computer_act_dry_run_returns_stable_receipt() { + let test_app = build_test_app(TestAppOptions::default()).expect("test app"); + + let _: Value = invoke_command_ok( + &test_app.main_webview, + "built_in_tools_computer_session", + json!({ + "request": { + "sessionId": "session-call-1", + "target": foreground_target(), + "capabilities": ["native_tree", "screenshot"], + "providerHints": [], + "reason": "start desktop grounding", + "timeoutMs": 8000 + } + }), + ); + + let response: Value = invoke_command_ok( + &test_app.main_webview, + "built_in_tools_computer_act", + json!({ + "request": { + "sessionId": "session-call-1", + "operation": "click", + "target": { + "scope": "screen", + "x": 120, + "y": 130 + }, + "value": null, + "executionMode": "foreground", + "reason": "validate click routing", + "routeHint": "auto", + "timeoutMs": 8000, + "options": { + "allowBackground": false, + "dryRun": true, + "postActionObserve": false + } + } + }), + ); + + assert!(response["actionId"] + .as_str() + .is_some_and(|id| id.starts_with("act-"))); + assert_eq!(response["sessionId"], json!("session-call-1")); + assert_eq!(response["operation"], json!("click")); + assert_eq!(response["route"], 
json!("win32.send_input")); + assert_eq!(response["lane"], json!("vision_fallback")); + assert_eq!(response["backgroundSafe"], json!(false)); + assert_eq!(response["cursorMoved"], json!(false)); + assert_eq!(response["foregroundChanged"], json!(false)); + assert_eq!(response["targetResolved"]["x"], json!(120)); + assert_eq!(response["targetResolved"]["y"], json!(130)); + assert_eq!(response["status"], json!("success")); +} + +#[test] +fn computer_act_rejects_invalid_route_and_background_coordinate_actions() { + let test_app = build_test_app(TestAppOptions::default()).expect("test app"); + + let _: Value = invoke_command_ok( + &test_app.main_webview, + "built_in_tools_computer_session", + json!({ + "request": { + "sessionId": "session-call-1", + "target": foreground_target(), + "capabilities": ["native_tree", "screenshot", "background_actions"], + "providerHints": [], + "reason": "start desktop grounding", + "timeoutMs": 8000 + } + }), + ); + + let invalid_route: Value = invoke_command_ok( + &test_app.main_webview, + "built_in_tools_computer_act", + json!({ + "request": { + "sessionId": "session-call-1", + "operation": "click", + "target": { "elementId": "window:100" }, + "value": null, + "executionMode": "foreground", + "reason": "bad route", + "routeHint": "screen.capture", + "timeoutMs": 8000, + "options": { + "allowBackground": false, + "dryRun": true, + "postActionObserve": false + } + } + }), + ); + assert_eq!(invalid_route["route"], json!("screen.capture")); + assert_eq!(invalid_route["status"], json!("blocked")); + assert_eq!( + invalid_route["warnings"], + json!(["route 'screen.capture' cannot execute computer actions"]) + ); + + let background_coordinate: Value = invoke_command_ok( + &test_app.main_webview, + "built_in_tools_computer_act", + json!({ + "request": { + "sessionId": "session-call-1", + "operation": "click", + "target": { + "scope": "screen", + "x": 120, + "y": 130 + }, + "value": null, + "executionMode": "background", + "reason": "unsafe 
background coordinate", + "routeHint": "auto", + "timeoutMs": 8000, + "options": { + "allowBackground": true, + "dryRun": true, + "postActionObserve": false + } + } + }), + ); + assert_eq!(background_coordinate["route"], json!("win32.message")); + assert_eq!(background_coordinate["status"], json!("blocked")); + assert_eq!( + background_coordinate["warnings"], + json!(["coordinate targets cannot be executed in background mode"]) + ); +} diff --git a/apps/desktop/src/database/artifacts/runtime/seed.sql b/apps/desktop/src/database/artifacts/runtime/seed.sql index 3fe89d1a..b36fecbd 100644 --- a/apps/desktop/src/database/artifacts/runtime/seed.sql +++ b/apps/desktop/src/database/artifacts/runtime/seed.sql @@ -158,3 +158,39 @@ INSERT INTO built_in_tools ( ) SELECT 'ask_user_question', 'AskUserQuestion', '向用户提出结构化问题', 1, 'low', NULL WHERE NOT EXISTS (SELECT 1 FROM built_in_tools WHERE tool_id = 'ask_user_question'); + +INSERT INTO built_in_tools ( + tool_id, display_name, description, enabled, risk_level, config_json +) +SELECT + 'computer_session', + 'ComputerSession', + '初始化桌面控制会话并报告平台能力', + 0, + 'high', + '{"timeoutMs":8000,"defaultExecutionMode":"foreground","providerHints":["native_windows","external_adapter"],"enableVisionFallback":false}' +WHERE NOT EXISTS (SELECT 1 FROM built_in_tools WHERE tool_id = 'computer_session'); + +INSERT INTO built_in_tools ( + tool_id, display_name, description, enabled, risk_level, config_json +) +SELECT + 'computer_observe', + 'ComputerObserve', + '观察桌面、窗口和原生控件树', + 0, + 'high', + '{"timeoutMs":8000,"defaultExecutionMode":"foreground","providerHints":["native_windows","external_adapter"],"enableVisionFallback":false}' +WHERE NOT EXISTS (SELECT 1 FROM built_in_tools WHERE tool_id = 'computer_observe'); + +INSERT INTO built_in_tools ( + tool_id, display_name, description, enabled, risk_level, config_json +) +SELECT + 'computer_act', + 'ComputerAct', + '执行单步桌面鼠标或键盘动作', + 0, + 'high', + 
'{"timeoutMs":8000,"defaultExecutionMode":"foreground","providerHints":["native_windows","external_adapter"],"enableVisionFallback":false}' +WHERE NOT EXISTS (SELECT 1 FROM built_in_tools WHERE tool_id = 'computer_act'); diff --git a/apps/desktop/src/services/AgentService/execution/executor.ts b/apps/desktop/src/services/AgentService/execution/executor.ts index 33d8ac23..2f827015 100644 --- a/apps/desktop/src/services/AgentService/execution/executor.ts +++ b/apps/desktop/src/services/AgentService/execution/executor.ts @@ -35,6 +35,9 @@ import { PersistenceProjector } from '../outputs/persistence'; import type { TurnEvent } from './runtime'; const BUILT_IN_UPGRADE_TOOL_NAME = 'builtin__upgrade_model'; +const BUILT_IN_TOOL_PREFIX = 'builtin__'; +const BUILT_IN_COMPUTER_ACT_TOOL_ID: BuiltInToolId = 'computer_act'; +const BUILT_IN_COMPUTER_ACT_TOOL_NAME = `${BUILT_IN_TOOL_PREFIX}${BUILT_IN_COMPUTER_ACT_TOOL_ID}`; const MAX_REQUEST_MODEL_SWITCHES = 4; const MODEL_SWITCH_EXCLUDED_TOOL_NAMES = [BUILT_IN_UPGRADE_TOOL_NAME]; const toolArgumentsSchema = z.record(z.string(), z.unknown()); @@ -688,6 +691,7 @@ export class AiRequestExecutor { toolCall: AiToolCall; toolCallMessageId: number | null; persister: PersistenceProjector; + executedBuiltInToolsInRound: Set; } & RequestExecutionCallbacks ): Promise { throwIfAborted(options.signal); @@ -718,7 +722,10 @@ export class AiRequestExecutor { toolArgs, iteration: runtime.iteration, currentModel: runtime.activeModel, - hasExecutedBuiltInTool: (toolId) => runtime.executedBuiltInTools.has(toolId), + hasExecutedBuiltInTool: (toolId) => + toolId === BUILT_IN_COMPUTER_ACT_TOOL_ID + ? 
options.executedBuiltInToolsInRound.has(toolId) + : runtime.executedBuiltInTools.has(toolId), signal: options.signal, toolCallMessageId: options.toolCallMessageId, sessionId: options.persister.getSessionId(), @@ -767,19 +774,40 @@ export class AiRequestExecutor { options.step.chunkReasoning ); - const toolResults = await Promise.all( - options.step.toolCalls.map((toolCall) => - this.executeToolCall(runtime, { - toolCall, - toolCallMessageId, - persister: options.persister, - signal: options.signal, - onChunk: options.onChunk, - requestToolApproval: options.requestToolApproval, - requestUserQuestions: options.requestUserQuestions, - }) - ) + const shouldExecuteSequentially = options.step.toolCalls.some( + (toolCall) => toolCall.name === BUILT_IN_COMPUTER_ACT_TOOL_NAME ); + const executedBuiltInToolsInRound = new Set(); + const executeToolCall = (toolCall: AiToolCall) => + this.executeToolCall(runtime, { + toolCall, + toolCallMessageId, + persister: options.persister, + executedBuiltInToolsInRound, + signal: options.signal, + onChunk: options.onChunk, + requestToolApproval: options.requestToolApproval, + requestUserQuestions: options.requestUserQuestions, + }); + const toolResults: ToolExecutionResult[] = []; + + if (shouldExecuteSequentially) { + for (const toolCall of options.step.toolCalls) { + const toolResult = await executeToolCall(toolCall); + toolResults.push(toolResult); + + if (toolResult.builtInToolId && !toolResult.isError) { + executedBuiltInToolsInRound.add(toolResult.builtInToolId); + runtime.executedBuiltInTools.add(toolResult.builtInToolId); + } + } + } else { + toolResults.push( + ...(await Promise.all( + options.step.toolCalls.map((toolCall) => executeToolCall(toolCall)) + )) + ); + } let requestedModelSwitch: BuiltInToolControlSignal | null = null; for (const { @@ -808,6 +836,7 @@ export class AiRequestExecutor { ); if (builtInToolId && !isError) { + executedBuiltInToolsInRound.add(builtInToolId); runtime.executedBuiltInTools.add(builtInToolId); } 
diff --git a/apps/desktop/src/services/BuiltInToolService/registry.ts b/apps/desktop/src/services/BuiltInToolService/registry.ts index efd4eb2d..09a5053e 100644 --- a/apps/desktop/src/services/BuiltInToolService/registry.ts +++ b/apps/desktop/src/services/BuiltInToolService/registry.ts @@ -2,6 +2,7 @@ import { builtInTools as askUserTools } from './tools/askUser'; import { builtInTools as bashTools } from './tools/bash'; +import { builtInTools as computerTools } from './tools/computer'; import { builtInTools as fileSearchTools } from './tools/fileSearch'; import { builtInTools as readTools } from './tools/read'; import { builtInTools as settingTools } from './tools/setting'; @@ -55,6 +56,7 @@ export const builtInToolRegistry = new BuiltInToolRegistry(); builtInToolRegistry.register(askUserTools); builtInToolRegistry.register(bashTools); +builtInToolRegistry.register(computerTools); builtInToolRegistry.register(fileSearchTools); builtInToolRegistry.register(readTools); builtInToolRegistry.register(settingTools); diff --git a/apps/desktop/src/services/BuiltInToolService/tools/computer/constants.ts b/apps/desktop/src/services/BuiltInToolService/tools/computer/constants.ts new file mode 100644 index 00000000..536a3d8a --- /dev/null +++ b/apps/desktop/src/services/BuiltInToolService/tools/computer/constants.ts @@ -0,0 +1,397 @@ +// Copyright (c) 2026. 千诚. 
Licensed under GPL v3 + +import type { + ComputerActionOperation, + ComputerCapability, + ComputerExecutionMode, + ComputerObservationInclude, + ComputerObservationMode, + ComputerRouteHint, + ComputerTarget, +} from '@services/NativeService'; + +import type { AiToolDefinition } from '@/services/AgentService/contracts/tooling'; + +import { + nonEmptyTrimmedStringSchema, + optionalIntegerInRangeSchema, + z, +} from '../../utils/toolSchema'; + +export const COMPUTER_SESSION_TOOL_NAME = 'computer_session'; +export const COMPUTER_OBSERVE_TOOL_NAME = 'computer_observe'; +export const COMPUTER_ACT_TOOL_NAME = 'computer_act'; + +export interface ComputerToolConfig { + timeoutMs: number; + defaultExecutionMode: ComputerExecutionMode; + providerHints: string[]; + enableVisionFallback: boolean; +} + +export const DEFAULT_COMPUTER_TOOL_CONFIG: ComputerToolConfig = { + timeoutMs: 8000, + defaultExecutionMode: 'foreground', + providerHints: ['native_windows', 'external_adapter'], + enableVisionFallback: false, +}; + +export const computerCapabilityValues = [ + 'native_tree', + 'screenshot', + 'background_actions', + 'vision_fallback', + 'browser_dom', + 'external_provider', +] as const satisfies readonly ComputerCapability[]; + +export const computerObservationModeValues = [ + 'tree', + 'screenshot', + 'tree_and_screenshot', +] as const satisfies readonly ComputerObservationMode[]; + +export const computerObservationIncludeValues = [ + 'displays', + 'windows', + 'tree', + 'screenshot', +] as const satisfies readonly ComputerObservationInclude[]; + +export const computerExecutionModeValues = [ + 'foreground', + 'background', +] as const satisfies readonly ComputerExecutionMode[]; + +export const computerRouteHintValues = [ + 'auto', + 'win32.send_input', + 'win32.message', + 'screen.capture', + 'unsupported', +] as const satisfies readonly ComputerRouteHint[]; + +export const computerActionOperationValues = [ + 'click', + 'double_click', + 'right_click', + 'move', + 'drag', + 
'scroll', + 'type_text', + 'press_key', + 'hotkey', + 'wait', +] as const satisfies readonly ComputerActionOperation[]; + +const targetSchemaBase = z + .object({ + scope: z.enum(['foreground', 'screen', 'window', 'element', 'region']).optional(), + label: z.string().trim().min(1).optional(), + window: z + .object({ + id: z.string().trim().min(1).optional(), + title: z.string().trim().min(1).optional(), + processName: z.string().trim().min(1).optional(), + }) + .optional(), + element: z + .object({ + id: z.string().trim().min(1).optional(), + role: z.string().trim().min(1).optional(), + name: z.string().trim().min(1).optional(), + }) + .optional(), + coordinates: z + .object({ + x: z.number().finite(), + y: z.number().finite(), + width: z.number().finite().positive().optional(), + height: z.number().finite().positive().optional(), + displayId: z.string().trim().min(1).optional(), + }) + .optional(), + windowId: z.string().trim().min(1).optional(), + elementId: z.string().trim().min(1).optional(), + displayId: z.string().trim().min(1).optional(), + x: z.number().finite().optional(), + y: z.number().finite().optional(), + width: z.number().finite().positive().optional(), + height: z.number().finite().positive().optional(), + }) + .strict(); + +export const computerTargetSchema: z.ZodType = targetSchemaBase + .superRefine((target, ctx) => { + const hasX = target.x !== undefined; + const hasY = target.y !== undefined; + if (hasX !== hasY) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: 'target.x and target.y must be provided together', + path: hasX ? ['y'] : ['x'], + }); + } + + const hasWidth = target.width !== undefined; + const hasHeight = target.height !== undefined; + if (hasWidth !== hasHeight) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: 'target.width and target.height must be provided together', + path: hasWidth ? 
['height'] : ['width'], + }); + } + + if (target.coordinates) { + const hasCoordinateWidth = target.coordinates.width !== undefined; + const hasCoordinateHeight = target.coordinates.height !== undefined; + if (hasCoordinateWidth !== hasCoordinateHeight) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: + 'target.coordinates.width and target.coordinates.height must be provided together', + path: ['coordinates', hasCoordinateWidth ? 'height' : 'width'], + }); + } + } + }) + .transform((target) => target); + +export const computerToolConfigSchema = z + .object({ + timeoutMs: optionalIntegerInRangeSchema(1000, 120000).catch(undefined), + defaultExecutionMode: z.enum(computerExecutionModeValues).optional().catch(undefined), + providerHints: z.array(nonEmptyTrimmedStringSchema).optional().catch(undefined), + enableVisionFallback: z.boolean().optional().catch(undefined), + }) + .transform( + (value): ComputerToolConfig => ({ + timeoutMs: value.timeoutMs ?? DEFAULT_COMPUTER_TOOL_CONFIG.timeoutMs, + defaultExecutionMode: + value.defaultExecutionMode ?? DEFAULT_COMPUTER_TOOL_CONFIG.defaultExecutionMode, + providerHints: + value.providerHints && value.providerHints.length > 0 + ? value.providerHints + : DEFAULT_COMPUTER_TOOL_CONFIG.providerHints, + enableVisionFallback: + value.enableVisionFallback ?? 
DEFAULT_COMPUTER_TOOL_CONFIG.enableVisionFallback, + }) + ); + +export const computerSessionArgsSchema = z + .object({ + sessionId: z.string().trim().min(1).optional(), + target: computerTargetSchema.default({ scope: 'foreground' }), + capabilities: z.array(z.enum(computerCapabilityValues)).optional(), + providerHints: z.array(nonEmptyTrimmedStringSchema).optional(), + reason: nonEmptyTrimmedStringSchema, + }) + .strict(); + +export const computerObserveArgsSchema = z + .object({ + sessionId: nonEmptyTrimmedStringSchema, + mode: z.enum(computerObservationModeValues).default('tree'), + target: computerTargetSchema.default({ scope: 'foreground' }), + include: z.array(z.enum(computerObservationIncludeValues)).optional(), + reason: nonEmptyTrimmedStringSchema, + }) + .strict(); + +export const computerActArgsSchema = z + .object({ + sessionId: nonEmptyTrimmedStringSchema, + operation: z.enum(computerActionOperationValues), + target: computerTargetSchema, + value: z.string().optional(), + executionMode: z.enum(computerExecutionModeValues).optional(), + routeHint: z.enum(computerRouteHintValues).optional(), + dryRun: z.boolean().optional(), + postActionObserve: z.boolean().optional(), + reason: nonEmptyTrimmedStringSchema, + }) + .strict() + .superRefine((args, ctx) => { + if (args.operation === 'type_text' && !args.value) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: 'value is required for type_text', + path: ['value'], + }); + } + + if ((args.operation === 'press_key' || args.operation === 'hotkey') && !args.value) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `value is required for ${args.operation}`, + path: ['value'], + }); + } + }); + +export const COMPUTER_SESSION_TOOL_DESCRIPTION = [ + 'Start a native-first computer-use session for the local desktop.', + 'Use this before observing or acting on the operating system UI.', + 'The tool reports available grounding lanes, action routes, and background-action support.', +].join(' '); + 
+export const COMPUTER_OBSERVE_TOOL_DESCRIPTION = [ + 'Observe the local desktop for a computer-use session.', + 'Prefer native UI tree grounding and keep screenshots as fallback when requested.', + 'Use the returned element IDs or coordinates to ground later computer_act calls.', +].join(' '); + +export const COMPUTER_ACT_TOOL_DESCRIPTION = [ + 'Execute one local desktop UI action for a computer-use session.', + 'This tool is native-first and runs at most once per model turn.', + 'Use observed element IDs whenever possible; coordinate targets require both x and y.', +].join(' '); + +const targetInputSchema = { + type: 'object', + description: + 'Desktop target. Use scope foreground for the active window, elementId/windowId from computer_observe for native targets, or x/y coordinates for screen positions.', + properties: { + scope: { + type: 'string', + enum: ['foreground', 'screen', 'window', 'element', 'region'], + description: 'Target scope.', + }, + label: { type: 'string', description: 'Human-readable target label.' }, + window: { + type: 'object', + description: 'Native window target metadata.', + properties: { + id: { type: 'string' }, + title: { type: 'string' }, + processName: { type: 'string' }, + }, + additionalProperties: false, + }, + element: { + type: 'object', + description: 'Native element target metadata.', + properties: { + id: { type: 'string' }, + role: { type: 'string' }, + name: { type: 'string' }, + }, + additionalProperties: false, + }, + coordinates: { + type: 'object', + description: 'Coordinate target. Use only as foreground fallback.', + properties: { + x: { type: 'number' }, + y: { type: 'number' }, + width: { type: 'number' }, + height: { type: 'number' }, + displayId: { type: 'string' }, + }, + required: ['x', 'y'], + additionalProperties: false, + }, + windowId: { type: 'string', description: 'Native window identifier.' }, + elementId: { type: 'string', description: 'Native element identifier.' 
}, + displayId: { type: 'string', description: 'Display identifier.' }, + x: { type: 'number', description: 'Screen x coordinate. Requires y.' }, + y: { type: 'number', description: 'Screen y coordinate. Requires x.' }, + width: { type: 'number', description: 'Region width. Requires height.' }, + height: { type: 'number', description: 'Region height. Requires width.' }, + }, + additionalProperties: false, +} as const; + +export const COMPUTER_SESSION_TOOL_INPUT_SCHEMA: AiToolDefinition['input_schema'] = { + type: 'object', + properties: { + sessionId: { + type: 'string', + description: + 'Optional stable session id. Omit to let TouchAI create one from the tool call id.', + }, + target: targetInputSchema, + capabilities: { + type: 'array', + items: { type: 'string', enum: [...computerCapabilityValues] }, + description: 'Requested capabilities for this session.', + }, + providerHints: { + type: 'array', + items: { type: 'string' }, + description: 'Optional provider hints for the native bridge.', + }, + reason: { + type: 'string', + description: 'Why this desktop session is needed.', + }, + }, + required: ['reason'], + additionalProperties: false, +}; + +export const COMPUTER_OBSERVE_TOOL_INPUT_SCHEMA: AiToolDefinition['input_schema'] = { + type: 'object', + properties: { + sessionId: { type: 'string', description: 'Session id returned by computer_session.' }, + mode: { + type: 'string', + enum: [...computerObservationModeValues], + description: + 'Observation mode. Prefer tree for native grounding; request tree_and_screenshot only when screenshot fallback is needed.', + default: 'tree', + }, + target: targetInputSchema, + include: { + type: 'array', + items: { type: 'string', enum: [...computerObservationIncludeValues] }, + description: 'Observation payload sections to return.', + }, + reason: { type: 'string', description: 'Why this observation is needed.' 
}, + }, + required: ['sessionId', 'reason'], + additionalProperties: false, +}; + +export const COMPUTER_ACT_TOOL_INPUT_SCHEMA: AiToolDefinition['input_schema'] = { + type: 'object', + properties: { + sessionId: { type: 'string', description: 'Session id returned by computer_session.' }, + operation: { + type: 'string', + enum: [...computerActionOperationValues], + description: 'Action operation to execute.', + }, + target: targetInputSchema, + value: { + type: 'string', + description: 'Text for type_text or key/key chord for press_key and hotkey.', + }, + executionMode: { + type: 'string', + enum: [...computerExecutionModeValues], + description: + 'foreground uses ordinary input. background is allowed only for native element or window targets.', + }, + routeHint: { + type: 'string', + enum: [...computerRouteHintValues], + description: 'Optional native route preference. auto is recommended.', + default: 'auto', + }, + dryRun: { + type: 'boolean', + description: 'Validate and resolve the action without applying it.', + default: false, + }, + postActionObserve: { + type: 'boolean', + description: 'Ask the native bridge to observe after the action.', + default: false, + }, + reason: { type: 'string', description: 'Why this desktop action is needed.' }, + }, + required: ['sessionId', 'operation', 'target', 'reason'], + additionalProperties: false, +}; diff --git a/apps/desktop/src/services/BuiltInToolService/tools/computer/helper.ts b/apps/desktop/src/services/BuiltInToolService/tools/computer/helper.ts new file mode 100644 index 00000000..5c25cf67 --- /dev/null +++ b/apps/desktop/src/services/BuiltInToolService/tools/computer/helper.ts @@ -0,0 +1,247 @@ +// Copyright (c) 2026. 千诚. 
Licensed under GPL v3 + +import type { + ComputerActionReceipt, + ComputerActionRequest, + ComputerActionResponse, + ComputerCapability, + ComputerObservationInclude, + ComputerObservationRequest, + ComputerObservationResponse, + ComputerSessionRequest, + ComputerSessionResponse, + ComputerTarget, +} from '@services/NativeService'; + +import { parseToolArguments, parseToolConfigJson } from '../../utils/toolSchema'; +import { + COMPUTER_ACT_TOOL_NAME, + COMPUTER_OBSERVE_TOOL_NAME, + COMPUTER_SESSION_TOOL_NAME, + computerActArgsSchema, + computerObserveArgsSchema, + computerSessionArgsSchema, + type ComputerToolConfig, + computerToolConfigSchema, + DEFAULT_COMPUTER_TOOL_CONFIG, +} from './constants'; + +const DEFAULT_SESSION_CAPABILITIES: ComputerCapability[] = [ + 'native_tree', + 'screenshot', + 'background_actions', +]; + +const DEFAULT_OBSERVE_INCLUDES: ComputerObservationInclude[] = ['displays', 'windows', 'tree']; + +export function parseComputerToolConfig(configJson: string | null): ComputerToolConfig { + return parseToolConfigJson(computerToolConfigSchema, configJson, DEFAULT_COMPUTER_TOOL_CONFIG); +} + +function cleanTarget(target: ComputerTarget): ComputerTarget { + const normalized: ComputerTarget = { + ...target, + windowId: target.windowId ?? target.window?.id, + elementId: target.elementId ?? target.element?.id, + x: target.x ?? target.coordinates?.x, + y: target.y ?? target.coordinates?.y, + width: target.width ?? target.coordinates?.width, + height: target.height ?? target.coordinates?.height, + displayId: target.displayId ?? 
target.coordinates?.displayId, + }; + + return Object.fromEntries( + Object.entries(normalized).filter(([, value]) => value !== undefined) + ) as ComputerTarget; +} + +function defaultProviderHints(config: ComputerToolConfig): string[] { + return config.providerHints.filter((providerHint) => { + if (providerHint === 'cua' || providerHint === 'omniparser' || providerHint === 'ui_tars') { + return config.enableVisionFallback; + } + + return true; + }); +} + +function defaultCapabilities(config: ComputerToolConfig): ComputerCapability[] { + return config.enableVisionFallback + ? [...DEFAULT_SESSION_CAPABILITIES, 'vision_fallback'] + : DEFAULT_SESSION_CAPABILITIES; +} + +export function buildComputerSessionRequest( + args: Record, + config: ComputerToolConfig, + callId: string +): ComputerSessionRequest { + const parsedArgs = parseToolArguments( + COMPUTER_SESSION_TOOL_NAME, + computerSessionArgsSchema, + args + ); + + return { + sessionId: parsedArgs.sessionId ?? `session-${callId}`, + target: cleanTarget(parsedArgs.target), + capabilities: parsedArgs.capabilities ?? defaultCapabilities(config), + providerHints: parsedArgs.providerHints ?? defaultProviderHints(config), + reason: parsedArgs.reason, + timeoutMs: config.timeoutMs, + }; +} + +export function buildComputerObservationRequest( + args: Record, + config: ComputerToolConfig +): ComputerObservationRequest { + const parsedArgs = parseToolArguments( + COMPUTER_OBSERVE_TOOL_NAME, + computerObserveArgsSchema, + args + ); + + return { + sessionId: parsedArgs.sessionId, + mode: parsedArgs.mode, + target: cleanTarget(parsedArgs.target), + include: parsedArgs.include ?? 
DEFAULT_OBSERVE_INCLUDES, + reason: parsedArgs.reason, + timeoutMs: config.timeoutMs, + }; +} + +function isNativeElementTarget(target: ComputerTarget): boolean { + return Boolean(target.elementId || target.windowId || target.element?.id || target.window?.id); +} + +export function buildComputerActionRequest( + args: Record, + config: ComputerToolConfig +): ComputerActionRequest { + const parsedArgs = parseToolArguments(COMPUTER_ACT_TOOL_NAME, computerActArgsSchema, args); + const target = cleanTarget(parsedArgs.target); + const executionMode = parsedArgs.executionMode ?? config.defaultExecutionMode; + + if (executionMode === 'background' && !isNativeElementTarget(target)) { + throw new Error('background execution requires a native elementId or windowId target'); + } + + return { + sessionId: parsedArgs.sessionId, + operation: parsedArgs.operation, + target, + value: parsedArgs.value ?? null, + executionMode, + reason: parsedArgs.reason, + routeHint: parsedArgs.routeHint ?? 'auto', + timeoutMs: config.timeoutMs, + options: { + allowBackground: executionMode === 'background', + dryRun: parsedArgs.dryRun ?? false, + postActionObserve: parsedArgs.postActionObserve ?? false, + }, + }; +} + +function warningLines(warnings?: readonly string[]): string[] { + if (!warnings || warnings.length === 0) { + return []; + } + + return [`Warnings: ${warnings.join('; ')}`]; +} + +export function formatComputerSessionResult(response: ComputerSessionResponse): string { + const lines = [ + `Computer session ${response.status}: ${response.sessionId}`, + `Target: ${response.target.label ?? response.target.scope ?? 'unspecified'}`, + `Lanes: ${response.capabilities.lanes.join(', ') || 'none'}`, + `Routes: ${response.capabilities.routes.join(', ') || 'none'}`, + `Background actions: ${response.capabilities.background.supported ? 
'supported' : 'unsupported'}`, + ...warningLines(response.warnings), + ]; + + if (response.status === 'ready') { + lines[0] = `Computer session ready: ${response.sessionId}`; + } + + return lines.join('\n'); +} + +export function formatComputerObservationResult(response: ComputerObservationResponse): string { + const focusedWindow = response.windows.find((window) => window.focused); + const lines = [ + `Observation ${response.observationId} for session ${response.sessionId}`, + `Platform: ${response.platform}`, + `Target: ${response.target.label ?? response.target.scope ?? 'unspecified'}`, + `Displays: ${response.displays.length}`, + `Windows: ${response.windows.length}${focusedWindow ? ` (${focusedWindow.title})` : ''}`, + ]; + + if (response.tree) { + lines.push(`Tree lane: ${response.tree.lane}; elements: ${response.tree.elements.length}`); + } + + if (response.screenshot) { + lines.push( + `Screenshot: ${response.screenshot.width}x${response.screenshot.height} ${response.screenshot.format}` + ); + } + + lines.push(...warningLines(response.warnings)); + return lines.join('\n'); +} + +export function formatComputerActionResult(response: ComputerActionResponse): string { + const receipt = computerActionReceipt(response); + const lines = [ + `Computer action ${receipt.status}: ${response.actionId}`, + JSON.stringify( + { + operation: response.operation, + route: receipt.route, + lane: receipt.lane, + backgroundSafe: receipt.backgroundSafe, + cursorMoved: receipt.cursorMoved, + foregroundChanged: receipt.foregroundChanged, + targetResolved: receipt.targetResolved, + status: receipt.status, + warnings: receipt.warnings, + }, + null, + 2 + ), + ]; + + if (response.postActionObservation) { + const observation = response.postActionObservation; + lines.push( + [ + `Post-action observation: ${observation.observationId}`, + `Windows: ${observation.windows.length}`, + `Tree elements: ${observation.tree?.elements.length ?? 
0}`, + ].join('\n') + ); + } + + return lines.join('\n'); +} + +export function computerActionReceipt(response: ComputerActionResponse): ComputerActionReceipt { + if ('receipt' in response) { + return response.receipt; + } + + return { + route: response.route, + lane: response.lane, + backgroundSafe: response.backgroundSafe, + cursorMoved: response.cursorMoved, + foregroundChanged: response.foregroundChanged, + targetResolved: response.targetResolved, + status: response.status, + warnings: response.warnings, + }; +} diff --git a/apps/desktop/src/services/BuiltInToolService/tools/computer/index.ts b/apps/desktop/src/services/BuiltInToolService/tools/computer/index.ts new file mode 100644 index 00000000..fdb2e8bf --- /dev/null +++ b/apps/desktop/src/services/BuiltInToolService/tools/computer/index.ts @@ -0,0 +1,223 @@ +// Copyright (c) 2026. 千诚. Licensed under GPL v3 + +import { native } from '@services/NativeService'; + +import { AiError, AiErrorCode } from '@/services/AgentService/contracts/errors'; +import { normalizeOptionalString, truncateText } from '@/utils/text'; + +import { + type BaseBuiltInToolExecutionContext, + BuiltInTool, + type BuiltInToolConversationSemantic, + type BuiltInToolExecutionResult, + type BuiltInToolGroup, +} from '../../types'; +import { + COMPUTER_ACT_TOOL_DESCRIPTION, + COMPUTER_ACT_TOOL_INPUT_SCHEMA, + COMPUTER_OBSERVE_TOOL_DESCRIPTION, + COMPUTER_OBSERVE_TOOL_INPUT_SCHEMA, + COMPUTER_SESSION_TOOL_DESCRIPTION, + COMPUTER_SESSION_TOOL_INPUT_SCHEMA, + type ComputerToolConfig, + DEFAULT_COMPUTER_TOOL_CONFIG, +} from './constants'; +import { + buildComputerActionRequest, + buildComputerObservationRequest, + buildComputerSessionRequest, + computerActionReceipt, + formatComputerActionResult, + formatComputerObservationResult, + formatComputerSessionResult, + parseComputerToolConfig, +} from './helper'; + +function errorResult(error: unknown): BuiltInToolExecutionResult { + if (error instanceof AiError && error.code === 
AiErrorCode.REQUEST_CANCELLED) { + throw error; + } + + const errorMessage = error instanceof Error ? error.message : String(error); + return { + result: errorMessage, + isError: true, + status: 'error', + errorMessage, + }; +} + +function throwIfCancelled(signal?: AbortSignal): void { + if (signal?.aborted) { + throw new AiError(AiErrorCode.REQUEST_CANCELLED); + } +} + +function buildComputerSemantic( + action: BuiltInToolConversationSemantic['action'], + fallbackTarget: string, + args: Record +): BuiltInToolConversationSemantic { + const reason = normalizeOptionalString(args.reason, { collapseWhitespace: true }); + return { + action, + target: truncateText(reason || fallbackTarget, 120), + }; +} + +export async function executeComputerSessionTool( + args: Record, + config: ComputerToolConfig, + context: BaseBuiltInToolExecutionContext +): Promise { + try { + throwIfCancelled(context.signal); + const request = buildComputerSessionRequest(args, config, context.callId); + const response = await native.builtInTools.startComputerSession(request); + throwIfCancelled(context.signal); + + return { + result: formatComputerSessionResult(response), + isError: response.status !== 'ready', + status: response.status === 'ready' ? 'success' : 'error', + errorMessage: + response.status === 'ready' ? 
null : `Computer session ${response.status}`, + }; + } catch (error) { + return errorResult(error); + } +} + +export async function executeComputerObserveTool( + args: Record, + config: ComputerToolConfig, + context: BaseBuiltInToolExecutionContext +): Promise { + try { + throwIfCancelled(context.signal); + const request = buildComputerObservationRequest(args, config); + const response = await native.builtInTools.observeComputer(request); + throwIfCancelled(context.signal); + + return { + result: formatComputerObservationResult(response), + isError: false, + status: 'success', + }; + } catch (error) { + return errorResult(error); + } +} + +export async function executeComputerActTool( + args: Record, + config: ComputerToolConfig, + context: BaseBuiltInToolExecutionContext +): Promise { + try { + if (context.hasExecutedBuiltInTool('computer_act')) { + throw new Error('computer_act can run only once per turn'); + } + + throwIfCancelled(context.signal); + const request = buildComputerActionRequest(args, config); + const response = await native.builtInTools.executeComputerAction(request); + throwIfCancelled(context.signal); + const receipt = computerActionReceipt(response); + + return { + result: formatComputerActionResult(response), + isError: receipt.status !== 'success', + status: receipt.status === 'success' ? 'success' : 'error', + errorMessage: receipt.status === 'success' ? 
null : `Computer action ${receipt.status}`, + }; + } catch (error) { + return errorResult(error); + } +} + +class ComputerSessionTool extends BuiltInTool { + readonly id = 'computer_session' as const; + readonly displayName = 'Computer Session'; + readonly description = COMPUTER_SESSION_TOOL_DESCRIPTION; + readonly inputSchema = COMPUTER_SESSION_TOOL_INPUT_SCHEMA; + readonly defaultConfig = DEFAULT_COMPUTER_TOOL_CONFIG; + + override parseConfig(configJson: string | null): ComputerToolConfig { + return parseComputerToolConfig(configJson); + } + + override buildConversationSemantic(args: Record) { + return buildComputerSemantic('process', 'computer session', args); + } + + override execute( + args: Record, + config: ComputerToolConfig, + context: BaseBuiltInToolExecutionContext + ) { + return executeComputerSessionTool(args, config, context); + } +} + +class ComputerObserveTool extends BuiltInTool { + readonly id = 'computer_observe' as const; + readonly displayName = 'Computer Observe'; + readonly description = COMPUTER_OBSERVE_TOOL_DESCRIPTION; + readonly inputSchema = COMPUTER_OBSERVE_TOOL_INPUT_SCHEMA; + readonly defaultConfig = DEFAULT_COMPUTER_TOOL_CONFIG; + + override parseConfig(configJson: string | null): ComputerToolConfig { + return parseComputerToolConfig(configJson); + } + + override buildConversationSemantic(args: Record) { + return buildComputerSemantic('process', 'computer observation', args); + } + + override execute( + args: Record, + config: ComputerToolConfig, + context: BaseBuiltInToolExecutionContext + ) { + return executeComputerObserveTool(args, config, context); + } +} + +class ComputerActTool extends BuiltInTool { + readonly id = 'computer_act' as const; + readonly displayName = 'Computer Act'; + readonly description = COMPUTER_ACT_TOOL_DESCRIPTION; + readonly inputSchema = COMPUTER_ACT_TOOL_INPUT_SCHEMA; + readonly defaultConfig = DEFAULT_COMPUTER_TOOL_CONFIG; + + override parseConfig(configJson: string | null): ComputerToolConfig { + 
return parseComputerToolConfig(configJson); + } + + override buildConversationSemantic(args: Record) { + return buildComputerSemantic('run', 'computer action', args); + } + + override execute( + args: Record, + config: ComputerToolConfig, + context: BaseBuiltInToolExecutionContext + ) { + return executeComputerActTool(args, config, context); + } +} + +export const computerSessionTool = new ComputerSessionTool(); +export const computerObserveTool = new ComputerObserveTool(); +export const computerActTool = new ComputerActTool(); + +export const builtInTools: BuiltInToolGroup = [ + computerSessionTool, + computerObserveTool, + computerActTool, +]; + +export { DEFAULT_COMPUTER_TOOL_CONFIG } from './constants'; +export { parseComputerToolConfig } from './helper'; +export type { ComputerToolConfig }; diff --git a/apps/desktop/src/services/BuiltInToolService/types.ts b/apps/desktop/src/services/BuiltInToolService/types.ts index 68f1572e..ea5b77c8 100644 --- a/apps/desktop/src/services/BuiltInToolService/types.ts +++ b/apps/desktop/src/services/BuiltInToolService/types.ts @@ -26,7 +26,10 @@ export type BuiltInToolId = | 'upgrade_model' | 'show_widget' | 'visualize_read_me' - | 'ask_user_question'; + | 'ask_user_question' + | 'computer_session' + | 'computer_observe' + | 'computer_act'; /** * 所有内置工具共享的最小运行时上下文。 diff --git a/apps/desktop/src/services/NativeService/builtInTools.ts b/apps/desktop/src/services/NativeService/builtInTools.ts index 28db1556..9fdb244e 100644 --- a/apps/desktop/src/services/NativeService/builtInTools.ts +++ b/apps/desktop/src/services/NativeService/builtInTools.ts @@ -1,6 +1,15 @@ import { invoke } from '@tauri-apps/api/core'; -import type { BuiltInBashExecutionRequest, BuiltInBashExecutionResponse } from './types'; +import type { + BuiltInBashExecutionRequest, + BuiltInBashExecutionResponse, + ComputerActionRequest, + ComputerActionResponse, + ComputerObservationRequest, + ComputerObservationResponse, + ComputerSessionRequest, + 
ComputerSessionResponse, +} from './types'; /** * 原生内置工具桥接层。 @@ -12,4 +21,13 @@ export const builtInTools = { cancelBash(executionId: string): Promise { return invoke('built_in_tools_cancel_bash', { executionId }); }, + startComputerSession(request: ComputerSessionRequest): Promise { + return invoke('built_in_tools_computer_session', { request }); + }, + observeComputer(request: ComputerObservationRequest): Promise { + return invoke('built_in_tools_computer_observe', { request }); + }, + executeComputerAction(request: ComputerActionRequest): Promise { + return invoke('built_in_tools_computer_act', { request }); + }, } as const; diff --git a/apps/desktop/src/services/NativeService/index.ts b/apps/desktop/src/services/NativeService/index.ts index b6b6f7fb..65f0044e 100644 --- a/apps/desktop/src/services/NativeService/index.ts +++ b/apps/desktop/src/services/NativeService/index.ts @@ -30,6 +30,35 @@ export type { BuiltInBashExecutionRequest, BuiltInBashExecutionResponse, ClipboardPayload, + ComputerActionOperation, + ComputerActionOptions, + ComputerActionReceipt, + ComputerActionRequest, + ComputerActionResponse, + ComputerActionResponseWithReceipt, + ComputerBounds, + ComputerCapability, + ComputerCapabilitySnapshot, + ComputerCoordinateTarget, + ComputerDisplaySnapshot, + ComputerElementSnapshot, + ComputerElementTarget, + ComputerExecutionMode, + ComputerLane, + ComputerObservationInclude, + ComputerObservationMode, + ComputerObservationRequest, + ComputerObservationResponse, + ComputerObservationTree, + ComputerResolvedTarget, + ComputerRoute, + ComputerRouteHint, + ComputerScreenshotSnapshot, + ComputerSessionRequest, + ComputerSessionResponse, + ComputerTarget, + ComputerWindowSnapshot, + ComputerWindowTarget, PopupConfig, QuickSearchFileItem, QuickSearchResult, diff --git a/apps/desktop/src/services/NativeService/types.ts b/apps/desktop/src/services/NativeService/types.ts index a3a52379..6fbc0dea 100644 --- a/apps/desktop/src/services/NativeService/types.ts 
+++ b/apps/desktop/src/services/NativeService/types.ts @@ -35,6 +35,240 @@ export interface BuiltInBashExecutionResponse { compressed?: boolean; } +export type ComputerCapability = + | 'native_tree' + | 'screenshot' + | 'background_actions' + | 'vision_fallback' + | 'browser_dom' + | 'external_provider'; + +export type ComputerObservationMode = 'tree' | 'screenshot' | 'tree_and_screenshot'; + +export type ComputerObservationInclude = 'displays' | 'windows' | 'tree' | 'screenshot'; + +export type ComputerExecutionMode = 'foreground' | 'background'; + +export type ComputerRouteHint = + | 'auto' + | 'win32.send_input' + | 'win32.message' + | 'screen.capture' + | 'unsupported'; + +export type ComputerActionOperation = + | 'click' + | 'double_click' + | 'right_click' + | 'move' + | 'drag' + | 'scroll' + | 'type_text' + | 'press_key' + | 'hotkey' + | 'wait'; + +export type ComputerRoute = ComputerRouteHint; + +export type ComputerLane = + | 'native_tree' + | 'vision_fallback' + | 'browser_dom' + | 'external_provider' + | 'unsupported'; + +export interface ComputerBounds { + x: number; + y: number; + width: number; + height: number; +} + +export interface ComputerWindowTarget { + id?: string; + title?: string; + processName?: string; +} + +export interface ComputerElementTarget { + id?: string; + role?: string; + name?: string; +} + +export interface ComputerCoordinateTarget { + x: number; + y: number; + width?: number; + height?: number; + displayId?: string; +} + +export interface ComputerTarget { + scope?: 'foreground' | 'screen' | 'window' | 'element' | 'region'; + window?: ComputerWindowTarget; + element?: ComputerElementTarget; + coordinates?: ComputerCoordinateTarget; + label?: string; + windowId?: string; + elementId?: string; + displayId?: string; + x?: number; + y?: number; + width?: number; + height?: number; +} + +export interface ComputerSessionRequest { + sessionId: string; + target: ComputerTarget; + capabilities: ComputerCapability[]; + providerHints: 
string[]; + reason: string; + timeoutMs: number; +} + +export interface ComputerCapabilitySnapshot { + platform: string; + lanes: ComputerLane[]; + routes: ComputerRoute[]; + background: { + supported: boolean; + routes: ComputerRoute[]; + limitations: string[]; + }; + grounding: { + tree: boolean; + screenshot: boolean; + clickPrediction: boolean; + externalProviders: string[]; + }; +} + +export interface ComputerSessionResponse { + sessionId: string; + status: 'ready' | 'unsupported' | 'error'; + capabilities: ComputerCapabilitySnapshot; + target: ComputerTarget; + warnings?: string[]; +} + +export interface ComputerObservationRequest { + sessionId: string; + mode: ComputerObservationMode; + target: ComputerTarget; + include: ComputerObservationInclude[]; + reason: string; + timeoutMs: number; +} + +export interface ComputerDisplaySnapshot { + id: string; + x: number; + y: number; + width: number; + height: number; + scaleFactor: number; + primary: boolean; +} + +export interface ComputerWindowSnapshot { + elementId: string; + title: string; + processName?: string | null; + bounds: ComputerBounds; + focused: boolean; + visible: boolean; + native: boolean; +} + +export interface ComputerElementSnapshot { + elementId: string; + role: string; + name: string; + bounds?: ComputerBounds | null; + states?: string[]; + value?: string | null; + children?: ComputerElementSnapshot[]; +} + +export interface ComputerObservationTree { + lane: ComputerLane; + elements: ComputerElementSnapshot[]; +} + +export interface ComputerScreenshotSnapshot { + format: 'png' | 'jpeg'; + width: number; + height: number; + dataBase64?: string | null; + path?: string | null; +} + +export interface ComputerObservationResponse { + observationId: string; + sessionId: string; + platform: string; + target: ComputerTarget; + displays: ComputerDisplaySnapshot[]; + windows: ComputerWindowSnapshot[]; + tree?: ComputerObservationTree | null; + screenshot?: ComputerScreenshotSnapshot | null; + warnings: 
string[]; +} + +export interface ComputerActionOptions { + allowBackground: boolean; + dryRun: boolean; + postActionObserve: boolean; +} + +export interface ComputerActionRequest { + sessionId: string; + operation: ComputerActionOperation; + target: ComputerTarget; + value: string | null; + executionMode: ComputerExecutionMode; + reason: string; + routeHint: ComputerRouteHint; + timeoutMs: number; + options: ComputerActionOptions; +} + +export interface ComputerResolvedTarget { + x?: number | null; + y?: number | null; + elementId?: string | null; + windowId?: string | null; + confidence: number; +} + +export interface ComputerActionReceipt { + route: ComputerRoute; + lane: ComputerLane; + backgroundSafe: boolean; + cursorMoved: boolean; + foregroundChanged: boolean; + targetResolved: ComputerResolvedTarget; + status: 'success' | 'unsupported' | 'blocked' | 'error'; + warnings: string[]; +} + +export interface ComputerActionResponseBase { + actionId: string; + sessionId: string; + operation: ComputerActionOperation; + postActionObservation?: ComputerObservationResponse | null; +} + +export interface ComputerActionResponseWithReceipt extends ComputerActionResponseBase { + receipt: ComputerActionReceipt; +} + +export type ComputerActionResponse = + | ComputerActionResponseWithReceipt + | (ComputerActionResponseBase & ComputerActionReceipt); + export interface ShowPopupWindowParams { x: number; y: number; diff --git a/apps/desktop/tests/services/AgentService/execution/tool-round-scheduling.test.ts b/apps/desktop/tests/services/AgentService/execution/tool-round-scheduling.test.ts new file mode 100644 index 00000000..0005d7a3 --- /dev/null +++ b/apps/desktop/tests/services/AgentService/execution/tool-round-scheduling.test.ts @@ -0,0 +1,289 @@ +import type { ModelWithProvider } from '@database/queries/models'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +import type { AiStreamChunk } from '@/services/AgentService/contracts/protocol'; +import { 
AiRequestExecutor } from '@/services/AgentService/execution/executor'; +import type { AiProvider } from '@/services/AgentService/infrastructure/providers'; +import type { PersistenceProjector } from '@/services/AgentService/outputs/persistence'; +import type { BuiltInToolId } from '@/services/BuiltInToolService'; +import { builtInToolService } from '@/services/BuiltInToolService'; + +const { provider, streamedChunks } = vi.hoisted(() => { + const streamedChunks: { value: AiStreamChunk[][]; requestIndex: number } = { + value: [], + requestIndex: 0, + }; + const provider: AiProvider = { + name: 'test-provider', + driver: 'openai', + async request() { + return { + content: '', + }; + }, + async *stream() { + const chunks = streamedChunks.value[streamedChunks.requestIndex] ?? []; + streamedChunks.requestIndex += 1; + for (const chunk of chunks) { + yield chunk; + } + }, + async testConnection() { + return true; + }, + async listModels() { + return []; + }, + getApiTargets() { + return { + normalizedBaseUrl: 'https://example.test', + sdkBaseUrl: 'https://example.test', + generationTarget: 'https://example.test', + discoveryTarget: 'https://example.test', + }; + }, + }; + + return { provider, streamedChunks }; +}); + +vi.mock('@/services/AgentService/catalog', () => ({ + createProviderForModel: vi.fn(() => provider), + getModel: vi.fn(), + resolveToolDefinitions: vi.fn(async () => []), +})); + +vi.mock('@/services/BuiltInToolService', () => ({ + builtInToolService: { + executeTool: vi.fn(), + }, +})); + +vi.mock('@/services/AgentService/infrastructure/mcp', () => ({ + mcpManager: { + resolveToolCall: vi.fn(async () => null), + }, +})); + +vi.mock('@database/queries', () => ({ + createMcpToolLog: vi.fn(), + updateMcpToolLogByCallId: vi.fn(), +})); + +const model = { + id: 1, + provider_id: 1, + model_id: 'test-model', + name: 'Test Model', + is_default: 0, + last_used_at: null, + attachment: 0, + modalities: null, + open_weights: 0, + reasoning: 0, + release_date: null, + 
temperature: 1, + tool_call: 1, + knowledge: null, + context_limit: null, + output_limit: null, + is_custom_metadata: 0, + created_at: '', + updated_at: '', + provider_name: 'Test Provider', + provider_driver: 'openai', + api_endpoint: 'https://example.test', + api_key: null, + provider_config_json: null, + provider_enabled: 1, + provider_logo: '', +} satisfies ModelWithProvider; + +function createPersister(): PersistenceProjector { + return { + getSessionId: vi.fn(() => 1), + persistToolCallMessage: vi.fn(async () => 10), + persistToolResultMessage: vi.fn(async () => 20), + persistCheckpoint: vi.fn(async () => undefined), + syncDeliveryManifestRequest: vi.fn(async () => undefined), + } as unknown as PersistenceProjector; +} + +function createStartCheckpoint() { + return { + activeModel: model, + messages: [], + response: '', + reasoning: '', + iteration: 0, + modelSwitchCount: 0, + modelLanguageContext: { + locale: 'zh-CN' as const, + label: 'Simplified Chinese (zh-CN)', + }, + executedBuiltInToolIds: [], + }; +} + +describe('AiRequestExecutor tool round scheduling', () => { + beforeEach(() => { + vi.clearAllMocks(); + streamedChunks.value = []; + streamedChunks.requestIndex = 0; + }); + + it('runs computer_act calls sequentially so later calls observe the first successful execution', async () => { + streamedChunks.value = [ + [ + { + content: '', + done: true, + finishReason: 'tool_calls', + toolCalls: [ + { id: 'call-1', name: 'builtin__computer_act', arguments: '{}' }, + { id: 'call-2', name: 'builtin__computer_act', arguments: '{}' }, + ], + }, + ], + [ + { + content: 'done', + done: true, + finishReason: 'stop', + }, + ], + ]; + + const hasExecutedSnapshots: boolean[] = []; + vi.mocked(builtInToolService.executeTool).mockImplementation(async (options) => { + hasExecutedSnapshots.push(options.hasExecutedBuiltInTool('computer_act')); + return { + toolCall: options.toolCall, + result: 'ok', + isError: false, + toolLogId: null, + toolLogKind: 'builtin', + 
builtInToolId: 'computer_act' as BuiltInToolId, + }; + }); + + const result = await new AiRequestExecutor().runAttempt({ + startCheckpoint: createStartCheckpoint(), + persister: createPersister(), + }); + + expect(result.type).toBe('completed'); + expect(builtInToolService.executeTool).toHaveBeenCalledTimes(2); + expect(hasExecutedSnapshots).toEqual([false, true]); + }); + + it('allows one successful computer_act in each model tool round', async () => { + streamedChunks.value = [ + [ + { + content: '', + done: true, + finishReason: 'tool_calls', + toolCalls: [{ id: 'call-1', name: 'builtin__computer_act', arguments: '{}' }], + }, + ], + [ + { + content: '', + done: true, + finishReason: 'tool_calls', + toolCalls: [{ id: 'call-2', name: 'builtin__computer_act', arguments: '{}' }], + }, + ], + [ + { + content: 'done', + done: true, + finishReason: 'stop', + }, + ], + ]; + + const hasExecutedSnapshots: boolean[] = []; + vi.mocked(builtInToolService.executeTool).mockImplementation(async (options) => { + hasExecutedSnapshots.push(options.hasExecutedBuiltInTool('computer_act')); + return { + toolCall: options.toolCall, + result: 'ok', + isError: false, + toolLogId: null, + toolLogKind: 'builtin', + builtInToolId: 'computer_act' as BuiltInToolId, + }; + }); + + const result = await new AiRequestExecutor().runAttempt({ + startCheckpoint: createStartCheckpoint(), + persister: createPersister(), + }); + + expect(result.type).toBe('completed'); + expect(builtInToolService.executeTool).toHaveBeenCalledTimes(2); + expect(hasExecutedSnapshots).toEqual([false, false]); + }); + + it('keeps non-computer_act tool calls parallel', async () => { + streamedChunks.value = [ + [ + { + content: '', + done: true, + finishReason: 'tool_calls', + toolCalls: [ + { id: 'call-1', name: 'builtin__setting', arguments: '{}' }, + { id: 'call-2', name: 'builtin__ask_user_question', arguments: '{}' }, + ], + }, + ], + [ + { + content: 'done', + done: true, + finishReason: 'stop', + }, + ], + ]; + + 
let firstToolResolved = false; + let secondToolStartedBeforeFirstResolved = false; + let resolveFirstTool: (() => void) | null = null; + + vi.mocked(builtInToolService.executeTool).mockImplementation(async (options) => { + if (options.toolCall.id === 'call-1') { + await new Promise((resolve) => { + resolveFirstTool = resolve; + }); + firstToolResolved = true; + } else { + secondToolStartedBeforeFirstResolved = !firstToolResolved; + resolveFirstTool?.(); + } + + return { + toolCall: options.toolCall, + result: 'ok', + isError: false, + toolLogId: null, + toolLogKind: 'builtin', + builtInToolId: + options.toolCall.name === 'builtin__setting' + ? ('setting' as BuiltInToolId) + : ('ask_user_question' as BuiltInToolId), + }; + }); + + const result = await new AiRequestExecutor().runAttempt({ + startCheckpoint: createStartCheckpoint(), + persister: createPersister(), + }); + + expect(result.type).toBe('completed'); + expect(builtInToolService.executeTool).toHaveBeenCalledTimes(2); + expect(secondToolStartedBeforeFirstResolved).toBe(true); + }); +}); diff --git a/apps/desktop/tests/services/BuiltInToolService/tools/computer/index.test.ts b/apps/desktop/tests/services/BuiltInToolService/tools/computer/index.test.ts new file mode 100644 index 00000000..36acfb30 --- /dev/null +++ b/apps/desktop/tests/services/BuiltInToolService/tools/computer/index.test.ts @@ -0,0 +1,367 @@ +import { getLastTauriInvokeCall, getTauriInvokeCalls, mockTauriCommand } from '@tests/utils/tauri'; +import { beforeEach, describe, expect, it } from 'vitest'; + +import { + DEFAULT_COMPUTER_TOOL_CONFIG, + executeComputerActTool, + executeComputerObserveTool, + executeComputerSessionTool, + parseComputerToolConfig, +} from '@/services/BuiltInToolService/tools/computer'; +import type { BaseBuiltInToolExecutionContext } from '@/services/BuiltInToolService/types'; +import type { + ComputerActionResponse, + ComputerObservationResponse, + ComputerSessionResponse, +} from '@/services/NativeService'; + +const 
SESSION_RESPONSE: ComputerSessionResponse = { + sessionId: 'session-call-1', + status: 'ready', + capabilities: { + platform: 'windows', + lanes: ['native_tree', 'vision_fallback'], + routes: ['win32.send_input', 'win32.message', 'screen.capture'], + background: { + supported: true, + routes: ['win32.message'], + limitations: ['Only native window targets can be background-safe.'], + }, + grounding: { + tree: true, + screenshot: true, + clickPrediction: false, + externalProviders: ['cua', 'omniparser', 'ui_tars'], + }, + }, + target: { + scope: 'foreground', + label: 'Focused window', + }, +}; + +const OBSERVE_RESPONSE: ComputerObservationResponse = { + observationId: 'obs-1', + sessionId: 'session-call-1', + platform: 'windows', + target: { + scope: 'foreground', + label: 'Calculator', + }, + displays: [ + { + id: 'display-1', + x: 0, + y: 0, + width: 1920, + height: 1080, + scaleFactor: 1, + primary: true, + }, + ], + windows: [ + { + elementId: 'window:100', + title: 'Calculator', + processName: 'Calculator.exe', + bounds: { x: 100, y: 100, width: 400, height: 320 }, + focused: true, + visible: true, + native: true, + }, + ], + tree: { + lane: 'native_tree', + elements: [ + { + elementId: 'window:100', + role: 'window', + name: 'Calculator', + bounds: { x: 100, y: 100, width: 400, height: 320 }, + states: ['focused'], + }, + ], + }, + screenshot: { + format: 'png', + width: 1920, + height: 1080, + dataBase64: 'abc', + }, + warnings: [], +}; + +const ACTION_RESPONSE: ComputerActionResponse = { + actionId: 'act-1', + sessionId: 'session-call-1', + operation: 'click', + route: 'win32.send_input', + lane: 'native_tree', + backgroundSafe: false, + cursorMoved: true, + foregroundChanged: true, + targetResolved: { + x: 120, + y: 130, + elementId: 'window:100', + confidence: 1, + }, + status: 'success', + warnings: [], +}; + +function fakeContext(overrides: Partial = {}) { + return { + callId: 'call-1', + signal: undefined, + iteration: 0, + hasExecutedBuiltInTool: () => 
false, + ...overrides, + } satisfies BaseBuiltInToolExecutionContext; +} + +describe('computer built-in tools', () => { + beforeEach(() => { + mockTauriCommand('built_in_tools_computer_session', SESSION_RESPONSE); + mockTauriCommand('built_in_tools_computer_observe', OBSERVE_RESPONSE); + mockTauriCommand('built_in_tools_computer_act', ACTION_RESPONSE); + }); + + it('starts a native-first computer session with capability preferences', async () => { + const result = await executeComputerSessionTool( + { + target: { scope: 'foreground' }, + capabilities: ['native_tree', 'screenshot', 'background_actions'], + reason: 'operate the focused app', + }, + DEFAULT_COMPUTER_TOOL_CONFIG, + fakeContext() + ); + + expect(result.isError).toBe(false); + expect(result.result).toContain('Computer session ready'); + expect(result.result).toContain('native_tree'); + expect(getLastTauriInvokeCall('built_in_tools_computer_session')?.payload).toEqual({ + request: { + sessionId: 'session-call-1', + target: { scope: 'foreground' }, + capabilities: ['native_tree', 'screenshot', 'background_actions'], + providerHints: ['native_windows', 'external_adapter'], + reason: 'operate the focused app', + timeoutMs: 8000, + }, + }); + }); + + it('observes via tree-first mode while preserving screenshot fallback', async () => { + const result = await executeComputerObserveTool( + { + sessionId: 'session-call-1', + mode: 'tree_and_screenshot', + target: { scope: 'foreground' }, + include: ['windows', 'tree', 'screenshot'], + reason: 'ground the next click', + }, + DEFAULT_COMPUTER_TOOL_CONFIG, + fakeContext() + ); + + expect(result.isError).toBe(false); + expect(result.result).toContain('Observation obs-1'); + expect(result.result).toContain('Calculator'); + expect(getLastTauriInvokeCall('built_in_tools_computer_observe')?.payload).toEqual({ + request: { + sessionId: 'session-call-1', + mode: 'tree_and_screenshot', + target: { scope: 'foreground' }, + include: ['windows', 'tree', 'screenshot'], + 
reason: 'ground the next click', + timeoutMs: 8000, + }, + }); + }); + + it('defaults observation to native tree data without screenshot payloads', async () => { + const result = await executeComputerObserveTool( + { + sessionId: 'session-call-1', + reason: 'ground the next click', + }, + DEFAULT_COMPUTER_TOOL_CONFIG, + fakeContext() + ); + + expect(result.isError).toBe(false); + expect(getLastTauriInvokeCall('built_in_tools_computer_observe')?.payload).toEqual({ + request: { + sessionId: 'session-call-1', + mode: 'tree', + target: { scope: 'foreground' }, + include: ['displays', 'windows', 'tree'], + reason: 'ground the next click', + timeoutMs: 8000, + }, + }); + }); + + it('routes native element clicks through foreground SendInput by default', async () => { + const result = await executeComputerActTool( + { + sessionId: 'session-call-1', + operation: 'click', + target: { elementId: 'window:100' }, + reason: 'click calculator', + }, + DEFAULT_COMPUTER_TOOL_CONFIG, + fakeContext() + ); + + expect(result.isError).toBe(false); + expect(result.result).toContain('"route": "win32.send_input"'); + expect(getLastTauriInvokeCall('built_in_tools_computer_act')?.payload).toEqual({ + request: { + sessionId: 'session-call-1', + operation: 'click', + target: { elementId: 'window:100' }, + value: null, + executionMode: 'foreground', + reason: 'click calculator', + routeHint: 'auto', + timeoutMs: 8000, + options: { + allowBackground: false, + dryRun: false, + postActionObserve: false, + }, + }, + }); + }); + + it('allows background mode only when the target is a native element', async () => { + await executeComputerActTool( + { + sessionId: 'session-call-1', + operation: 'click', + target: { elementId: 'window:100' }, + executionMode: 'background', + reason: 'click without foreground activation', + }, + DEFAULT_COMPUTER_TOOL_CONFIG, + fakeContext() + ); + + expect(getLastTauriInvokeCall('built_in_tools_computer_act')?.payload).toEqual({ + request: expect.objectContaining({ + 
executionMode: 'background', + routeHint: 'auto', + options: expect.objectContaining({ + allowBackground: true, + }), + }), + }); + }); + + it('rejects background mode when the target only names a native scope', async () => { + const result = await executeComputerActTool( + { + sessionId: 'session-call-1', + operation: 'click', + target: { scope: 'element' }, + executionMode: 'background', + reason: 'scope-only background click', + }, + DEFAULT_COMPUTER_TOOL_CONFIG, + fakeContext() + ); + + expect(result.isError).toBe(true); + expect(result.result).toContain( + 'background execution requires a native elementId or windowId target' + ); + expect(getTauriInvokeCalls('built_in_tools_computer_act')).toHaveLength(0); + }); + + it('rejects partial coordinate targets before native execution', async () => { + const result = await executeComputerActTool( + { + sessionId: 'session-call-1', + operation: 'click', + target: { x: 100 }, + reason: 'bad partial coordinate', + }, + DEFAULT_COMPUTER_TOOL_CONFIG, + fakeContext() + ); + + expect(result.isError).toBe(true); + expect(result.result).toContain('target.x and target.y must be provided together'); + expect(getTauriInvokeCalls('built_in_tools_computer_act')).toHaveLength(0); + }); + + it('requires text for type_text and key value for keyboard operations', async () => { + const typeResult = await executeComputerActTool( + { + sessionId: 'session-call-1', + operation: 'type_text', + target: { elementId: 'window:100' }, + reason: 'missing text', + }, + DEFAULT_COMPUTER_TOOL_CONFIG, + fakeContext() + ); + const keyResult = await executeComputerActTool( + { + sessionId: 'session-call-1', + operation: 'hotkey', + target: { elementId: 'window:100' }, + reason: 'missing keys', + }, + DEFAULT_COMPUTER_TOOL_CONFIG, + fakeContext() + ); + + expect(typeResult.isError).toBe(true); + expect(typeResult.result).toContain('value is required for type_text'); + expect(keyResult.isError).toBe(true); + expect(keyResult.result).toContain('value is 
required for hotkey'); + expect(getTauriInvokeCalls('built_in_tools_computer_act')).toHaveLength(0); + }); + + it('rejects the second action in the same turn before native execution', async () => { + const result = await executeComputerActTool( + { + sessionId: 'session-call-1', + operation: 'click', + target: { elementId: 'window:100' }, + reason: 'second click', + }, + DEFAULT_COMPUTER_TOOL_CONFIG, + fakeContext({ + hasExecutedBuiltInTool: (toolId) => toolId === 'computer_act', + }) + ); + + expect(result.isError).toBe(true); + expect(result.result).toContain('computer_act can run only once per turn'); + expect(getTauriInvokeCalls('built_in_tools_computer_act')).toHaveLength(0); + }); + + it('keeps configurable defaults conservative and parseable', () => { + expect( + parseComputerToolConfig( + JSON.stringify({ + timeoutMs: 30000, + defaultExecutionMode: 'background', + providerHints: ['native_windows', 'cua'], + enableVisionFallback: true, + }) + ) + ).toEqual({ + ...DEFAULT_COMPUTER_TOOL_CONFIG, + timeoutMs: 30000, + defaultExecutionMode: 'background', + providerHints: ['native_windows', 'cua'], + enableVisionFallback: true, + }); + }); +}); diff --git a/apps/desktop/tests/services/native-service.test.ts b/apps/desktop/tests/services/native-service.test.ts index 686c7a29..d49b2b4a 100644 --- a/apps/desktop/tests/services/native-service.test.ts +++ b/apps/desktop/tests/services/native-service.test.ts @@ -6,6 +6,12 @@ import { builtInTools, clipboard, type ClipboardPayload, + type ComputerActionRequest, + type ComputerActionResponse, + type ComputerObservationRequest, + type ComputerObservationResponse, + type ComputerSessionRequest, + type ComputerSessionResponse, database, log, mcp, @@ -867,4 +873,137 @@ describe('NativeService supporting boundaries', () => { ) ).resolves.toBe(true); }); + + it('bridges native computer-use session, observation, and action commands', async () => { + const sessionRequest: ComputerSessionRequest = { + sessionId: 
'computer-session-1', + target: { + scope: 'window', + window: { id: 'window-1', title: 'Calculator' }, + }, + capabilities: ['native_tree', 'screenshot', 'background_actions'], + providerHints: ['native_windows'], + reason: 'operate calculator', + timeoutMs: 8000, + }; + const sessionResponse: ComputerSessionResponse = { + sessionId: 'computer-session-1', + status: 'ready', + capabilities: { + platform: 'windows', + lanes: ['native_tree'], + routes: ['win32.send_input', 'win32.message'], + background: { + supported: true, + routes: ['win32.message'], + limitations: [], + }, + grounding: { + tree: true, + screenshot: true, + clickPrediction: false, + externalProviders: [], + }, + }, + target: sessionRequest.target, + warnings: [], + }; + const observationRequest: ComputerObservationRequest = { + sessionId: 'computer-session-1', + mode: 'tree_and_screenshot', + target: { + scope: 'element', + element: { id: 'button-equals', role: 'button', name: 'Equals' }, + }, + include: ['windows', 'tree', 'screenshot'], + reason: 'ground the next click', + timeoutMs: 8000, + }; + const observationResponse: ComputerObservationResponse = { + observationId: 'observation-1', + sessionId: 'computer-session-1', + platform: 'windows', + target: observationRequest.target, + displays: [], + windows: [], + tree: { + lane: 'native_tree', + elements: [], + }, + screenshot: { + format: 'png', + width: 1200, + height: 800, + dataBase64: 'abc', + }, + warnings: [], + }; + const actionRequest: ComputerActionRequest = { + sessionId: 'computer-session-1', + operation: 'click', + target: { + scope: 'region', + coordinates: { x: 200, y: 240, width: 80, height: 32, displayId: 'display-1' }, + }, + value: null, + executionMode: 'foreground', + reason: 'click equals', + routeHint: 'auto', + timeoutMs: 8000, + options: { + allowBackground: false, + dryRun: false, + postActionObserve: true, + }, + }; + const actionResponse: ComputerActionResponse = { + actionId: 'action-1', + sessionId: 
'computer-session-1', + operation: 'click', + receipt: { + route: 'win32.send_input', + lane: 'native_tree', + backgroundSafe: false, + cursorMoved: true, + foregroundChanged: true, + targetResolved: { + x: 200, + y: 240, + elementId: null, + windowId: 'window-1', + confidence: 1, + }, + status: 'success', + warnings: [], + }, + }; + + mockTauriCommand('built_in_tools_computer_session', sessionResponse); + mockTauriCommand('built_in_tools_computer_observe', observationResponse); + mockTauriCommand('built_in_tools_computer_act', actionResponse); + + await expect( + callAndExpectInvoke( + () => native.builtInTools.startComputerSession(sessionRequest), + 'built_in_tools_computer_session', + { request: sessionRequest } + ) + ).resolves.toEqual(sessionResponse); + + await expect( + callAndExpectInvoke( + () => native.builtInTools.observeComputer(observationRequest), + 'built_in_tools_computer_observe', + { request: observationRequest } + ) + ).resolves.toEqual(observationResponse); + + await expect( + callAndExpectInvoke( + () => native.builtInTools.executeComputerAction(actionRequest), + 'built_in_tools_computer_act', + { request: actionRequest } + ) + ).resolves.toEqual(actionResponse); + }); }); diff --git a/docs/computer-use.md b/docs/computer-use.md new file mode 100644 index 00000000..083fc6df --- /dev/null +++ b/docs/computer-use.md @@ -0,0 +1,137 @@ +# TouchAI Computer Use Architecture + +This document records the implementation boundary for issue #111. + +## Goals + +TouchAI computer use is a native desktop control layer, not a bundled research agent stack. +The core contract should stay stable while the execution backend can evolve from lightweight +Windows primitives to richer external providers. 
+ +The first complete version should provide: + +- a stable model-facing tool surface for session setup, observation, and action execution +- tree-first observation on native Windows surfaces, with screenshot fallback metadata +- auditable action receipts for every desktop side effect +- explicit background-safety semantics +- cross-platform capability reporting instead of silent failure +- adapter seams for Cua, OmniParser, UI-TARS, browser automation, or other future providers + +## Product Research Summary + +The reviewed projects converge on the same shape: observe, plan one bounded action, execute +through a controlled gateway, then observe again. + +- [trycua/cua](https://github.com/trycua/cua) is valuable as a mature sandbox/driver stack, + but bundling it would add Python, VM/sandbox management, and image/runtime weight that do + not fit TouchAI's desktop package by default. +- [CursorTouch/Windows-MCP](https://github.com/CursorTouch/Windows-MCP) and + [CursorTouch/Windows-Use](https://github.com/CursorTouch/Windows-Use) are the closest + design references for Windows: use UI Automation or accessibility trees first, keep vision + optional, and expose schema-shaped actions rather than raw input injection. +- [browser-use](https://github.com/browser-use/browser-use) is mature for browser workflows, + but it should remain a browser adapter, not the desktop automation core. +- [Microsoft OmniParser](https://github.com/microsoft/OmniParser) and UI-TARS style agents + improve screenshot grounding, but they are too heavy to embed silently and do not replace + native execution policy. +- [Anthropic computer use](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use) + and [OpenAI computer use](https://platform.openai.com/docs/guides/tools-computer-use) + define useful provider loops, but both expect the application to own the execution + environment and safety gateway. + +## Architecture + +TouchAI owns the gateway. 
Model providers and external CUA stacks may propose actions, but +TouchAI resolves, validates, executes, and records them. + +```mermaid +flowchart LR + Model["Model or CUA adapter"] --> Tools["Built-in computer tools"] + Tools --> Policy["Computer policy and routing"] + Policy --> Native["Native desktop runtime"] + Native --> Receipt["Action receipt"] + Receipt --> History["Tool log and conversation history"] + + Policy -. optional .-> External["External grounding providers"] + External -. candidates .-> Policy +``` + +### Layer 1: Model-Facing Tools + +The model sees three built-in tools: + +- `builtin__computer_session`: declare target scope and requested capabilities. +- `builtin__computer_observe`: return a tree-first desktop observation with window, display, + element, and optional screenshot metadata. +- `builtin__computer_act`: execute one bounded action and return an auditable receipt. + +The tool names and input schemas are provider-neutral so OpenAI computer-use models, +Anthropic computer-use models, ordinary vision-language models, and future CUA adapters can +share the same gateway. + +### Layer 2: Policy and Routing + +Policy normalizes target references, classifies each into a lane, and selects a route. Lanes are: + +- `native_tree`: native element/window references from Windows UI surfaces. +- `vision_fallback`: coordinate targets from screenshots or external grounding. +- `browser_dom`: reserved for browser adapters. +- `external_provider`: reserved for Cua, OmniParser, UI-TARS, or similar systems. + +Routes are explicit: + +- `win32.send_input`: foreground pointer and keyboard execution. +- `win32.message`: background-safe native window messaging when a native target exists. +- `screen.capture`: observation-only screenshot path. +- `unsupported`: capability is unavailable on the current platform or target. + +Every action receipt includes `route`, `lane`, `backgroundSafe`, `cursorMoved`, +`foregroundChanged`, `targetResolved`, `status`, and warnings.
+ +### Layer 3: Native Runtime + +The native runtime is intentionally lightweight: + +- enumerate display and focused-window metadata +- expose capability snapshots per platform +- serialize mutating desktop actions +- reject unsafe background coordinate actions +- return structured unsupported errors on platforms without an implementation + +Windows is the primary runtime. macOS and Linux must report capabilities clearly before richer +adapters are implemented. + +### Layer 4: External Providers + +External providers are optional adapters, not bundled dependencies. + +Future adapters may provide: + +- screenshot-to-element candidates +- click prediction for ordinary VLMs +- browser DOM observations +- VM or sandbox execution through Cua-like drivers + +Adapters must return normalized observations or target candidates. They must not bypass +TouchAI action policy, logs, or receipts. + +## Safety Defaults + +- Computer tools are seeded disabled by default. +- `computer_act` is limited to one successful action per model tool round. +- Background execution is allowed only for native element or window targets on a background-safe route. +- Coordinate-only targets are foreground-only because they cannot be background-safe. +- Text and keyboard operations require an explicit `value`. +- Unsupported platforms and routes return structured errors instead of best-effort input. + +## Non-Goals + +This implementation does not bundle: + +- Cua sandbox or VM images +- Python automation runtimes +- local OmniParser or UI-TARS model weights +- a provider/model selector UI +- autonomous long-running desktop loops + +Those capabilities remain adapter-level extensions over the same tool contract.
From cae618efae0ef3dfd1baf19dba1df2f5afba23dd Mon Sep 17 00:00:00 2001 From: xlgzsgf <51521689+hiqiancheng@users.noreply.github.com> Date: Wed, 3 Jun 2026 11:08:50 +0800 Subject: [PATCH 2/3] test(agent-service): align computer tests with platform support Refs #111 --- .../src/core/built_in_tools/computer.rs | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/apps/desktop/src-tauri/src/core/built_in_tools/computer.rs b/apps/desktop/src-tauri/src/core/built_in_tools/computer.rs index 3cc6d4ae..1df68ff1 100644 --- a/apps/desktop/src-tauri/src/core/built_in_tools/computer.rs +++ b/apps/desktop/src-tauri/src/core/built_in_tools/computer.rs @@ -1419,6 +1419,7 @@ mod tests { ); } + #[cfg(target_os = "windows")] #[test] fn dry_run_coordinate_click_returns_send_input_receipt() { let runtime = ComputerUseRuntime::new(); @@ -1453,6 +1454,43 @@ mod tests { assert_eq!(response.target_resolved.x, Some(10)); } + #[cfg(not(target_os = "windows"))] + #[test] + fn auto_route_returns_unsupported_on_non_windows() { + let runtime = ComputerUseRuntime::new(); + runtime.start_session(session_request()).unwrap(); + + let response = runtime + .act(ComputerActionRequest { + session_id: "session-1".to_string(), + operation: ComputerActionOperation::Click, + target: ComputerTarget { + scope: Some("screen".to_string()), + x: Some(10), + y: Some(20), + ..Default::default() + }, + value: None, + execution_mode: ComputerExecutionMode::Foreground, + reason: "test unsupported platform".to_string(), + route_hint: ComputerRoute::Auto, + timeout_ms: 8000, + options: ComputerActionOptions { + allow_background: false, + dry_run: true, + post_action_observe: false, + }, + }) + .unwrap(); + + assert_eq!(response.route, ComputerRoute::Win32SendInput); + assert_eq!(response.status, ComputerActionStatus::Unsupported); + assert_eq!( + response.warnings, + vec!["route 'win32.send_input' is not available for this session"] + ); + } + #[test] fn screen_capture_route_cannot_execute_actions() { 
let runtime = ComputerUseRuntime::new(); @@ -1522,6 +1560,7 @@ mod tests { ); } + #[cfg(target_os = "windows")] #[test] fn background_coordinate_actions_are_rejected() { let runtime = ComputerUseRuntime::new(); @@ -1557,6 +1596,7 @@ mod tests { ); } + #[cfg(target_os = "windows")] #[test] fn background_window_actions_require_observed_native_targets() { let runtime = ComputerUseRuntime::new(); @@ -1591,6 +1631,7 @@ mod tests { ); } + #[cfg(target_os = "windows")] #[test] fn background_window_actions_do_not_trust_session_target_native_ids() { let runtime = ComputerUseRuntime::new(); @@ -1627,6 +1668,7 @@ mod tests { ); } + #[cfg(target_os = "windows")] #[test] fn background_window_actions_reject_stale_observed_native_targets() { let runtime = ComputerUseRuntime::new(); @@ -1678,6 +1720,7 @@ mod tests { assert_eq!(lparam.0 as usize, 0x0022_000c); } + #[cfg(target_os = "windows")] #[test] fn post_action_observe_returns_follow_up_observation_for_successful_actions() { let runtime = ComputerUseRuntime::new(); From a2c35239bac50be96df058ff0056c0b9a96ff677 Mon Sep 17 00:00:00 2001 From: xlgzsgf <51521689+hiqiancheng@users.noreply.github.com> Date: Wed, 3 Jun 2026 11:37:44 +0800 Subject: [PATCH 3/3] test(agent-service): align computer command tests with platform support Refs #111 --- .../src-tauri/tests/computer_commands.rs | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/apps/desktop/src-tauri/tests/computer_commands.rs b/apps/desktop/src-tauri/tests/computer_commands.rs index 0b89288c..9ac59db0 100644 --- a/apps/desktop/src-tauri/tests/computer_commands.rs +++ b/apps/desktop/src-tauri/tests/computer_commands.rs @@ -159,7 +159,15 @@ fn computer_act_dry_run_returns_stable_receipt() { assert_eq!(response["foregroundChanged"], json!(false)); assert_eq!(response["targetResolved"]["x"], json!(120)); assert_eq!(response["targetResolved"]["y"], json!(130)); - assert_eq!(response["status"], json!("success")); + if cfg!(target_os = "windows") { 
+ assert_eq!(response["status"], json!("success")); + } else { + assert_eq!(response["status"], json!("unsupported")); + assert_eq!( + response["warnings"], + json!(["route 'win32.send_input' is not available for this session"]) + ); + } } #[test] @@ -235,9 +243,17 @@ fn computer_act_rejects_invalid_route_and_background_coordinate_actions() { }), ); assert_eq!(background_coordinate["route"], json!("win32.message")); - assert_eq!(background_coordinate["status"], json!("blocked")); - assert_eq!( - background_coordinate["warnings"], - json!(["coordinate targets cannot be executed in background mode"]) - ); + if cfg!(target_os = "windows") { + assert_eq!(background_coordinate["status"], json!("blocked")); + assert_eq!( + background_coordinate["warnings"], + json!(["coordinate targets cannot be executed in background mode"]) + ); + } else { + assert_eq!(background_coordinate["status"], json!("unsupported")); + assert_eq!( + background_coordinate["warnings"], + json!(["route 'win32.message' is not available for this session"]) + ); + } }