From f5685ffe5aea34b258a99ad3a7e29c8b0c49188e Mon Sep 17 00:00:00 2001 From: Andrew Beniston Date: Wed, 6 May 2026 21:04:17 +0100 Subject: [PATCH 1/3] Add music duck and keep-awake behaviour during dictation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two new things you can turn on in Settings. Music During Transcription is now a picker (Leave Playing / Pause / Lower Volume) instead of a boolean. Lower Volume snapshots the system output volume, fades it down to 10% over 200ms, and fades it back up the same way when you stop. The fade ramp runs on a detached Task so the main actor isn't blocked between steps; a new fade cancels any in-flight one, which matters when the user releases the hotkey before the duck-down has finished and the fade-up has to take over cleanly. If the user's volume is already at or below the duck target, the duck is skipped entirely. Keep Mac Awake While Dictating holds an IOPMAssertion of type kIOPMAssertionTypePreventUserIdleDisplaySleep for the duration of a recording so the screen doesn't dim or lock mid-session. Released as soon as recording stops, including in the start() failure path. The volume restore fires the moment the audio engine is fully torn down, not at the end of stop() after final transcription completes. That way the volume snaps back when the user releases the hotkey rather than 1-3s later when the model finishes. Two new files: SystemVolumeController wraps the CoreAudio volume reads and writes (AudioObjectGet/SetPropertyData on the default output device's master volume scalar, with per-channel fallback for devices that don't expose master), SleepPreventionService wraps the IOPMAssertion. The legacy pauseMediaDuringTranscription Bool migrates to a new MediaBehaviorDuringTranscription enum on first read, and remains as a compatibility shim on the SettingsStore so the BackupService payload stays format-compatible — old backups restore as .pause or .none. 
The duck setting itself doesn't survive a backup/restore round trip, which seems fine since it's a UX preference rather than load-bearing config. --- Sources/Fluid/Persistence/SettingsStore.swift | 71 +++++++++- Sources/Fluid/Services/ASRService.swift | 96 ++++++------- .../Fluid/Services/MediaPlaybackService.swift | 128 +++++++++++++++++- .../Services/SleepPreventionService.swift | 68 ++++++++++ .../Services/SystemVolumeController.swift | 119 ++++++++++++++++ Sources/Fluid/UI/SettingsView.swift | 32 ++++- 6 files changed, 456 insertions(+), 58 deletions(-) create mode 100644 Sources/Fluid/Services/SleepPreventionService.swift create mode 100644 Sources/Fluid/Services/SystemVolumeController.swift diff --git a/Sources/Fluid/Persistence/SettingsStore.swift b/Sources/Fluid/Persistence/SettingsStore.swift index 35410389..de0d1310 100644 --- a/Sources/Fluid/Persistence/SettingsStore.swift +++ b/Sources/Fluid/Persistence/SettingsStore.swift @@ -2899,13 +2899,74 @@ final class SettingsStore: ObservableObject { // MARK: - Media Playback Control - /// When enabled, automatically pauses system media playback when transcription starts. - /// Only resumes if FluidVoice was the one that paused it. + /// What FluidVoice does to system media playback when transcription starts. + enum MediaBehaviorDuringTranscription: String, Codable, CaseIterable, Identifiable { + /// Leave media alone. + case none + /// Pause currently playing media; resume on stop if FluidVoice paused it. + case pause + /// Drop the system output volume to a low value during transcription + /// and restore it on stop. Music keeps playing, just quietly. + case duck + + var id: String { self.rawValue } + + var displayName: String { + switch self { + case .none: return "Leave Playing" + case .pause: return "Pause" + case .duck: return "Lower Volume" + } + } + } + + /// What to do with system media playback while transcribing. 
+ /// New unified setting; reads migrate cleanly from the legacy + /// `pauseMediaDuringTranscription` boolean if present. + var mediaBehaviorDuringTranscription: MediaBehaviorDuringTranscription { + get { + if let raw = self.defaults.string(forKey: Keys.mediaBehaviorDuringTranscription), + let mode = MediaBehaviorDuringTranscription(rawValue: raw) { + return mode + } + // Migrate from legacy bool key on first read. + if self.defaults.object(forKey: Keys.pauseMediaDuringTranscription) != nil { + return self.defaults.bool(forKey: Keys.pauseMediaDuringTranscription) ? .pause : .none + } + return .none + } + set { + objectWillChange.send() + self.defaults.set(newValue.rawValue, forKey: Keys.mediaBehaviorDuringTranscription) + // Keep the legacy bool in sync so backup/restore round-trips don't + // surprise users who roll back to an older build. + self.defaults.set(newValue == .pause, forKey: Keys.pauseMediaDuringTranscription) + } + } + + /// Legacy boolean view of `mediaBehaviorDuringTranscription`. Kept so + /// `BackupService`'s payload (which exports a `Bool`) stays compatible. + /// Setting `true` selects `.pause`; setting `false` selects `.none` only + /// if the current mode was `.pause` — `.duck` is preserved. var pauseMediaDuringTranscription: Bool { - get { self.defaults.object(forKey: Keys.pauseMediaDuringTranscription) as? Bool ?? false } + get { self.mediaBehaviorDuringTranscription == .pause } + set { + if newValue { + self.mediaBehaviorDuringTranscription = .pause + } else if self.mediaBehaviorDuringTranscription == .pause { + self.mediaBehaviorDuringTranscription = .none + } + } + } + + /// When enabled, FluidVoice creates an `IOPMAssertion` while a recording + /// is active so the display doesn't sleep and the screen doesn't lock + /// mid-dictation. Released as soon as recording stops. + var preventSleepDuringTranscription: Bool { + get { self.defaults.object(forKey: Keys.preventSleepDuringTranscription) as? Bool ?? 
true } set { objectWillChange.send() - self.defaults.set(newValue, forKey: Keys.pauseMediaDuringTranscription) + self.defaults.set(newValue, forKey: Keys.preventSleepDuringTranscription) } } @@ -3670,6 +3731,8 @@ private extension SettingsStore { /// Media Playback Control static let pauseMediaDuringTranscription = "PauseMediaDuringTranscription" + static let mediaBehaviorDuringTranscription = "MediaBehaviorDuringTranscription" + static let preventSleepDuringTranscription = "PreventSleepDuringTranscription" /// Custom Dictation Prompt static let customDictationPrompt = "CustomDictationPrompt" diff --git a/Sources/Fluid/Services/ASRService.swift b/Sources/Fluid/Services/ASRService.swift index 984d5117..10d718ff 100644 --- a/Sources/Fluid/Services/ASRService.swift +++ b/Sources/Fluid/Services/ASRService.swift @@ -506,9 +506,9 @@ final class ASRService: ObservableObject { private let audioRouteRecoveryDelayNanoseconds: UInt64 = 1_000_000_000 private var isRecoveringAudioRoute = false - /// Tracks whether we paused system media for this recording session. - /// Used to resume playback only if we were the ones who paused it. - private var didPauseMediaForThisSession: Bool = false + /// What `MediaPlaybackService` did at the start of this session (paused, + /// ducked, or nothing). Used to undo that action on stop. 
+ private var mediaSessionAction: MediaSessionAction = .none private var audioLevelSubject = PassthroughSubject() var audioLevelPublisher: AnyPublisher { self.audioLevelSubject.eraseToAnyPublisher() } @@ -751,8 +751,8 @@ final class ASRService: ObservableObject { return } - // Reset media pause state for this session - self.didPauseMediaForThisSession = false + // Reset media session action for this session + self.mediaSessionAction = .none self.audioRouteRecoveryTask?.cancel() self.audioRouteRecoveryTask = nil self.isRecoveringAudioRoute = false @@ -788,14 +788,28 @@ final class ASRService: ObservableObject { try self.setupEngineTap() DebugLogger.shared.debug("✅ Engine tap setup complete", source: "ASRService") - // Pause system media AFTER successful audio setup but BEFORE setting isRunning - // This ensures we only pause media when we know recording will succeed - if SettingsStore.shared.pauseMediaDuringTranscription { + // Apply media behaviour AFTER successful audio setup but BEFORE setting isRunning + // This ensures we only touch media when we know recording will succeed + switch SettingsStore.shared.mediaBehaviorDuringTranscription { + case .none: + self.mediaSessionAction = .none + case .pause: let didPause = await MediaPlaybackService.shared.pauseIfPlaying() - self.didPauseMediaForThisSession = didPause + self.mediaSessionAction = didPause ? .paused : .none if didPause { DebugLogger.shared.info("🎵 Paused system media for transcription", source: "ASRService") } + case .duck: + if let previousVolume = MediaPlaybackService.shared.duckSystemVolume() { + self.mediaSessionAction = .ducked(previousVolume: previousVolume) + } else { + self.mediaSessionAction = .none + } + } + + // Hold the display awake while recording, if the user opted in. 
+ if SettingsStore.shared.preventSleepDuringTranscription { + SleepPreventionService.shared.preventSleep() } self.isRunning = true @@ -821,12 +835,11 @@ final class ASRService: ObservableObject { } catch { DebugLogger.shared.error("Failed to start ASR session: \(error)", source: "ASRService") - // Resume media if we paused it before the failure - if self.didPauseMediaForThisSession { - await MediaPlaybackService.shared.resumeIfWePaused(true) - self.didPauseMediaForThisSession = false - DebugLogger.shared.info("🎵 Resumed system media after start failure", source: "ASRService") - } + // Undo any media action we took before the failure. + await MediaPlaybackService.shared.restore(from: self.mediaSessionAction) + self.mediaSessionAction = .none + // Always release any sleep assertion we created. + SleepPreventionService.shared.allowSleep() // Provide user-friendly error feedback let errorMessage: String @@ -894,9 +907,14 @@ final class ASRService: ObservableObject { self.audioRouteRecoveryTask = nil self.isRecoveringAudioRoute = false - // Capture media pause state before we reset it, for resuming at the end - let shouldResumeMedia = SettingsStore.shared.pauseMediaDuringTranscription && self.didPauseMediaForThisSession - self.didPauseMediaForThisSession = false // Reset for next session + // Capture the media action so we can undo it at every exit path. + let pendingMediaRestore = self.mediaSessionAction + self.mediaSessionAction = .none // Reset for next session + + // Always release the sleep assertion at the start of stop — recording + // is over from the user's point of view, even if transcription is + // still running. 
+ SleepPreventionService.shared.allowSleep() DebugLogger.shared.debug("📍 Preparing final transcription", source: "ASRService") @@ -928,6 +946,12 @@ final class ASRService: ObservableObject { // New engine will be lazily created on next access via computed property DebugLogger.shared.debug("✅ Engine instance recreated", source: "ASRService") + // Restore media as soon as the audio engine is fully torn down — there's + // no risk of recording the volume bump now that capture has stopped, and + // it lines the volume restore up with the moment the user lifts the + // hotkey rather than the moment transcription finishes. + await MediaPlaybackService.shared.restore(from: pendingMediaRestore) + // CRITICAL FIX: Await completion of streaming task AND any pending transcriptions // This prevents use-after-free crashes (EXC_BAD_ACCESS) when clearing buffer DebugLogger.shared.debug("⏳ Awaiting stopStreamingTimerAndAwait()...", source: "ASRService") @@ -955,10 +979,6 @@ final class ASRService: ObservableObject { "Final ASR result | provider=\(self.transcriptionProvider.name) | samples=0 | textChars=0 | confidence=nil | reason=no_audio", source: "ASRService" ) - if shouldResumeMedia { - await MediaPlaybackService.shared.resumeIfWePaused(true) - DebugLogger.shared.info("🎵 Resumed system media after empty audio", source: "ASRService") - } return "" } @@ -984,11 +1004,6 @@ final class ASRService: ObservableObject { guard self.transcriptionProvider.isReady else { DebugLogger.shared.error("Transcription provider is not ready", source: "ASRService") - // Resume media playback if we paused it - if shouldResumeMedia { - await MediaPlaybackService.shared.resumeIfWePaused(true) - DebugLogger.shared.info("🎵 Resumed system media after provider not ready", source: "ASRService") - } return "" } @@ -1026,12 +1041,6 @@ final class ASRService: ObservableObject { self.recordWordBoostHitIfAny(transcribedText: cleanedText) DebugLogger.shared.debug("After post-processing: '\(cleanedText)'", source: 
"ASRService") - // Resume media playback if we paused it - if shouldResumeMedia { - await MediaPlaybackService.shared.resumeIfWePaused(true) - DebugLogger.shared.info("🎵 Resumed system media after transcription", source: "ASRService") - } - return cleanedText } catch { DebugLogger.shared.error("ASR transcription failed: \(error)", source: "ASRService") @@ -1055,12 +1064,6 @@ final class ASRService: ObservableObject { // (e.g., accidental hotkey press) and would disrupt the user's workflow. // Errors are logged for debugging purposes. - // Resume media playback if we paused it - if shouldResumeMedia { - await MediaPlaybackService.shared.resumeIfWePaused(true) - DebugLogger.shared.info("🎵 Resumed system media after transcription failure", source: "ASRService") - } - return "" } } @@ -1073,9 +1076,12 @@ final class ASRService: ObservableObject { self.audioRouteRecoveryTask = nil self.isRecoveringAudioRoute = false - // Capture media pause state before we reset it, for resuming at the end - let shouldResumeMedia = SettingsStore.shared.pauseMediaDuringTranscription && self.didPauseMediaForThisSession - self.didPauseMediaForThisSession = false // Reset for next session + // Capture the media action so we can undo it after teardown. + let pendingMediaRestore = self.mediaSessionAction + self.mediaSessionAction = .none // Reset for next session + + // Release the sleep assertion as soon as recording stops. 
+ SleepPreventionService.shared.allowSleep() DebugLogger.shared.info("🛑 Stopping recording - releasing audio devices", source: "ASRService") @@ -1117,11 +1123,7 @@ final class ASRService: ObservableObject { self.lastStreamingChunkFailureAnalyticsAt = nil self.refreshWordBoostStatus() - // Resume media playback if we paused it - if shouldResumeMedia { - await MediaPlaybackService.shared.resumeIfWePaused(true) - DebugLogger.shared.info("🎵 Resumed system media after stopping without transcription", source: "ASRService") - } + await MediaPlaybackService.shared.restore(from: pendingMediaRestore) } private func configureSession() throws { diff --git a/Sources/Fluid/Services/MediaPlaybackService.swift b/Sources/Fluid/Services/MediaPlaybackService.swift index 72160960..70f0e5f8 100644 --- a/Sources/Fluid/Services/MediaPlaybackService.swift +++ b/Sources/Fluid/Services/MediaPlaybackService.swift @@ -3,11 +3,38 @@ import Foundation import MediaRemoteAdapter #endif +/// What `MediaPlaybackService` did at the start of a transcription session. +/// Stored on `ASRService` so the matching restore at stop knows whether to +/// resume playback, restore the system volume, or do nothing. +enum MediaSessionAction: Equatable { + case none + case paused + case ducked(previousVolume: Float) +} + +/// Volume the system output is dropped to while ducking. 10% of full scale — +/// quiet enough that the music doesn't compete with dictation, loud enough +/// that the user knows something's still playing. +private let kDuckTargetVolume: Float = 0.10 + +/// Length of the fade ramp in seconds. Short enough that the duck has +/// fully landed before the user starts dictating, long enough to read as a +/// fade rather than a hard cut. +private let kFadeDuration: TimeInterval = 0.2 + +/// Number of discrete steps in the fade ramp. 30 steps over 200ms is ~150 +/// Hz, well above the threshold where you'd hear the staircase. 
+private let kFadeSteps = 30 + /// Service that wraps MediaRemoteAdapter's MediaController to provide -/// controlled pause/resume functionality during transcription. +/// controlled pause/resume functionality during transcription, plus a +/// volume-duck path for users who want music to keep playing quietly. /// -/// This service ensures we only pause media if it's currently playing, -/// and only resume if we were the ones who paused it. +/// Pause path: only pauses if media is currently playing, and only resumes +/// if we were the ones who paused it. +/// +/// Duck path: snapshots the current default output device volume, sets it to +/// `kDuckTargetVolume`, and restores the snapshotted value on stop. @MainActor final class MediaPlaybackService { static let shared = MediaPlaybackService() @@ -16,6 +43,11 @@ final class MediaPlaybackService { private let mediaController = MediaController() #endif + /// Holds the in-flight volume-fade task so a new fade can cancel any + /// previous one (e.g. the user releases the hotkey before the + /// fade-down has finished, and the fade-up needs to take over cleanly). + private var activeFadeTask: Task? + private init() {} // MARK: - Public API @@ -148,4 +180,94 @@ final class MediaPlaybackService { // No-op on Intel } #endif + + // MARK: - Duck path + + /// Snapshots the current system output volume and starts a background + /// fade-down to `kDuckTargetVolume`. Returns the previous volume so the + /// caller can hand it back to `restoreSystemVolume(previous:)` on stop. + /// + /// Returns `nil` if the volume couldn't be read, or if the user's + /// volume is already at or below the duck target — in either case we + /// don't touch the volume at all (and the matching restore becomes a + /// no-op). + func duckSystemVolume() -> Float? 
{ + guard let previous = SystemVolumeController.currentVolume() else { + DebugLogger.shared.debug( + "MediaPlaybackService: Couldn't read system volume, skipping duck", + source: "MediaPlaybackService" + ) + return nil + } + guard previous > kDuckTargetVolume else { + DebugLogger.shared.debug( + "MediaPlaybackService: Volume \(String(format: "%.2f", previous)) already ≤ duck target, skipping", + source: "MediaPlaybackService" + ) + return nil + } + DebugLogger.shared.info( + "🔉 Fading system volume \(String(format: "%.2f", previous)) → \(String(format: "%.2f", kDuckTargetVolume)) over \(kFadeDuration)s", + source: "MediaPlaybackService" + ) + self.startFade(from: previous, to: kDuckTargetVolume) + return previous + } + + /// Fades the system output volume back up to the value snapshotted by + /// `duckSystemVolume()`. Reads the live volume first so a mid-fade + /// interruption (user released the hotkey before the duck-down had + /// finished) restarts cleanly from wherever the volume actually is. + func restoreSystemVolume(previous: Float?) { + guard let previous else { return } + let start = SystemVolumeController.currentVolume() ?? kDuckTargetVolume + DebugLogger.shared.info( + "🔊 Fading system volume \(String(format: "%.2f", start)) → \(String(format: "%.2f", previous)) over \(kFadeDuration)s", + source: "MediaPlaybackService" + ) + self.startFade(from: start, to: previous) + } + + /// Cancels any in-flight fade and starts a new one from `start` to + /// `target` over `kFadeDuration`. Runs detached so the main actor isn't + /// blocked between steps; CoreAudio property writes are thread-safe. 
+ private func startFade(from start: Float, to target: Float) { + self.activeFadeTask?.cancel() + + let stepCount = kFadeSteps + let stepDelay = kFadeDuration / Double(stepCount) + let stepDelayNanos = UInt64(stepDelay * 1_000_000_000) + let delta = (target - start) / Float(stepCount) + + self.activeFadeTask = Task.detached(priority: .userInitiated) { + for step in 1...stepCount { + if Task.isCancelled { return } + let value = start + delta * Float(step) + _ = SystemVolumeController.setVolume(value) + if step < stepCount { + try? await Task.sleep(nanoseconds: stepDelayNanos) + } + } + // Land exactly on the target value if we weren't cancelled — + // floating-point drift across 30 steps could otherwise leave us + // a hair off (e.g. 0.0997 instead of 0.10). + if !Task.isCancelled { + _ = SystemVolumeController.setVolume(target) + } + } + } + + // MARK: - Unified restore + + /// Undoes whatever `MediaSessionAction` was taken at recording start. + func restore(from action: MediaSessionAction) async { + switch action { + case .none: + return + case .paused: + await self.resumeIfWePaused(true) + case .ducked(let previousVolume): + self.restoreSystemVolume(previous: previousVolume) + } + } } diff --git a/Sources/Fluid/Services/SleepPreventionService.swift b/Sources/Fluid/Services/SleepPreventionService.swift new file mode 100644 index 00000000..5e22a1bc --- /dev/null +++ b/Sources/Fluid/Services/SleepPreventionService.swift @@ -0,0 +1,68 @@ +import Foundation +import IOKit.pwr_mgt + +/// Holds an `IOPMAssertion` that prevents the display (and the system) from +/// going idle while the user is dictating. Released as soon as recording +/// stops so the laptop returns to its normal sleep behaviour. +/// +/// Uses `kIOPMAssertionTypePreventUserIdleDisplaySleep` rather than +/// `kIOPMAssertionTypePreventUserIdleSystemSleep`. 
The display assertion +/// implies the system one (display can't be on if the system's asleep), so +/// it's strictly stronger; and it stops the screen-lock timer that fires +/// after display sleep, which is the visible symptom Andrew was seeing. +@MainActor +final class SleepPreventionService { + static let shared = SleepPreventionService() + + private var assertionID: IOPMAssertionID = 0 + private var isActive = false + + private init() {} + + /// Creates the sleep-prevention assertion. No-op if already active so the + /// service is safe to call from re-entrant code paths. + func preventSleep(reason: String = "FluidVoice transcribing") { + guard !self.isActive else { return } + + var newID: IOPMAssertionID = 0 + let result = IOPMAssertionCreateWithName( + kIOPMAssertionTypePreventUserIdleDisplaySleep as CFString, + IOPMAssertionLevel(kIOPMAssertionLevelOn), + reason as CFString, + &newID + ) + + if result == kIOReturnSuccess { + self.assertionID = newID + self.isActive = true + DebugLogger.shared.info( + "☕ Sleep prevention assertion created (\(reason))", + source: "SleepPreventionService" + ) + } else { + DebugLogger.shared.warning( + "SleepPreventionService: IOPMAssertionCreateWithName failed (\(result))", + source: "SleepPreventionService" + ) + } + } + + /// Releases the assertion. No-op if there's nothing to release. 
+ func allowSleep() { + guard self.isActive else { return } + let result = IOPMAssertionRelease(self.assertionID) + self.assertionID = 0 + self.isActive = false + if result == kIOReturnSuccess { + DebugLogger.shared.info( + "💤 Sleep prevention assertion released", + source: "SleepPreventionService" + ) + } else { + DebugLogger.shared.warning( + "SleepPreventionService: IOPMAssertionRelease failed (\(result))", + source: "SleepPreventionService" + ) + } + } +} diff --git a/Sources/Fluid/Services/SystemVolumeController.swift b/Sources/Fluid/Services/SystemVolumeController.swift new file mode 100644 index 00000000..7f2ce2e3 --- /dev/null +++ b/Sources/Fluid/Services/SystemVolumeController.swift @@ -0,0 +1,119 @@ +import CoreAudio +import Foundation + +/// Reads and writes the default output device's volume via CoreAudio's +/// `AudioObjectGetPropertyData` / `AudioObjectSetPropertyData`. +/// +/// macOS doesn't expose per-app output volume in any stable public API, so +/// adjusting the system output level is the closest equivalent. Side effect: +/// notification dings and other system sounds duck along with media for the +/// duration. That's intentional — the user is dictating, they don't want +/// surprises through the speakers. +/// +/// CoreAudio's `AudioObject*` APIs are thread-safe, so this enum is callable +/// from any actor or detached task — useful for background fade ramps. +enum SystemVolumeController { + /// Returns the current default output device's master scalar volume in + /// `0.0...1.0`, or `nil` if the device or master volume property isn't + /// available (some devices only expose per-channel volume). + static func currentVolume() -> Float? 
{ + guard let deviceID = defaultOutputDeviceID() else { return nil } + + var address = AudioObjectPropertyAddress( + mSelector: kAudioDevicePropertyVolumeScalar, + mScope: kAudioDevicePropertyScopeOutput, + mElement: kAudioObjectPropertyElementMain + ) + + if AudioObjectHasProperty(deviceID, &address) { + var volume: Float32 = 0 + var size = UInt32(MemoryLayout.size) + let status = AudioObjectGetPropertyData(deviceID, &address, 0, nil, &size, &volume) + if status == noErr { + return volume + } + } + + // Fall back to averaging channels 1 and 2 if master isn't exposed. + let left = readChannelVolume(deviceID: deviceID, channel: 1) + let right = readChannelVolume(deviceID: deviceID, channel: 2) + switch (left, right) { + case let (l?, r?): return (l + r) / 2 + case let (l?, nil): return l + case let (nil, r?): return r + default: return nil + } + } + + /// Sets the default output device's volume to `value` (clamped to `0.0...1.0`). + /// Writes master if available, falls back to writing channels 1 and 2. + @discardableResult + static func setVolume(_ value: Float) -> Bool { + guard let deviceID = defaultOutputDeviceID() else { return false } + let clamped = max(0, min(1, value)) + + var masterAddress = AudioObjectPropertyAddress( + mSelector: kAudioDevicePropertyVolumeScalar, + mScope: kAudioDevicePropertyScopeOutput, + mElement: kAudioObjectPropertyElementMain + ) + + if AudioObjectHasProperty(deviceID, &masterAddress) { + var newValue = clamped + let size = UInt32(MemoryLayout.size) + let status = AudioObjectSetPropertyData(deviceID, &masterAddress, 0, nil, size, &newValue) + if status == noErr { return true } + } + + let leftOK = writeChannelVolume(deviceID: deviceID, channel: 1, value: clamped) + let rightOK = writeChannelVolume(deviceID: deviceID, channel: 2, value: clamped) + return leftOK || rightOK + } + + // MARK: - Private + + private static func defaultOutputDeviceID() -> AudioObjectID? 
{ + var deviceID = AudioObjectID(kAudioObjectUnknown) + var size = UInt32(MemoryLayout.size) + var address = AudioObjectPropertyAddress( + mSelector: kAudioHardwarePropertyDefaultOutputDevice, + mScope: kAudioObjectPropertyScopeGlobal, + mElement: kAudioObjectPropertyElementMain + ) + let status = AudioObjectGetPropertyData( + AudioObjectID(kAudioObjectSystemObject), + &address, + 0, + nil, + &size, + &deviceID + ) + return (status == noErr && deviceID != kAudioObjectUnknown) ? deviceID : nil + } + + private static func readChannelVolume(deviceID: AudioObjectID, channel: UInt32) -> Float? { + var address = AudioObjectPropertyAddress( + mSelector: kAudioDevicePropertyVolumeScalar, + mScope: kAudioDevicePropertyScopeOutput, + mElement: channel + ) + guard AudioObjectHasProperty(deviceID, &address) else { return nil } + var volume: Float32 = 0 + var size = UInt32(MemoryLayout.size) + let status = AudioObjectGetPropertyData(deviceID, &address, 0, nil, &size, &volume) + return status == noErr ? 
volume : nil + } + + private static func writeChannelVolume(deviceID: AudioObjectID, channel: UInt32, value: Float) -> Bool { + var address = AudioObjectPropertyAddress( + mSelector: kAudioDevicePropertyVolumeScalar, + mScope: kAudioDevicePropertyScopeOutput, + mElement: channel + ) + guard AudioObjectHasProperty(deviceID, &address) else { return false } + var newValue = value + let size = UInt32(MemoryLayout.size) + let status = AudioObjectSetPropertyData(deviceID, &address, 0, nil, size, &newValue) + return status == noErr + } +} diff --git a/Sources/Fluid/UI/SettingsView.swift b/Sources/Fluid/UI/SettingsView.swift index 10c6384e..fc099e2a 100644 --- a/Sources/Fluid/UI/SettingsView.swift +++ b/Sources/Fluid/UI/SettingsView.swift @@ -836,12 +836,36 @@ struct SettingsView: View { ) Divider().opacity(0.2) + HStack(alignment: .center) { + VStack(alignment: .leading, spacing: 2) { + Text("Music During Transcription") + .font(.body) + Text("Leave playing, pause it, or lower the system volume to 10% while you dictate (restored when you stop).") + .font(.caption) + .foregroundStyle(.secondary) + } + + Spacer() + + Picker("", selection: Binding( + get: { SettingsStore.shared.mediaBehaviorDuringTranscription }, + set: { SettingsStore.shared.mediaBehaviorDuringTranscription = $0 } + )) { + ForEach(SettingsStore.MediaBehaviorDuringTranscription.allCases) { mode in + Text(mode.displayName).tag(mode) + } + } + .frame(width: 160) + .labelsHidden() + } + Divider().opacity(0.2) + self.optionToggleRow( - title: "Pause Media During Transcription", - description: "Automatically pause currently playing audio/video when transcription starts. Resumes only if FluidVoice paused it.", + title: "Keep Mac Awake While Dictating", + description: "Prevents the display from sleeping or locking while a recording is active. 
Released as soon as you stop.", isOn: Binding( - get: { SettingsStore.shared.pauseMediaDuringTranscription }, - set: { SettingsStore.shared.pauseMediaDuringTranscription = $0 } + get: { SettingsStore.shared.preventSleepDuringTranscription }, + set: { SettingsStore.shared.preventSleepDuringTranscription = $0 } ) ) Divider().opacity(0.2) From bf40a9af805be4ff21b704e34937b589f10aef38 Mon Sep 17 00:00:00 2001 From: Andrew Beniston Date: Thu, 7 May 2026 22:38:50 +0100 Subject: [PATCH 2/3] Address Codex review on PR #325 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes for issues the automated reviewer flagged. 1. Backup restore: previously the legacy pauseMediaDuringTranscription bool's setter only mapped false to .none if the current mode was .pause, so restoring a backup that meant "leave playing" onto a machine currently set to .duck silently kept ducking. Restore now reads the new lossless mediaBehaviorDuringTranscription enum field from the payload when present, falling back to the bool only for backups created by older builds. The legacy bool's setter is also tightened so false always means .none — deterministic regardless of prior state. 2. Per-channel volume preservation: the duck snapshot path used to average L/R into a single scalar on devices without master volume, then write that average back to both channels on restore. One duck cycle would permanently flatten any non-centred balance. Replace the Float scalar in MediaSessionAction.ducked and the duck/restore API surface with a SystemVolumeSnapshot enum that holds either the master scalar or distinct L/R values, then restore the snapshot exactly at the end of the fade-up ramp so balance survives. Fade itself stays a scalar interpolation — the precise channel restore only happens on the final write, so the ramp's CPU profile is unchanged. 
--- Sources/Fluid/Persistence/BackupService.swift | 5 ++ Sources/Fluid/Persistence/SettingsStore.swift | 25 ++++-- .../Fluid/Services/MediaPlaybackService.swift | 60 +++++++++---- .../Services/SystemVolumeController.swift | 89 ++++++++++++++++--- 4 files changed, 140 insertions(+), 39 deletions(-) diff --git a/Sources/Fluid/Persistence/BackupService.swift b/Sources/Fluid/Persistence/BackupService.swift index deab0202..dbb10f67 100644 --- a/Sources/Fluid/Persistence/BackupService.swift +++ b/Sources/Fluid/Persistence/BackupService.swift @@ -61,6 +61,11 @@ struct SettingsBackupPayload: Codable, Equatable { let removeFillerWordsEnabled: Bool let gaavModeEnabled: Bool let pauseMediaDuringTranscription: Bool + /// Lossless capture of the unified media-behaviour enum (none / pause / + /// duck). Optional so that backups created by older builds (which only + /// wrote the legacy bool) still decode cleanly. New builds prefer this + /// field on restore and fall back to the bool only when it's nil. + let mediaBehaviorDuringTranscription: SettingsStore.MediaBehaviorDuringTranscription? let vocabularyBoostingEnabled: Bool let customDictionaryEntries: [SettingsStore.CustomDictionaryEntry] let selectedDictationPromptID: String? 
diff --git a/Sources/Fluid/Persistence/SettingsStore.swift b/Sources/Fluid/Persistence/SettingsStore.swift index de0d1310..e3e2575f 100644 --- a/Sources/Fluid/Persistence/SettingsStore.swift +++ b/Sources/Fluid/Persistence/SettingsStore.swift @@ -2261,6 +2261,7 @@ final class SettingsStore: ObservableObject { removeFillerWordsEnabled: self.removeFillerWordsEnabled, gaavModeEnabled: self.gaavModeEnabled, pauseMediaDuringTranscription: self.pauseMediaDuringTranscription, + mediaBehaviorDuringTranscription: self.mediaBehaviorDuringTranscription, vocabularyBoostingEnabled: self.vocabularyBoostingEnabled, customDictionaryEntries: self.customDictionaryEntries, selectedDictationPromptID: self.selectedDictationPromptID, @@ -2332,7 +2333,15 @@ final class SettingsStore: ObservableObject { self.fillerWords = payload.fillerWords self.removeFillerWordsEnabled = payload.removeFillerWordsEnabled self.gaavModeEnabled = payload.gaavModeEnabled - self.pauseMediaDuringTranscription = payload.pauseMediaDuringTranscription + // Prefer the lossless enum if the backup carried it (new builds); + // fall back to the legacy bool for backups from older versions. + // Either way the assignment is deterministic — current state on the + // restoring machine never decides the outcome. + if let mode = payload.mediaBehaviorDuringTranscription { + self.mediaBehaviorDuringTranscription = mode + } else { + self.mediaBehaviorDuringTranscription = payload.pauseMediaDuringTranscription ? .pause : .none + } self.vocabularyBoostingEnabled = payload.vocabularyBoostingEnabled self.customDictionaryEntries = payload.customDictionaryEntries @@ -2945,17 +2954,15 @@ final class SettingsStore: ObservableObject { } /// Legacy boolean view of `mediaBehaviorDuringTranscription`. Kept so - /// `BackupService`'s payload (which exports a `Bool`) stays compatible. - /// Setting `true` selects `.pause`; setting `false` selects `.none` only - /// if the current mode was `.pause` — `.duck` is preserved. 
+ /// `BackupService`'s payload (which still exports a `Bool` for backward + /// compatibility with older builds) round-trips through the same key. + /// Deterministic in both directions: `true` selects `.pause`, `false` + /// selects `.none`. Restore paths should prefer the lossless enum field + /// on the payload when available so `.duck` survives a round trip. var pauseMediaDuringTranscription: Bool { get { self.mediaBehaviorDuringTranscription == .pause } set { - if newValue { - self.mediaBehaviorDuringTranscription = .pause - } else if self.mediaBehaviorDuringTranscription == .pause { - self.mediaBehaviorDuringTranscription = .none - } + self.mediaBehaviorDuringTranscription = newValue ? .pause : .none } } diff --git a/Sources/Fluid/Services/MediaPlaybackService.swift b/Sources/Fluid/Services/MediaPlaybackService.swift index 70f0e5f8..a3066fdc 100644 --- a/Sources/Fluid/Services/MediaPlaybackService.swift +++ b/Sources/Fluid/Services/MediaPlaybackService.swift @@ -6,10 +6,15 @@ import MediaRemoteAdapter /// What `MediaPlaybackService` did at the start of a transcription session. /// Stored on `ASRService` so the matching restore at stop knows whether to /// resume playback, restore the system volume, or do nothing. +/// +/// The duck case carries a `SystemVolumeSnapshot` rather than a single +/// `Float` so output devices that expose only per-channel volume (no master) +/// can have their stereo balance restored exactly. A flat scalar would +/// collapse L/R to their average on every duck cycle. enum MediaSessionAction: Equatable { case none case paused - case ducked(previousVolume: Float) + case ducked(previousVolume: SystemVolumeSnapshot) } /// Volume the system output is dropped to while ducking. 10% of full scale — @@ -184,54 +189,65 @@ final class MediaPlaybackService { // MARK: - Duck path /// Snapshots the current system output volume and starts a background - /// fade-down to `kDuckTargetVolume`. 
Returns the previous volume so the - /// caller can hand it back to `restoreSystemVolume(previous:)` on stop. + /// fade-down to `kDuckTargetVolume`. Returns the snapshot so the caller + /// can hand it back to `restoreSystemVolume(previous:)` on stop. /// /// Returns `nil` if the volume couldn't be read, or if the user's /// volume is already at or below the duck target — in either case we /// don't touch the volume at all (and the matching restore becomes a /// no-op). - func duckSystemVolume() -> Float? { - guard let previous = SystemVolumeController.currentVolume() else { + func duckSystemVolume() -> SystemVolumeSnapshot? { + guard let snapshot = SystemVolumeController.currentSnapshot() else { DebugLogger.shared.debug( "MediaPlaybackService: Couldn't read system volume, skipping duck", source: "MediaPlaybackService" ) return nil } - guard previous > kDuckTargetVolume else { + let previousScalar = snapshot.averageScalar + guard previousScalar > kDuckTargetVolume else { DebugLogger.shared.debug( - "MediaPlaybackService: Volume \(String(format: "%.2f", previous)) already ≤ duck target, skipping", + "MediaPlaybackService: Volume \(String(format: "%.2f", previousScalar)) already ≤ duck target, skipping", source: "MediaPlaybackService" ) return nil } DebugLogger.shared.info( - "🔉 Fading system volume \(String(format: "%.2f", previous)) → \(String(format: "%.2f", kDuckTargetVolume)) over \(kFadeDuration)s", + "🔉 Fading system volume \(String(format: "%.2f", previousScalar)) → \(String(format: "%.2f", kDuckTargetVolume)) over \(kFadeDuration)s", source: "MediaPlaybackService" ) - self.startFade(from: previous, to: kDuckTargetVolume) - return previous + self.startFade(from: previousScalar, to: kDuckTargetVolume, restoreSnapshot: nil) + return snapshot } - /// Fades the system output volume back up to the value snapshotted by + /// Fades the system output volume back up to the snapshot captured by /// `duckSystemVolume()`. 
Reads the live volume first so a mid-fade /// interruption (user released the hotkey before the duck-down had /// finished) restarts cleanly from wherever the volume actually is. - func restoreSystemVolume(previous: Float?) { + /// Re-applies the snapshot exactly at the end of the ramp so per-channel + /// detail (e.g. uneven L/R balance) comes back precisely rather than + /// flattened to the fade scalar. + func restoreSystemVolume(previous: SystemVolumeSnapshot?) { guard let previous else { return } let start = SystemVolumeController.currentVolume() ?? kDuckTargetVolume + let target = previous.averageScalar DebugLogger.shared.info( - "🔊 Fading system volume \(String(format: "%.2f", start)) → \(String(format: "%.2f", previous)) over \(kFadeDuration)s", + "🔊 Fading system volume \(String(format: "%.2f", start)) → \(String(format: "%.2f", target)) over \(kFadeDuration)s", source: "MediaPlaybackService" ) - self.startFade(from: start, to: previous) + self.startFade(from: start, to: target, restoreSnapshot: previous) } /// Cancels any in-flight fade and starts a new one from `start` to /// `target` over `kFadeDuration`. Runs detached so the main actor isn't /// blocked between steps; CoreAudio property writes are thread-safe. - private func startFade(from start: Float, to target: Float) { + /// + /// - Parameter restoreSnapshot: If non-nil, this snapshot is applied + /// exactly at the end of the ramp instead of writing the scalar + /// `target`. Used by the fade-up so per-channel volume detail + /// (uneven L/R balance) is restored precisely. Pass `nil` for the + /// fade-down — there's nothing to preserve at the duck target. + private func startFade(from start: Float, to target: Float, restoreSnapshot: SystemVolumeSnapshot?) { self.activeFadeTask?.cancel() let stepCount = kFadeSteps @@ -248,11 +264,17 @@ final class MediaPlaybackService { try? 
await Task.sleep(nanoseconds: stepDelayNanos) } } - // Land exactly on the target value if we weren't cancelled — - // floating-point drift across 30 steps could otherwise leave us - // a hair off (e.g. 0.0997 instead of 0.10). + // Land exactly on the target if we weren't cancelled — + // floating-point drift across the steps could otherwise leave + // us a hair off (e.g. 0.0997 instead of 0.10). If the caller + // asked for an exact snapshot restore, prefer that over the + // scalar target so per-channel detail comes back intact. if !Task.isCancelled { - _ = SystemVolumeController.setVolume(target) + if let restoreSnapshot { + _ = SystemVolumeController.restore(restoreSnapshot) + } else { + _ = SystemVolumeController.setVolume(target) + } } } } diff --git a/Sources/Fluid/Services/SystemVolumeController.swift b/Sources/Fluid/Services/SystemVolumeController.swift index 7f2ce2e3..9d47712a 100644 --- a/Sources/Fluid/Services/SystemVolumeController.swift +++ b/Sources/Fluid/Services/SystemVolumeController.swift @@ -1,6 +1,33 @@ import CoreAudio import Foundation +/// A captured snapshot of the system output volume that survives the duck +/// cycle and can be restored exactly. Keeps left/right channel values +/// independent for devices that don't expose a master volume property — +/// otherwise a non-centred balance setup would have one duck cycle +/// permanently flatten its channels to the average. +enum SystemVolumeSnapshot: Equatable { + case master(Float) + case channels(left: Float?, right: Float?) + + /// Scalar used as the "from" value of a fade ramp — fades interpolate a + /// single value, then we restore the exact snapshot at the end so any + /// per-channel detail comes back precisely. 
+ var averageScalar: Float { + switch self { + case .master(let v): + return v + case .channels(let l, let r): + switch (l, r) { + case let (l?, r?): return (l + r) / 2 + case let (l?, nil): return l + case let (nil, r?): return r + case (nil, nil): return 0 + } + } + } +} + /// Reads and writes the default output device's volume via CoreAudio's /// `AudioObjectGetPropertyData` / `AudioObjectSetPropertyData`. /// @@ -15,38 +42,45 @@ import Foundation enum SystemVolumeController { /// Returns the current default output device's master scalar volume in /// `0.0...1.0`, or `nil` if the device or master volume property isn't - /// available (some devices only expose per-channel volume). + /// available. Used as the "from" value of the duck-down fade ramp. static func currentVolume() -> Float? { + currentSnapshot()?.averageScalar + } + + /// Captures the current default output device's full volume state for + /// later exact restoration. Prefers the master scalar, falls back to a + /// per-channel snapshot for devices that don't expose master volume. + static func currentSnapshot() -> SystemVolumeSnapshot? { guard let deviceID = defaultOutputDeviceID() else { return nil } - var address = AudioObjectPropertyAddress( + var masterAddress = AudioObjectPropertyAddress( mSelector: kAudioDevicePropertyVolumeScalar, mScope: kAudioDevicePropertyScopeOutput, mElement: kAudioObjectPropertyElementMain ) - if AudioObjectHasProperty(deviceID, &address) { + if AudioObjectHasProperty(deviceID, &masterAddress) { var volume: Float32 = 0 var size = UInt32(MemoryLayout<Float32>.size) - let status = AudioObjectGetPropertyData(deviceID, &address, 0, nil, &size, &volume) + let status = AudioObjectGetPropertyData(deviceID, &masterAddress, 0, nil, &size, &volume) if status == noErr { - return volume + return .master(volume) } } - // Fall back to averaging channels 1 and 2 if master isn't exposed. 
let left = readChannelVolume(deviceID: deviceID, channel: 1) let right = readChannelVolume(deviceID: deviceID, channel: 2) - switch (left, right) { - case let (l?, r?): return (l + r) / 2 - case let (l?, nil): return l - case let (nil, r?): return r - default: return nil + if left != nil || right != nil { + return .channels(left: left, right: right) } + return nil } /// Sets the default output device's volume to `value` (clamped to `0.0...1.0`). /// Writes master if available, falls back to writing channels 1 and 2. + /// Used by the duck-down fade ramp where balance preservation isn't + /// meaningful (the duck target is uniform); restore-up uses + /// `restore(_:)` to re-apply the original per-channel values exactly. @discardableResult static func setVolume(_ value: Float) -> Bool { guard let deviceID = defaultOutputDeviceID() else { return false } @@ -70,6 +104,39 @@ enum SystemVolumeController { return leftOK || rightOK } + /// Re-applies a snapshot exactly. For master snapshots this writes the + /// master scalar; for per-channel snapshots this writes the original + /// left and right values independently, preserving stereo balance that + /// `setVolume(_:)` would otherwise have flattened. 
+ @discardableResult + static func restore(_ snapshot: SystemVolumeSnapshot) -> Bool { + guard let deviceID = defaultOutputDeviceID() else { return false } + + switch snapshot { + case .master(let value): + var masterAddress = AudioObjectPropertyAddress( + mSelector: kAudioDevicePropertyVolumeScalar, + mScope: kAudioDevicePropertyScopeOutput, + mElement: kAudioObjectPropertyElementMain + ) + guard AudioObjectHasProperty(deviceID, &masterAddress) else { return false } + var newValue = max(0, min(1, value)) + let size = UInt32(MemoryLayout<Float32>.size) + let status = AudioObjectSetPropertyData(deviceID, &masterAddress, 0, nil, size, &newValue) + return status == noErr + + case .channels(let left, let right): + var anyOK = false + if let left { + anyOK = writeChannelVolume(deviceID: deviceID, channel: 1, value: max(0, min(1, left))) || anyOK + } + if let right { + anyOK = writeChannelVolume(deviceID: deviceID, channel: 2, value: max(0, min(1, right))) || anyOK + } + return anyOK + } + } + // MARK: - Private private static func defaultOutputDeviceID() -> AudioObjectID? { From 1a83116966f7d56aaea35af2da8299ccf159054e Mon Sep 17 00:00:00 2001 From: Andrew Beniston Date: Thu, 7 May 2026 23:14:13 +0100 Subject: [PATCH 3/3] Improve duck-down responsiveness so the fade lines up with the hotkey Three changes to make the duck feel snappy rather than lagging behind the start sound. The duck was previously fired inside ASRService.start(), which is called via Task { await asr.start() } after captureRecordingContext + applyDictationShortcutSelectionContext + setActiveRecordingMode + setOverlayMode have all run. That puts the duck ~80ms behind the hotkey on a typical Mac, while the start sound (which is fired from the same beginDictationRecording call site) becomes audible from CoreAudio in ~30ms. So the user heard the sound first and the fade a beat later. 1. Hoist the duck and start sound to the very top of beginDictationRecording in ContentView. 
They now fire alongside the hotkey press, before any of the pre-recording UI work. The duck snapshot is threaded through to ASRService.start() as a `preAppliedMediaAction` parameter so the existing in-start duck-handling code adopts the action without re-firing it. 2. Snap the volume halfway down to the duck target SYNCHRONOUSLY inside duckSystemVolume() before kicking off the detached fade Task. CoreAudio property writes are sub-millisecond round trip, so the audible drop now lands within ~1ms of the call returning, bypassing both Task.detached scheduling latency (occasionally 10-30ms on Debug builds) and the fade ramp's first few steps where each per-step volume change is too small to perceive. The detached fade then smoothly lands the rest of the way, so the duck still has a soft edge. 3. ASRService.start() restores the pre-applied media action on its guard-failure paths (mic-not-authorized, already-running) so a hotkey press that doesn't end up starting a session doesn't leave the user's volume stuck at the duck target. --- Sources/Fluid/ContentView.swift | 24 +++++++++++--- Sources/Fluid/Services/ASRService.swift | 33 ++++++++++++++----- .../Fluid/Services/MediaPlaybackService.swift | 24 +++++++++++--- 3 files changed, 64 insertions(+), 17 deletions(-) diff --git a/Sources/Fluid/ContentView.swift b/Sources/Fluid/ContentView.swift index e6b8da20..dbce4dae 100644 --- a/Sources/Fluid/ContentView.swift +++ b/Sources/Fluid/ContentView.swift @@ -2947,6 +2947,25 @@ extension ContentView { private func beginDictationRecording(for slot: SettingsStore.DictationShortcutSlot, mode: ActiveRecordingMode) { DebugLogger.shared.debug("Begin dictation recording for slot \(slot.rawValue)", source: "ContentView") + + // Fire the duck first, then the start sound — at the very top of this + // function so they happen alongside the hotkey press, not 80ms later + // behind captureRecordingContext + setActiveRecordingMode + + // setOverlayMode. 
Duck-before-sound matters because CoreAudio's sound + // playback latency (~30-50ms) is shorter than the time it takes the + // fade to drop the music to a perceptibly quieter level; if the + // sound went first, you'd hear it before the music dipped. + let willStart = !self.asr.isRunning + var preAppliedMediaAction: MediaSessionAction = .none + if willStart, SettingsStore.shared.mediaBehaviorDuringTranscription == .duck { + if let prev = MediaPlaybackService.shared.duckSystemVolume() { + preAppliedMediaAction = .ducked(previousVolume: prev) + } + } + if willStart, SettingsStore.shared.enableTranscriptionSounds { + TranscriptionSoundPlayer.shared.playStartSound() + } + self.captureRecordingContext() self.applyDictationShortcutSelectionContext(for: slot) self.setActiveRecordingMode(mode) @@ -2954,11 +2973,8 @@ extension ContentView { self.menuBarManager.setOverlayMode(.dictation) guard !self.asr.isRunning else { return } - if SettingsStore.shared.enableTranscriptionSounds { - TranscriptionSoundPlayer.shared.playStartSound() - } Task { - await self.asr.start() + await self.asr.start(preAppliedMediaAction: preAppliedMediaAction) } } diff --git a/Sources/Fluid/Services/ASRService.swift b/Sources/Fluid/Services/ASRService.swift index 10d718ff..f67c5259 100644 --- a/Sources/Fluid/Services/ASRService.swift +++ b/Sources/Fluid/Services/ASRService.swift @@ -739,20 +739,30 @@ final class ASRService: ObservableObject { /// ## Errors /// If audio session configuration fails, the method will silently fail /// and `isRunning` will remain `false`. Check the debug logs for details. - func start() async { + /// - Parameter preAppliedMediaAction: Optionally, a media action the + /// caller has already taken before invoking start() — used by + /// ContentView to fire the duck ramp the instant the hotkey fires + /// (alongside the start sound) rather than waiting ~80ms behind the + /// pre-recording UI work. 
If `.none`, start() will fire the duck + /// itself based on the user's setting. + func start(preAppliedMediaAction: MediaSessionAction = .none) async { DebugLogger.shared.info("🎤 START() called - beginning recording session", source: "ASRService") guard self.micStatus == .authorized else { DebugLogger.shared.error("❌ START() blocked - mic not authorized", source: "ASRService") + // The caller may have pre-fired a duck for a session we never get + // to run; undo it so the user's volume isn't left at 10%. + await MediaPlaybackService.shared.restore(from: preAppliedMediaAction) return } guard self.isRunning == false, self.isStarting == false else { DebugLogger.shared.warning("⚠️ START() blocked - already running (started: \(self.isRunning), starting: \(self.isStarting))", source: "ASRService") + await MediaPlaybackService.shared.restore(from: preAppliedMediaAction) return } - // Reset media session action for this session - self.mediaSessionAction = .none + // Adopt any media action the caller already took, otherwise reset. + self.mediaSessionAction = preAppliedMediaAction self.audioRouteRecoveryTask?.cancel() self.audioRouteRecoveryTask = nil self.isRecoveringAudioRoute = false @@ -788,8 +798,15 @@ final class ASRService: ObservableObject { try self.setupEngineTap() DebugLogger.shared.debug("✅ Engine tap setup complete", source: "ASRService") - // Apply media behaviour AFTER successful audio setup but BEFORE setting isRunning - // This ensures we only touch media when we know recording will succeed + // Apply media pause behaviour AFTER successful audio setup but + // BEFORE setting isRunning, so we only pause if recording will + // actually start. Duck is the responsibility of the caller — + // ContentView pre-fires it the instant the hotkey arrives so the + // fade visibly starts alongside the start sound rather than + // waiting behind audio engine setup. 
If a caller doesn't supply + // a pre-applied duck and the setting is .duck, we fire it here + // as a fallback so the behaviour still works for any code path + // that hasn't been hoisted. switch SettingsStore.shared.mediaBehaviorDuringTranscription { case .none: self.mediaSessionAction = .none @@ -800,10 +817,10 @@ final class ASRService: ObservableObject { DebugLogger.shared.info("🎵 Paused system media for transcription", source: "ASRService") } case .duck: - if let previousVolume = MediaPlaybackService.shared.duckSystemVolume() { + if case .ducked = self.mediaSessionAction { + // Duck was pre-fired by the caller — nothing more to do. + } else if let previousVolume = MediaPlaybackService.shared.duckSystemVolume() { self.mediaSessionAction = .ducked(previousVolume: previousVolume) - } else { - self.mediaSessionAction = .none } } diff --git a/Sources/Fluid/Services/MediaPlaybackService.swift b/Sources/Fluid/Services/MediaPlaybackService.swift index a3066fdc..fbc72904 100644 --- a/Sources/Fluid/Services/MediaPlaybackService.swift +++ b/Sources/Fluid/Services/MediaPlaybackService.swift @@ -25,10 +25,13 @@ private let kDuckTargetVolume: Float = 0.10 /// Length of the fade ramp in seconds. Short enough that the duck has /// fully landed before the user starts dictating, long enough to read as a /// fade rather than a hard cut. -private let kFadeDuration: TimeInterval = 0.2 +private let kFadeDuration: TimeInterval = 0.1 -/// Number of discrete steps in the fade ramp. 30 steps over 200ms is ~150 -/// Hz, well above the threshold where you'd hear the staircase. +/// Number of discrete steps in the fade ramp. 30 steps over 100ms is 300 Hz, +/// well above the threshold where you'd hear the staircase. The fade only +/// covers the second half of the duck (the first half is a synchronous snap +/// in `duckSystemVolume()` for snappy feel) so a relaxed 100ms tail reads +/// as a soft landing rather than a long fade. 
private let kFadeSteps = 30 /// Service that wraps MediaRemoteAdapter's MediaController to provide @@ -212,11 +215,22 @@ final class MediaPlaybackService { ) return nil } + + // Snap the volume halfway down to the duck target SYNCHRONOUSLY before + // starting the detached fade. This puts a clearly audible drop on the + // user's ear within the round-trip time of one CoreAudio property + // write (sub-millisecond), bypassing both Task.detached scheduling + // latency and the fade ramp's first few steps where the per-step + // volume change is too small to perceive. The detached fade then + // smoothly lands the rest of the way to kDuckTargetVolume. + let immediateDrop = (previousScalar + kDuckTargetVolume) / 2 + SystemVolumeController.setVolume(immediateDrop) + DebugLogger.shared.info( - "🔉 Fading system volume \(String(format: "%.2f", previousScalar)) → \(String(format: "%.2f", kDuckTargetVolume)) over \(kFadeDuration)s", + "🔉 Snapped \(String(format: "%.2f", previousScalar)) → \(String(format: "%.2f", immediateDrop)), fading to \(String(format: "%.2f", kDuckTargetVolume)) over \(kFadeDuration)s", source: "MediaPlaybackService" ) - self.startFade(from: previousScalar, to: kDuckTargetVolume, restoreSnapshot: nil) + self.startFade(from: immediateDrop, to: kDuckTargetVolume, restoreSnapshot: nil) return snapshot }