diff --git a/scripts/smoke-cljs-azure-parity.mjs b/scripts/smoke-cljs-azure-parity.mjs index 0875471..dd6e9f1 100644 --- a/scripts/smoke-cljs-azure-parity.mjs +++ b/scripts/smoke-cljs-azure-parity.mjs @@ -160,7 +160,8 @@ try { word_boundaries: item.wordTimings, visemes: item.visemes, }); - const actual = normalizeTimeline(plan.vocalTimeline.visemes); + const actualEvents = plan.vocalTimeline.visemes; + const actual = normalizeTimeline(actualEvents); tts.dispose(); @@ -173,6 +174,22 @@ try { ].join('\n'), ); } + + const debug = actualEvents.find((event) => event.debug?.provider === 'azure')?.debug; + if (!debug || + typeof debug.providerId !== 'number' || + typeof debug.providerTimeMs !== 'number' || + typeof debug.canonicalVisemeId !== 'number' || + debug.morphTargetKey !== String(debug.canonicalVisemeId)) { + throw new Error(`Azure debug metadata missing for "${item.name}": ${JSON.stringify(actualEvents)}`); + } + + if (item.name === 'long-e, f/v, and final th') { + const thDebug = actualEvents.find((event) => event.debug?.providerId === 19)?.debug; + if (thDebug?.word !== 'growth' || thDebug.canonicalVisemeId !== 13 || thDebug.refined !== true) { + throw new Error(`Azure TH refinement debug missing for "${item.name}": ${JSON.stringify(actualEvents)}`); + } + } } } finally { await rm(tempDir, { recursive: true, force: true }); diff --git a/scripts/smoke-cljs-blink.mjs b/scripts/smoke-cljs-blink.mjs index 133192b..8633c36 100644 --- a/scripts/smoke-cljs-blink.mjs +++ b/scripts/smoke-cljs-blink.mjs @@ -560,7 +560,12 @@ if (!vocalEffects.some((effect) => effect.op === 'pause') || !vocalEffects.some( } const azureName = vocal.processVisemeEvents( - [{ visemeId: 1, offsetMs: 0, durationMs: 120 }], + [{ + visemeId: 1, + offsetMs: 0, + durationMs: 120, + debug: { provider: 'azure', providerId: 1, canonicalVisemeId: 1, morphTargetKey: '1' }, + }], 'azure_vocal_test', ); if (azureName !== 'azure_vocal_test') { @@ -571,6 +576,15 @@ if (vocalScheduled.length !== 2 || vocalRemoved[0] !== vocalName) { throw new Error(`Expected starting Azure vocal to replace previous sentence, scheduled=${vocalScheduled.length}, removed=${vocalRemoved.join(', ')}`); } +const azureVocalDebug = vocalScheduled[1].snippet.visemeDebug?.[0]; +if (azureVocalDebug?.provider !== 'azure' || + azureVocalDebug.morphTargetKey !== '1' || + typeof azureVocalDebug.jawValue !== 'number' || + typeof azureVocalDebug.totalLipActivation !== 'number' || + typeof azureVocalDebug.activeMorphValue !== 'number') { + throw new Error(`Expected vocal snippet to expose Azure activation debug, received ${JSON.stringify(vocalScheduled[1].snippet.visemeDebug)}`); +} + vocal.stopSentence(); if (!vocalRemoved.includes('azure_vocal_test')) { throw new Error(`Expected vocal stopSentence to remove active Azure snippet, removed ${vocalRemoved.join(', ')}`); @@ -655,6 +669,10 @@ const lipSyncAzureName = lipSync.processAzureVisemes( { visemeId: 4, time: 0.12 }, ], 240, + { + wordTimings: [{ word: 'ah', start: 0, end: 0.24 }], + visualLeadMs: 20, + }, ); if (!lipSyncAzureName?.startsWith('azure_lipsync_')) { @@ -665,6 +683,15 @@ if (lipSyncScheduled.length !== 2 || lipSyncScheduled[1].snippet.snippetIntensit throw new Error(`Expected CLJS lipsync Azure scheduling with configured intensity, received ${JSON.stringify(lipSyncScheduled[1])}`); } +const lipSyncAzureEvent = lipSyncEvents.find((event) => event.type === 'AZURE_SCHEDULED'); +const lipSyncAzureDebug = lipSyncAzureEvent?.debugTimeline?.[0]; +if (lipSyncAzureDebug?.provider !== 'azure' || + lipSyncAzureDebug.word !== 'ah' || + lipSyncAzureDebug.visualLeadMs !== 20 || + lipSyncAzureDebug.morphTargetKey !== String(lipSyncAzureDebug.canonicalVisemeId)) { + throw new Error(`Expected CLJS lipsync Azure debug timeline, received ${JSON.stringify(lipSyncAzureEvent)}`); +} + const neutralName = lipSync.endSpeech(); if (!neutralName?.startsWith('neutral_')) { throw new Error(`Expected CLJS lipsync neutral return name, received ${neutralName}`); @@ -766,6 +793,15 @@ if (!(ledViseme?.offsetMs < rawTimelineViseme?.offsetMs)) { throw new Error(`Expected Azure visual lead in vocal timeline, led=${JSON.stringify(ledViseme)}, raw=${JSON.stringify(rawTimelineViseme)}`); } +const refinedThDebug = azurePlan.vocalTimeline.visemes.find((event) => event.debug?.providerId === 19)?.debug; +if (refinedThDebug?.provider !== 'azure' || + refinedThDebug.word !== 'growth' || + refinedThDebug.canonicalVisemeId !== 13 || + refinedThDebug.refined !== true || + refinedThDebug.visualLeadMs !== 35) { + throw new Error(`Expected Azure TH refinement debug data, received ${JSON.stringify(azurePlan.vocalTimeline.visemes)}`); +} + tts.playbackStarted(utteranceId); tts.processWordBoundary('we', 0.05, utteranceId); tts.finishSpeech(utteranceId); diff --git a/src-cljs/latticework/lipsync.cljs b/src-cljs/latticework/lipsync.cljs index 8cff3ce..2325dfb 100644 --- a/src-cljs/latticework/lipsync.cljs +++ b/src-cljs/latticework/lipsync.cljs @@ -383,6 +383,10 @@ (sort-by :time) vec)) +(defn- word-label [word] + (when-let [text (:word word)] + (str text))) + (defn- azure-duration-ms [current next-event total-duration-ms] (let [offset-ms (max 0 (.round js/Math (* (:time current) 1000))) remaining-ms (if (finite-number? total-duration-ms) @@ -404,29 +408,46 @@ (+ raw-span-ms overlap-ms))] (clamp-duration desired-ms 1 max-ms remaining-ms))))) -(defn- push-azure-event [timeline provider-id canonical-id offset-ms duration-ms visual-lead-ms] - (if (or (= provider-id 0) (<= duration-ms 0)) - timeline - (if (and (get diphthong-targets provider-id) - (>= duration-ms diphthong-min-duration-ms)) - (let [[first-viseme second-viseme] (get diphthong-targets provider-id) - second-offset (min (+ offset-ms duration-ms (- diphthong-secondary-min-ms)) - (+ offset-ms (* duration-ms 0.55))) - first-duration (max diphthong-secondary-min-ms - (min duration-ms - (+ (- second-offset offset-ms) (* duration-ms 0.25)))) - second-duration (max diphthong-secondary-min-ms - (- (+ offset-ms duration-ms) second-offset))] - (conj timeline - {:visemeId first-viseme - :offsetMs (.round js/Math (apply-visual-lead offset-ms visual-lead-ms)) - :durationMs (.round js/Math first-duration)} - {:visemeId second-viseme - :offsetMs (.round js/Math (apply-visual-lead second-offset visual-lead-ms)) - :durationMs (.round js/Math second-duration)})) - (conj timeline {:visemeId canonical-id - :offsetMs (.round js/Math (apply-visual-lead offset-ms visual-lead-ms)) - :durationMs (.round js/Math duration-ms)})))) +(defn- azure-debug [event canonical-id offset-ms duration-ms visual-lead-ms segment] + (cond-> {:provider "azure" + :providerId (:providerId event) + :providerTimeMs (:timeMs event) + :audioOffsetMs (.round js/Math offset-ms) + :visualOffsetMs (.round js/Math (apply-visual-lead offset-ms visual-lead-ms)) + :durationMs (.round js/Math duration-ms) + :baseCanonicalVisemeId (:baseCanonicalId event) + :canonicalVisemeId canonical-id + :morphTargetKey (str canonical-id) + :segment segment} + (:word event) (assoc :word (:word event)) + (not= (:baseCanonicalId event) canonical-id) (assoc :refined true) + (and (finite-number? visual-lead-ms) (pos? visual-lead-ms)) (assoc :visualLeadMs visual-lead-ms))) + +(defn- azure-viseme-event [event canonical-id offset-ms duration-ms visual-lead-ms segment] + {:visemeId canonical-id + :offsetMs (.round js/Math (apply-visual-lead offset-ms visual-lead-ms)) + :durationMs (.round js/Math duration-ms) + :debug (azure-debug event canonical-id offset-ms duration-ms visual-lead-ms segment)}) + +(defn- push-azure-event [timeline event offset-ms duration-ms visual-lead-ms] + (let [provider-id (:providerId event) + canonical-id (:canonicalId event)] + (if (or (= provider-id 0) (<= duration-ms 0)) + timeline + (if (and (get diphthong-targets provider-id) + (>= duration-ms diphthong-min-duration-ms)) + (let [[first-viseme second-viseme] (get diphthong-targets provider-id) + second-offset (min (+ offset-ms duration-ms (- diphthong-secondary-min-ms)) + (+ offset-ms (* duration-ms 0.55))) + first-duration (max diphthong-secondary-min-ms + (min duration-ms + (+ (- second-offset offset-ms) (* duration-ms 0.25)))) + second-duration (max diphthong-secondary-min-ms + (- (+ offset-ms duration-ms) second-offset))] + (conj timeline + (azure-viseme-event event first-viseme offset-ms first-duration visual-lead-ms "diphthong-primary") + (azure-viseme-event event second-viseme second-offset second-duration visual-lead-ms "diphthong-secondary"))) + (conj timeline (azure-viseme-event event canonical-id offset-ms duration-ms visual-lead-ms "single")))))) (defn- mapped-azure-events [events options] (loop [remaining (normalize-azure-visemes events) @@ -436,11 +457,12 @@ (let [event (first remaining) provider-id (:providerId event) base-canonical (get azure-to-canonical provider-id 2) + word (find-word-at-time (:time event) (:wordTimings options)) canonical-id (refine-azure-viseme provider-id base-canonical (:time event) - (find-word-at-time (:time event) (:wordTimings options))) + word) time-ms (* (:time event) 1000) previous (last mapped)] (if (nil? canonical-id) @@ -451,7 +473,10 @@ (recur (rest remaining) mapped) (recur (rest remaining) (conj mapped (assoc event + :timeMs (.round js/Math time-ms) + :baseCanonicalId base-canonical :canonicalId canonical-id + :word (word-label word) :className (azure-viseme-class provider-id canonical-id)))))))))) (defn azure-visemes-to-timeline @@ -465,12 +490,10 @@ (vec (sort-by :offsetMs timeline)) (let [event (nth events index) next-event (when (< (inc index) (count events)) (nth events (inc index))) - provider-id (:providerId event) - canonical-id (:canonicalId event) offset-ms (max 0 (.round js/Math (* (:time event) 1000))) duration-ms (azure-duration-ms event next-event total-duration-ms)] (recur (inc index) - (push-azure-event timeline provider-id canonical-id offset-ms duration-ms visual-lead-ms)))))))) + (push-azure-event timeline event offset-ms duration-ms visual-lead-ms)))))))) (defn process-azure-visemes! [state events total-duration-ms snippet-name options] (let [timeline (azure-visemes-to-timeline events total-duration-ms options) @@ -493,7 +516,8 @@ (schedule-snippet-outputs snippet cleanup-ms) [(event-output {:type "AZURE_SCHEDULED" :snippetName snippet-name - :eventCount (count events)}) + :eventCount (count events) + :debugTimeline (mapv :debug timeline)}) (state-output state)]))))))) (defn end-speech! [state] diff --git a/src-cljs/latticework/vocal.cljs b/src-cljs/latticework/vocal.cljs index 1c15341..dafc133 100644 --- a/src-cljs/latticework/vocal.cljs +++ b/src-cljs/latticework/vocal.cljs @@ -701,9 +701,55 @@ (recur (inc i) jaw-curve)))))) (defn- normalize-viseme-event [event] - {:visemeId (int (clamp 0 14 (:visemeId event))) - :offsetMs (max 0 (number-or (:offsetMs event) 0)) - :durationMs (max 0 (number-or (:durationMs event) 0))}) + (cond-> {:visemeId (int (clamp 0 14 (:visemeId event))) + :offsetMs (max 0 (number-or (:offsetMs event) 0)) + :durationMs (max 0 (number-or (:durationMs event) 0))} + (:debug event) (assoc :debug (:debug event)))) + +(defn- curve-value-at [curve time-sec] + (let [points (vec (sort-by :time (or curve [])))] + (cond + (empty? points) 0 + (<= time-sec (:time (first points))) (number-or (:intensity (first points)) 0) + (>= time-sec (:time (last points))) (number-or (:intensity (last points)) 0) + :else + (loop [previous (first points) + remaining (rest points)] + (let [current (first remaining)] + (if (or (nil? current) (<= time-sec (:time current))) + (let [start-time (number-or (:time previous) 0) + end-time (number-or (:time current) start-time) + start-intensity (number-or (:intensity previous) 0) + end-intensity (number-or (:intensity current) start-intensity) + span (max 0.0001 (- end-time start-time)) + progress (protocol/clamp 0 1 (/ (- time-sec start-time) span))] + (+ start-intensity (* (- end-intensity start-intensity) progress))) + (recur current (rest remaining)))))))) + +(defn- lip-curve-key? [key] + (when (re-matches #"\d+" key) + (<= 0 (js/parseInt key 10) 14))) + +(defn- total-lip-activation-at [curves time-sec] + (reduce-kv + (fn [total key curve] + (if (lip-curve-key? key) + (+ total (curve-value-at curve time-sec)) + total)) + 0 + (or curves {}))) + +(defn- viseme-debug-summary [event curves] + (let [viseme-id (:visemeId event) + sample-time-sec (/ (+ (:offsetMs event) (/ (:durationMs event) 2)) 1000) + morph-target-key (str viseme-id)] + (merge (:debug event) + {:visemeId viseme-id + :morphTargetKey morph-target-key + :sampleTimeSec sample-time-sec + :jawValue (curve-value-at (get curves jaw-au) sample-time-sec) + :totalLipActivation (total-lip-activation-at curves sample-time-sec) + :activeMorphValue (curve-value-at (get curves morph-target-key) sample-time-sec)}))) (defn build-vocal-snippet ([events] (build-vocal-snippet events nil nil)) @@ -720,6 +766,7 @@ :snippetPlaybackRate 1.0 :snippetIntensityScale 1.0 :snippetJawScale jaw-scale + :visemeDebug [] :loop false :maxTime 0 :curves {}} @@ -755,6 +802,7 @@ :snippetIntensityScale 1.0 :snippetJawScale jaw-scale :autoVisemeJaw false + :visemeDebug (mapv #(viseme-debug-summary % articulated-curves) events) :loop false :maxTime max-time :curves articulated-curves}))))) diff --git a/types/cljs.d.ts b/types/cljs.d.ts index ef20d05..02f8960 100644 --- a/types/cljs.d.ts +++ b/types/cljs.d.ts @@ -440,10 +440,34 @@ export interface VocalConfig { priority?: number; } +export interface VocalVisemeDebug { + provider?: string; + providerId?: number; + providerTimeMs?: number; + audioOffsetMs?: number; + visualOffsetMs?: number; + durationMs?: number; + baseCanonicalVisemeId?: number; + canonicalVisemeId?: number; + visemeId?: number; + word?: string; + morphTargetKey?: string; + segment?: string; + refined?: boolean; + visualLeadMs?: number; + sampleTimeSec?: number; + jawValue?: number; + totalLipActivation?: number; + activeMorphValue?: number; + [key: string]: unknown; +} + export interface VocalVisemeEvent { visemeId: number; offsetMs: number; durationMs: number; + debug?: VocalVisemeDebug; + [key: string]: unknown; } export interface VocalWordTiming { @@ -470,6 +494,7 @@ export interface VocalSnippet { snippetIntensityScale: number; snippetJawScale?: number; autoVisemeJaw?: boolean; + visemeDebug?: VocalVisemeDebug[]; loop: boolean; maxTime: number; curves: Record; @@ -576,6 +601,7 @@ export interface LipSyncEvent { wordIndex?: number; snippetName?: string; eventCount?: number; + debugTimeline?: VocalVisemeDebug[]; [key: string]: unknown; }