Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion scripts/smoke-cljs-azure-parity.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,8 @@ try {
word_boundaries: item.wordTimings,
visemes: item.visemes,
});
const actual = normalizeTimeline(plan.vocalTimeline.visemes);
const actualEvents = plan.vocalTimeline.visemes;
const actual = normalizeTimeline(actualEvents);

tts.dispose();

Expand All @@ -173,6 +174,22 @@ try {
].join('\n'),
);
}

const debug = actualEvents.find((event) => event.debug?.provider === 'azure')?.debug;
if (!debug ||
typeof debug.providerId !== 'number' ||
typeof debug.providerTimeMs !== 'number' ||
typeof debug.canonicalVisemeId !== 'number' ||
debug.morphTargetKey !== String(debug.canonicalVisemeId)) {
throw new Error(`Azure debug metadata missing for "${item.name}": ${JSON.stringify(actualEvents)}`);
}

if (item.name === 'long-e, f/v, and final th') {
const thDebug = actualEvents.find((event) => event.debug?.providerId === 19)?.debug;
if (thDebug?.word !== 'growth' || thDebug.canonicalVisemeId !== 13 || thDebug.refined !== true) {
throw new Error(`Azure TH refinement debug missing for "${item.name}": ${JSON.stringify(actualEvents)}`);
}
}
}
} finally {
await rm(tempDir, { recursive: true, force: true });
Expand Down
38 changes: 37 additions & 1 deletion scripts/smoke-cljs-blink.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,12 @@ if (!vocalEffects.some((effect) => effect.op === 'pause') || !vocalEffects.some(
}

const azureName = vocal.processVisemeEvents(
[{ visemeId: 1, offsetMs: 0, durationMs: 120 }],
[{
visemeId: 1,
offsetMs: 0,
durationMs: 120,
debug: { provider: 'azure', providerId: 1, canonicalVisemeId: 1, morphTargetKey: '1' },
}],
'azure_vocal_test',
);
if (azureName !== 'azure_vocal_test') {
Expand All @@ -571,6 +576,15 @@ if (vocalScheduled.length !== 2 || vocalRemoved[0] !== vocalName) {
throw new Error(`Expected starting Azure vocal to replace previous sentence, scheduled=${vocalScheduled.length}, removed=${vocalRemoved.join(', ')}`);
}

// The Azure snippet (second scheduled entry) must expose per-viseme debug data:
// the provider tag, the morph target key, and three numeric activation samples.
const azureVocalDebug = vocalScheduled[1].snippet.visemeDebug?.[0];
if (!(azureVocalDebug?.provider === 'azure' &&
  azureVocalDebug.morphTargetKey === '1' &&
  typeof azureVocalDebug.jawValue === 'number' &&
  typeof azureVocalDebug.totalLipActivation === 'number' &&
  typeof azureVocalDebug.activeMorphValue === 'number')) {
  throw new Error(`Expected vocal snippet to expose Azure activation debug, received ${JSON.stringify(vocalScheduled[1].snippet.visemeDebug)}`);
}

vocal.stopSentence();
if (!vocalRemoved.includes('azure_vocal_test')) {
throw new Error(`Expected vocal stopSentence to remove active Azure snippet, removed ${vocalRemoved.join(', ')}`);
Expand Down Expand Up @@ -655,6 +669,10 @@ const lipSyncAzureName = lipSync.processAzureVisemes(
{ visemeId: 4, time: 0.12 },
],
240,
{
wordTimings: [{ word: 'ah', start: 0, end: 0.24 }],
visualLeadMs: 20,
},
);

if (!lipSyncAzureName?.startsWith('azure_lipsync_')) {
Expand All @@ -665,6 +683,15 @@ if (lipSyncScheduled.length !== 2 || lipSyncScheduled[1].snippet.snippetIntensit
throw new Error(`Expected CLJS lipsync Azure scheduling with configured intensity, received ${JSON.stringify(lipSyncScheduled[1])}`);
}

// The AZURE_SCHEDULED event must carry a debug timeline whose first entry names
// the provider, the matched word, the configured visual lead, and a morph target
// key that is the canonical viseme id rendered as a string.
const lipSyncAzureEvent = lipSyncEvents.find((event) => event.type === 'AZURE_SCHEDULED');
const lipSyncAzureDebug = lipSyncAzureEvent?.debugTimeline?.[0];
if (!(lipSyncAzureDebug?.provider === 'azure' &&
  lipSyncAzureDebug.word === 'ah' &&
  lipSyncAzureDebug.visualLeadMs === 20 &&
  lipSyncAzureDebug.morphTargetKey === String(lipSyncAzureDebug.canonicalVisemeId))) {
  throw new Error(`Expected CLJS lipsync Azure debug timeline, received ${JSON.stringify(lipSyncAzureEvent)}`);
}

const neutralName = lipSync.endSpeech();
if (!neutralName?.startsWith('neutral_')) {
throw new Error(`Expected CLJS lipsync neutral return name, received ${neutralName}`);
Expand Down Expand Up @@ -766,6 +793,15 @@ if (!(ledViseme?.offsetMs < rawTimelineViseme?.offsetMs)) {
throw new Error(`Expected Azure visual lead in vocal timeline, led=${JSON.stringify(ledViseme)}, raw=${JSON.stringify(rawTimelineViseme)}`);
}

// Locate the debug record of the first timeline viseme that came from provider
// id 19 and check that it was attributed to "growth", remapped to canonical
// viseme 13, flagged as refined, and stamped with a 35 ms visual lead.
const refinedThDebug = azurePlan.vocalTimeline.visemes.find((event) => event.debug?.providerId === 19)?.debug;
if (!(refinedThDebug?.provider === 'azure' &&
  refinedThDebug.word === 'growth' &&
  refinedThDebug.canonicalVisemeId === 13 &&
  refinedThDebug.refined === true &&
  refinedThDebug.visualLeadMs === 35)) {
  throw new Error(`Expected Azure TH refinement debug data, received ${JSON.stringify(azurePlan.vocalTimeline.visemes)}`);
}

tts.playbackStarted(utteranceId);
tts.processWordBoundary('we', 0.05, utteranceId);
tts.finishSpeech(utteranceId);
Expand Down
80 changes: 52 additions & 28 deletions src-cljs/latticework/lipsync.cljs
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,10 @@
(sort-by :time)
vec))

(defn- word-label
  "Returns the `:word` entry of `word` coerced to a string, or nil when `word`
  is nil or its `:word` entry is nil/false."
  [word]
  (let [label (get word :word)]
    (if label
      (str label)
      nil)))

(defn- azure-duration-ms [current next-event total-duration-ms]
(let [offset-ms (max 0 (.round js/Math (* (:time current) 1000)))
remaining-ms (if (finite-number? total-duration-ms)
Expand All @@ -404,29 +408,46 @@
(+ raw-span-ms overlap-ms))]
(clamp-duration desired-ms 1 max-ms remaining-ms)))))

;; Appends the timeline entries for one Azure viseme to `timeline` and returns
;; the resulting collection. Each entry is a map with :visemeId, :offsetMs and
;; :durationMs; offsets go through `apply-visual-lead` and every number is
;; rounded with Math.round.
(defn- push-azure-event [timeline provider-id canonical-id offset-ms duration-ms visual-lead-ms]
;; Provider id 0 and non-positive durations contribute nothing.
(if (or (= provider-id 0) (<= duration-ms 0))
timeline
;; Provider ids listed in `diphthong-targets` are emitted as two visemes, but
;; only when the span is at least `diphthong-min-duration-ms` long.
(if (and (get diphthong-targets provider-id)
(>= duration-ms diphthong-min-duration-ms))
(let [[first-viseme second-viseme] (get diphthong-targets provider-id)
;; The second viseme starts at 55% of the span, or earlier if that would
;; leave it less than `diphthong-secondary-min-ms` before the span ends.
second-offset (min (+ offset-ms duration-ms (- diphthong-secondary-min-ms))
(+ offset-ms (* duration-ms 0.55)))
;; The first viseme runs past the second one's start by 25% of the span,
;; capped at the whole span and floored at `diphthong-secondary-min-ms`.
first-duration (max diphthong-secondary-min-ms
(min duration-ms
(+ (- second-offset offset-ms) (* duration-ms 0.25))))
;; The second viseme lasts until the end of the span, with the same floor.
second-duration (max diphthong-secondary-min-ms
(- (+ offset-ms duration-ms) second-offset))]
(conj timeline
{:visemeId first-viseme
:offsetMs (.round js/Math (apply-visual-lead offset-ms visual-lead-ms))
:durationMs (.round js/Math first-duration)}
{:visemeId second-viseme
:offsetMs (.round js/Math (apply-visual-lead second-offset visual-lead-ms))
:durationMs (.round js/Math second-duration)}))
;; Everything else becomes a single entry using the supplied canonical id.
(conj timeline {:visemeId canonical-id
:offsetMs (.round js/Math (apply-visual-lead offset-ms visual-lead-ms))
:durationMs (.round js/Math duration-ms)}))))
(defn- azure-debug
  "Builds the debug map attached to one emitted Azure timeline entry.

  Always present: the provider name, the provider id and time copied from
  `event`, the rounded audio offset, the rounded offset after
  `apply-visual-lead`, the rounded duration, the event's base canonical id,
  the `canonical-id` actually emitted, that id as a string under
  `:morphTargetKey`, and the `segment` label.

  Conditionally present: `:word` when the event carries one, `:refined` when
  `canonical-id` differs from the event's `:baseCanonicalId`, and
  `:visualLeadMs` when the lead is a positive finite number.

  NOTE(review): `push-azure-event` passes the per-segment viseme as
  `canonical-id` for diphthong splits, so `:refined` may also be set for
  those entries - confirm that is the intended meaning."
  [event canonical-id offset-ms duration-ms visual-lead-ms segment]
  (let [word              (:word event)
        base-canonical-id (:baseCanonicalId event)
        led-offset-ms     (apply-visual-lead offset-ms visual-lead-ms)
        lead-applied?     (and (finite-number? visual-lead-ms)
                               (pos? visual-lead-ms))
        always            {:provider "azure"
                           :providerId (:providerId event)
                           :providerTimeMs (:timeMs event)
                           :audioOffsetMs (js/Math.round offset-ms)
                           :visualOffsetMs (js/Math.round led-offset-ms)
                           :durationMs (js/Math.round duration-ms)
                           :baseCanonicalVisemeId base-canonical-id
                           :canonicalVisemeId canonical-id
                           :morphTargetKey (str canonical-id)
                           :segment segment}]
    (merge always
           (when word {:word word})
           (when (not= base-canonical-id canonical-id) {:refined true})
           (when lead-applied? {:visualLeadMs visual-lead-ms}))))

(defn- azure-viseme-event
  "Returns one timeline entry for `canonical-id`: the rounded offset after the
  visual lead has been applied, the rounded duration, and the matching
  `azure-debug` map under `:debug`."
  [event canonical-id offset-ms duration-ms visual-lead-ms segment]
  (let [led-offset-ms (apply-visual-lead offset-ms visual-lead-ms)
        debug-info    (azure-debug event canonical-id offset-ms duration-ms visual-lead-ms segment)]
    (-> {:visemeId canonical-id}
        (assoc :offsetMs (js/Math.round led-offset-ms))
        (assoc :durationMs (js/Math.round duration-ms))
        (assoc :debug debug-info))))

(defn- push-azure-event
  "Appends the timeline entries for one mapped Azure `event` to `timeline`.

  * Provider id 0 or a non-positive duration: `timeline` is returned as is.
  * Provider id present in `diphthong-targets` and a span of at least
    `diphthong-min-duration-ms`: two entries are appended, labelled
    \"diphthong-primary\" and \"diphthong-secondary\".
  * Otherwise: one entry for the event's `:canonicalId`, labelled \"single\"."
  [timeline event offset-ms duration-ms visual-lead-ms]
  (let [provider-id (:providerId event)
        diphthong   (get diphthong-targets provider-id)
        emit        (fn [viseme-id at-ms for-ms segment]
                      (azure-viseme-event event viseme-id at-ms for-ms visual-lead-ms segment))]
    (cond
      (or (= provider-id 0) (<= duration-ms 0))
      timeline

      (and diphthong (>= duration-ms diphthong-min-duration-ms))
      (let [[primary-id secondary-id] diphthong
            end-ms             (+ offset-ms duration-ms)
            ;; Secondary viseme starts at 55% of the span, pulled earlier when
            ;; needed so the minimum secondary length still fits before the end.
            secondary-offset   (min (+ end-ms (- diphthong-secondary-min-ms))
                                    (+ offset-ms (* duration-ms 0.55)))
            ;; Primary viseme overlaps the secondary by 25% of the span.
            primary-duration   (max diphthong-secondary-min-ms
                                    (min duration-ms
                                         (+ (- secondary-offset offset-ms)
                                            (* duration-ms 0.25))))
            secondary-duration (max diphthong-secondary-min-ms
                                    (- end-ms secondary-offset))]
        (-> timeline
            (conj (emit primary-id offset-ms primary-duration "diphthong-primary"))
            (conj (emit secondary-id secondary-offset secondary-duration "diphthong-secondary"))))

      :else
      (conj timeline (emit (:canonicalId event) offset-ms duration-ms "single")))))

(defn- mapped-azure-events [events options]
(loop [remaining (normalize-azure-visemes events)
Expand All @@ -436,11 +457,12 @@
(let [event (first remaining)
provider-id (:providerId event)
base-canonical (get azure-to-canonical provider-id 2)
word (find-word-at-time (:time event) (:wordTimings options))
canonical-id (refine-azure-viseme
provider-id
base-canonical
(:time event)
(find-word-at-time (:time event) (:wordTimings options)))
word)
time-ms (* (:time event) 1000)
previous (last mapped)]
(if (nil? canonical-id)
Expand All @@ -451,7 +473,10 @@
(recur (rest remaining) mapped)
(recur (rest remaining)
(conj mapped (assoc event
:timeMs (.round js/Math time-ms)
:baseCanonicalId base-canonical
:canonicalId canonical-id
:word (word-label word)
:className (azure-viseme-class provider-id canonical-id))))))))))

(defn azure-visemes-to-timeline
Expand All @@ -465,12 +490,10 @@
(vec (sort-by :offsetMs timeline))
(let [event (nth events index)
next-event (when (< (inc index) (count events)) (nth events (inc index)))
provider-id (:providerId event)
canonical-id (:canonicalId event)
offset-ms (max 0 (.round js/Math (* (:time event) 1000)))
duration-ms (azure-duration-ms event next-event total-duration-ms)]
(recur (inc index)
(push-azure-event timeline provider-id canonical-id offset-ms duration-ms visual-lead-ms))))))))
(push-azure-event timeline event offset-ms duration-ms visual-lead-ms))))))))

(defn process-azure-visemes! [state events total-duration-ms snippet-name options]
(let [timeline (azure-visemes-to-timeline events total-duration-ms options)
Expand All @@ -493,7 +516,8 @@
(schedule-snippet-outputs snippet cleanup-ms)
[(event-output {:type "AZURE_SCHEDULED"
:snippetName snippet-name
:eventCount (count events)})
:eventCount (count events)
:debugTimeline (mapv :debug timeline)})
(state-output state)])))))))

(defn end-speech! [state]
Expand Down
54 changes: 51 additions & 3 deletions src-cljs/latticework/vocal.cljs
Original file line number Diff line number Diff line change
Expand Up @@ -701,9 +701,55 @@
(recur (inc i) jaw-curve))))))

(defn- normalize-viseme-event [event]
{:visemeId (int (clamp 0 14 (:visemeId event)))
:offsetMs (max 0 (number-or (:offsetMs event) 0))
:durationMs (max 0 (number-or (:durationMs event) 0))})
(cond-> {:visemeId (int (clamp 0 14 (:visemeId event)))
:offsetMs (max 0 (number-or (:offsetMs event) 0))
:durationMs (max 0 (number-or (:durationMs event) 0))}
(:debug event) (assoc :debug (:debug event))))

(defn- curve-value-at
  "Samples `curve` (a collection of maps with `:time` and `:intensity`) at
  `time-sec`.

  The points are sorted by `:time` first. An empty curve yields 0; a time at
  or before the first point yields the first intensity; a time at or after
  the last point yields the last intensity. In between, the value is linearly
  interpolated between the two neighbouring points. Non-numeric intensities
  fall back through `number-or`."
  [curve time-sec]
  (let [points (vec (sort-by :time (or curve [])))
        head   (first points)
        tail   (last points)]
    (cond
      (empty? points)
      0

      (<= time-sec (:time head))
      (number-or (:intensity head) 0)

      (>= time-sec (:time tail))
      (number-or (:intensity tail) 0)

      :else
      (let [;; Consecutive pairs, plus a final [last nil] pair that acts as the
            ;; fallback when no later point bounds `time-sec`.
            segments  (concat (partition 2 1 points) [[tail nil]])
            bounds?   (fn [[_ to]]
                        (or (nil? to) (<= time-sec (:time to))))
            [from to] (first (filter bounds? segments))
            t0        (number-or (:time from) 0)
            t1        (number-or (:time to) t0)
            v0        (number-or (:intensity from) 0)
            v1        (number-or (:intensity to) v0)
            ;; Guard against zero-length segments (duplicate timestamps).
            span      (max 0.0001 (- t1 t0))
            progress  (protocol/clamp 0 1 (/ (- time-sec t0) span))]
        (+ v0 (* (- v1 v0) progress))))))

(defn- lip-curve-key?
  "Returns true when `key` is a string made only of decimal digits whose
  value lies between 0 and 14 inclusive, and false otherwise.

  Non-string keys (keywords, numbers, nil) yield false: `re-matches` throws a
  TypeError when given anything but a string, and this predicate is applied
  to every key of a curves map, not only to viseme keys."
  [key]
  (boolean
   (and (string? key)
        (re-matches #"\d+" key)
        (<= 0 (js/parseInt key 10) 14))))

(defn- total-lip-activation-at
  "Sums, at `time-sec`, the sampled value of every curve in `curves` whose key
  satisfies `lip-curve-key?`. Other curves are ignored; a nil `curves` sums
  to 0."
  [curves time-sec]
  (letfn [(add-lip-curve [sum curve-key points]
            (cond-> sum
              (lip-curve-key? curve-key) (+ (curve-value-at points time-sec))))]
    (reduce-kv add-lip-curve 0 (or curves {}))))

(defn- viseme-debug-summary
  "Returns the event's own `:debug` map extended with activation data sampled
  from `curves` at the midpoint of the event: the viseme id, its morph target
  key (the id as a string), the sample time in seconds, the jaw curve value,
  the summed lip-curve activation, and the value of the viseme's own curve.
  Keys computed here take precedence over same-named keys in `:debug`."
  [event curves]
  (let [viseme-id   (:visemeId event)
        target-key  (str viseme-id)
        half-length (/ (:durationMs event) 2)
        midpoint-s  (/ (+ (:offsetMs event) half-length) 1000)
        sampled     {:visemeId viseme-id
                     :morphTargetKey target-key
                     :sampleTimeSec midpoint-s
                     :jawValue (curve-value-at (get curves jaw-au) midpoint-s)
                     :totalLipActivation (total-lip-activation-at curves midpoint-s)
                     :activeMorphValue (curve-value-at (get curves target-key) midpoint-s)}]
    (merge (:debug event) sampled)))

(defn build-vocal-snippet
([events] (build-vocal-snippet events nil nil))
Expand All @@ -720,6 +766,7 @@
:snippetPlaybackRate 1.0
:snippetIntensityScale 1.0
:snippetJawScale jaw-scale
:visemeDebug []
:loop false
:maxTime 0
:curves {}}
Expand Down Expand Up @@ -755,6 +802,7 @@
:snippetIntensityScale 1.0
:snippetJawScale jaw-scale
:autoVisemeJaw false
:visemeDebug (mapv #(viseme-debug-summary % articulated-curves) events)
:loop false
:maxTime max-time
:curves articulated-curves})))))
Expand Down
26 changes: 26 additions & 0 deletions types/cljs.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -440,10 +440,34 @@ export interface VocalConfig {
priority?: number;
}

/**
 * Diagnostic metadata describing how a single viseme event was produced and
 * how strongly it is expressed in the generated animation curves.
 *
 * Every field is optional: the lipsync module fills in the provider-related
 * fields when it maps provider visemes onto the timeline, and the vocal module
 * adds the sampled activation fields when it builds a snippet.
 */
export interface VocalVisemeDebug {
/** Name of the viseme source, e.g. `"azure"`. */
provider?: string;
/** Viseme id as reported by the provider, before mapping. */
providerId?: number;
/** Provider event time in milliseconds, rounded to an integer. */
providerTimeMs?: number;
/** Rounded event offset in milliseconds before any visual lead is applied. */
audioOffsetMs?: number;
/** Rounded event offset in milliseconds after the visual lead is applied. */
visualOffsetMs?: number;
/** Rounded duration of the emitted event in milliseconds. */
durationMs?: number;
/** Canonical viseme id looked up for the provider id, before refinement. */
baseCanonicalVisemeId?: number;
/** Canonical viseme id actually emitted for this event. */
canonicalVisemeId?: number;
/** Viseme id of the event the activation samples were taken for. */
visemeId?: number;
/** Text of the word whose timing covers the event, when one was found. */
word?: string;
/** Curve / morph target key for the viseme: the viseme id rendered as a string. */
morphTargetKey?: string;
/** How the event was emitted: `"single"`, `"diphthong-primary"` or `"diphthong-secondary"`. */
segment?: string;
/** Present and `true` when the emitted canonical id differs from the base canonical id. */
refined?: boolean;
/** Visual lead in milliseconds; only present when it is a positive finite number. */
visualLeadMs?: number;
/** Midpoint of the event in seconds, where the activation values were sampled. */
sampleTimeSec?: number;
/** Value of the jaw curve at `sampleTimeSec`. */
jawValue?: number;
/** Sum of the values of all viseme curves (keys `"0"` to `"14"`) at `sampleTimeSec`. */
totalLipActivation?: number;
/** Value of this viseme's own curve at `sampleTimeSec`. */
activeMorphValue?: number;
/** Additional, untyped diagnostic fields. */
[key: string]: unknown;
}

/**
 * One viseme on a vocal timeline.
 */
export interface VocalVisemeEvent {
/** Canonical viseme id; the vocal module clamps it into the range 0 to 14. */
visemeId: number;
/** Start of the viseme in milliseconds; negative values are raised to 0. */
offsetMs: number;
/** Length of the viseme in milliseconds; negative values are raised to 0. */
durationMs: number;
/** Optional diagnostic metadata, carried through normalization unchanged. */
debug?: VocalVisemeDebug;
/** Additional, untyped fields. */
[key: string]: unknown;
}

export interface VocalWordTiming {
Expand All @@ -470,6 +494,7 @@ export interface VocalSnippet {
snippetIntensityScale: number;
snippetJawScale?: number;
autoVisemeJaw?: boolean;
visemeDebug?: VocalVisemeDebug[];
loop: boolean;
maxTime: number;
curves: Record<string, AnimationCurvePoint[]>;
Expand Down Expand Up @@ -576,6 +601,7 @@ export interface LipSyncEvent {
wordIndex?: number;
snippetName?: string;
eventCount?: number;
debugTimeline?: VocalVisemeDebug[];
[key: string]: unknown;
}

Expand Down
Loading