diff --git a/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAISpeechToTextClient.cs b/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAISpeechToTextClient.cs index b976f4a562a..4c35ff7a5a8 100644 --- a/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAISpeechToTextClient.cs +++ b/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAISpeechToTextClient.cs @@ -24,10 +24,6 @@ namespace Microsoft.Extensions.AI; [Experimental(DiagnosticIds.Experiments.AISpeechToText, UrlFormat = DiagnosticIds.UrlFormat)] internal sealed class OpenAISpeechToTextClient : ISpeechToTextClient { - /// Filename to use when audio lacks a name. - /// This information internally is required but is only being used to create a header name in the multipart request. - private const string Filename = "audio.mp3"; - /// Metadata about the client. private readonly SpeechToTextClientMetadata _metadata; @@ -64,9 +60,7 @@ public async Task GetTextAsync( SpeechToTextResponse response = new(); - string filename = audioSpeechStream is FileStream fileStream ? - Path.GetFileName(fileStream.Name) : // Use the file name if we can get one from the stream. - Filename; // Otherwise, use a default name; this is only used to create a header name in the multipart request. + string filename = ResolveFilename(audioSpeechStream); if (IsTranslationRequest(options)) { @@ -120,9 +114,7 @@ public async IAsyncEnumerable GetStreamingTextAsync( { _ = Throw.IfNull(audioSpeechStream); - string filename = audioSpeechStream is FileStream fileStream ? - Path.GetFileName(fileStream.Name) : // Use the file name if we can get one from the stream. - Filename; // Otherwise, use a default name; this is only used to create a header name in the multipart request. + string filename = ResolveFilename(audioSpeechStream); if (IsTranslationRequest(options)) { @@ -185,6 +177,84 @@ options is not null && options.TextLanguage is not null && (options.SpeechLanguage is null || options.SpeechLanguage != options.TextLanguage); + /// + /// Resolves the filename to use for the audio stream in the multipart request. + /// Priority: name, then magic-byte detection (seekable streams only), then default. + /// + private static string ResolveFilename(Stream audioSpeechStream) + { + const int FormatDetectionByteCount = 12; + + if (audioSpeechStream is FileStream fileStream) + { + return Path.GetFileName(fileStream.Name); + } + + // For seekable streams, peek at the header to detect audio format, then rewind. + if (audioSpeechStream.CanSeek) + { + byte[] header = new byte[FormatDetectionByteCount]; + int bytesRead = 0; + while (bytesRead < header.Length) + { + int n = audioSpeechStream.Read(header, bytesRead, header.Length - bytesRead); + if (n <= 0) + { + break; + } + + bytesRead += n; + } + + audioSpeechStream.Position -= bytesRead; + return $"audio.{DetectAudioExtension(header.AsSpan(0, bytesRead))}"; + } + + return "audio.mp3"; + } + + /// Detects the audio format extension from the leading bytes of the audio data. + private static string DetectAudioExtension(ReadOnlySpan header) + { + // WAV: "RIFF" at offset 0 and "WAVE" at offset 8. + if (header.Length >= 12 && + header.Slice(0, 4).SequenceEqual("RIFF"u8) && + header.Slice(8, 4).SequenceEqual("WAVE"u8)) + { + return "wav"; + } + + // WebM/Matroska: EBML header ID at offset 0. + if (header.Length >= 4 && + header.Slice(0, 4).SequenceEqual((ReadOnlySpan)[0x1A, 0x45, 0xDF, 0xA3])) + { + return "webm"; + } + + // M4A/MP4: ISO BMFF "ftyp" box type at offset 4. + if (header.Length >= 8 && + header.Slice(4, 4).SequenceEqual("ftyp"u8)) + { + return "m4a"; + } + + // MP3: ID3v2 tag at offset 0. + if (header.Length >= 3 && + header.Slice(0, 3).SequenceEqual("ID3"u8)) + { + return "mp3"; + } + + // MP3: MPEG frame sync word (11 set bits). + if (header.Length >= 2 && + header[0] == 0xFF && (header[1] & 0xE0) == 0xE0) + { + return "mp3"; + } + + return "mp3"; + } + /// Converts an extensions options instance to an OpenAI transcription options instance. private AudioTranscriptionOptions ToOpenAITranscriptionOptions(SpeechToTextOptions? options) { diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Microsoft.Extensions.AI.Integration.Tests.csproj b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Microsoft.Extensions.AI.Integration.Tests.csproj index ed657f5e2b6..0c6f1dba503 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Microsoft.Extensions.AI.Integration.Tests.csproj +++ b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Microsoft.Extensions.AI.Integration.Tests.csproj @@ -20,12 +20,28 @@ + + + + Never + + Never + + + Never + + + Never + + + Never + diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.m4a b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.m4a new file mode 100644 index 00000000000..a082110afce Binary files /dev/null and b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.m4a differ diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.wav b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.wav new file mode 100644 index 00000000000..9b655bb88e1 Binary files /dev/null and b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.wav differ diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.webm b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.webm new file mode 100644 index 00000000000..98309fa200b Binary files /dev/null and b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.webm differ diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001_noid3.mp3 b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001_noid3.mp3 new file mode 100644 index 00000000000..9ef9e9378dc Binary files /dev/null and b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001_noid3.mp3 differ diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/SpeechToTextClientIntegrationTests.cs b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/SpeechToTextClientIntegrationTests.cs index f0ea6c1790e..4d48e67c4da 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/SpeechToTextClientIntegrationTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/SpeechToTextClientIntegrationTests.cs @@ -59,6 +59,23 @@ public virtual async Task GetStreamingTextAsync_SingleStreamingResponseChoice() Assert.Contains("gym", responseText, StringComparison.OrdinalIgnoreCase); } + [ConditionalTheory] + [InlineData("audio001.mp3")] + [InlineData("audio001_noid3.mp3")] + [InlineData("audio001.wav")] + [InlineData("audio001.m4a")] + [InlineData("audio001.webm")] + public virtual async Task GetTextAsync_AutoDetectsAudioFormat(string fileName) + { + SkipIfNotEnabled(); + + using var audioSpeechStream = GetAudioStream(fileName); + var response = await _client.GetTextAsync(audioSpeechStream); + + Assert.NotNull(response); + Assert.Contains("gym", response.Text, StringComparison.OrdinalIgnoreCase); + } + private static Stream GetAudioStream(string fileName) { using Stream? s = typeof(SpeechToTextClientIntegrationTests).Assembly.GetManifestResourceStream($"Microsoft.Extensions.AI.Resources.{fileName}"); diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/VerbatimMultiPartHttpHandler.cs b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/VerbatimMultiPartHttpHandler.cs index 6b0374d70cd..77267476e28 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/VerbatimMultiPartHttpHandler.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/VerbatimMultiPartHttpHandler.cs @@ -41,6 +41,8 @@ public class VerbatimMultiPartHttpHandler(string expectedInput, string sentJsonO { public string? ExpectedRequestUriContains { get; init; } + public string? ExpectedAudioFilename { get; init; } + protected override async Task SendAsync( HttpRequestMessage request, CancellationToken cancellationToken) @@ -111,8 +113,16 @@ protected override async Task SendAsync( // Text field string name = ExtractNameFromHeaders(headers); - // Skip file fields - if (!name.StartsWith("file")) + // For file fields, optionally check the filename + if (name.StartsWith("file")) + { + if (ExpectedAudioFilename is not null) + { + string? actualFilename = ExtractFilenameFromHeaders(headers); + Assert.Equal(ExpectedAudioFilename, actualFilename); + } + } + else { if (parameters.ContainsKey(name)) { @@ -185,6 +195,25 @@ private static string ExtractNameFromHeaders(string headers) return headers.Substring(start, end - start).Trim('"'); } + private static string? ExtractFilenameFromHeaders(string headers) + { + const string FilenamePrefix = "filename="; + int start = headers.IndexOf(FilenamePrefix); + if (start < 0) + { + return null; + } + + start += FilenamePrefix.Length; + int end = headers.IndexOf(";", start); + if (end == -1) + { + end = headers.Length; + } + + return headers.Substring(start, end - start).Trim('"'); + } + public static string? RemoveWhiteSpace(string? text) => text is null ? null : Regex.Replace(text, @"\s*", string.Empty); diff --git a/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientTests.cs b/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientTests.cs index 0605a3ed655..af640b3b598 100644 --- a/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientTests.cs @@ -352,4 +352,87 @@ private static ISpeechToTextClient CreateSpeechToTextClient(HttpClient httpClien new OpenAIClient(new ApiKeyCredential("apikey"), new OpenAIClientOptions { Transport = new HttpClientPipelineTransport(httpClient) }) .GetAudioClient(modelId) .AsISpeechToTextClient(); + + public static TheoryData AudioFormatDetectionData => new() + { + // WAV: RIFF____WAVE + { "RIFF\x00\x00\x00\x00WAVE"u8.ToArray(), "audio.wav" }, + + // MP3: ID3v2 tag + { new byte[] { (byte)'I', (byte)'D', (byte)'3', 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, "audio.mp3" }, + + // MP3: MPEG sync word (0xFF 0xFB) + { new byte[] { 0xFF, 0xFB, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, "audio.mp3" }, + + // WebM/Matroska: EBML header + { new byte[] { 0x1A, 0x45, 0xDF, 0xA3, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, "audio.webm" }, + + // M4A/MP4: ISO BMFF ftyp box + { new byte[] { 0x00, 0x00, 0x00, 0x20, (byte)'f', (byte)'t', (byte)'y', (byte)'p', (byte)'M', (byte)'4', (byte)'A', (byte)' ' }, "audio.m4a" }, + + // Unknown bytes: defaults to mp3 + { new byte[] { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C }, "audio.mp3" }, + }; + + [Theory] + [MemberData(nameof(AudioFormatDetectionData))] + public async Task GetTextAsync_DetectsAudioFormatFromMagicBytes(byte[] header, string expectedFilename) + { + const string Input = """ + { + "model": "gpt-4o-transcribe" + } + """; + + const string Output = """ + { + "text":"Hello." + } + """; + + using var audioSpeechStream = new MemoryStream(header); + + using VerbatimMultiPartHttpHandler handler = new(Input, Output) + { + ExpectedAudioFilename = expectedFilename, + }; + using HttpClient httpClient = new(handler); + using ISpeechToTextClient client = CreateSpeechToTextClient(httpClient, "gpt-4o-transcribe"); + + var response = await client.GetTextAsync(audioSpeechStream); + Assert.NotNull(response); + } + + [Theory] + [MemberData(nameof(AudioFormatDetectionData))] + public async Task GetStreamingTextAsync_DetectsAudioFormatFromMagicBytes(byte[] header, string expectedFilename) + { + const string Input = """ + { + "model": "gpt-4o-transcribe", + "stream":true + } + """; + + const string Output = """ + { + "text":"Hello." + } + """; + + using var audioSpeechStream = new MemoryStream(header); + + using VerbatimMultiPartHttpHandler handler = new(Input, Output) + { + ExpectedRequestUriContains = "audio/transcriptions", + ExpectedAudioFilename = expectedFilename, + }; + using HttpClient httpClient = new(handler); + using ISpeechToTextClient client = CreateSpeechToTextClient(httpClient, "gpt-4o-transcribe"); + + await foreach (var update in client.GetStreamingTextAsync(audioSpeechStream)) + { + Assert.NotNull(update); + } + } }