diff --git a/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAISpeechToTextClient.cs b/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAISpeechToTextClient.cs
index b976f4a562a..4c35ff7a5a8 100644
--- a/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAISpeechToTextClient.cs
+++ b/src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAISpeechToTextClient.cs
@@ -24,10 +24,6 @@ namespace Microsoft.Extensions.AI;
[Experimental(DiagnosticIds.Experiments.AISpeechToText, UrlFormat = DiagnosticIds.UrlFormat)]
internal sealed class OpenAISpeechToTextClient : ISpeechToTextClient
{
- /// Filename to use when audio lacks a name.
- /// This information internally is required but is only being used to create a header name in the multipart request.
- private const string Filename = "audio.mp3";
-
/// Metadata about the client.
private readonly SpeechToTextClientMetadata _metadata;
@@ -64,9 +60,7 @@ public async Task GetTextAsync(
SpeechToTextResponse response = new();
- string filename = audioSpeechStream is FileStream fileStream ?
- Path.GetFileName(fileStream.Name) : // Use the file name if we can get one from the stream.
- Filename; // Otherwise, use a default name; this is only used to create a header name in the multipart request.
+ string filename = ResolveFilename(audioSpeechStream);
if (IsTranslationRequest(options))
{
@@ -120,9 +114,7 @@ public async IAsyncEnumerable GetStreamingTextAsync(
{
_ = Throw.IfNull(audioSpeechStream);
- string filename = audioSpeechStream is FileStream fileStream ?
- Path.GetFileName(fileStream.Name) : // Use the file name if we can get one from the stream.
- Filename; // Otherwise, use a default name; this is only used to create a header name in the multipart request.
+ string filename = ResolveFilename(audioSpeechStream);
if (IsTranslationRequest(options))
{
@@ -185,6 +177,84 @@ options is not null &&
options.TextLanguage is not null &&
(options.SpeechLanguage is null || options.SpeechLanguage != options.TextLanguage);
+    /// <summary>
+    /// Resolves the filename to use for the audio stream in the multipart request.
+    /// Priority: the <see cref="FileStream"/> name (when it carries an extension), then
+    /// magic-byte detection (seekable streams only), then the "audio.mp3" default.
+    /// </summary>
+    /// <param name="audioSpeechStream">The audio stream about to be sent. Its position is restored before returning.</param>
+    /// <returns>A filename whose extension reflects the best available guess at the audio format.</returns>
+    private static string ResolveFilename(Stream audioSpeechStream)
+    {
+        const int FormatDetectionByteCount = 12;
+
+        if (audioSpeechStream is FileStream fileStream)
+        {
+            string name = Path.GetFileName(fileStream.Name);
+            if (Path.HasExtension(name))
+            {
+                return name;
+            }
+
+            // A name without an extension (e.g. a stream opened from a raw handle) carries no
+            // format hint, so fall through and try to sniff the content instead.
+        }
+
+        // For seekable streams, peek at the header to detect the audio format, then rewind.
+        if (audioSpeechStream.CanSeek)
+        {
+            long origin = audioSpeechStream.Position;
+            byte[] header = new byte[FormatDetectionByteCount];
+            int bytesRead = 0;
+
+            try
+            {
+                // Read may return fewer bytes than requested; loop until the buffer is full or the stream ends.
+                while (bytesRead < header.Length)
+                {
+                    int n = audioSpeechStream.Read(header, bytesRead, header.Length - bytesRead);
+                    if (n <= 0)
+                    {
+                        break;
+                    }
+
+                    bytesRead += n;
+                }
+            }
+            finally
+            {
+                // Rewind even if Read throws part-way so the caller never sees a consumed header.
+                audioSpeechStream.Position = origin;
+            }
+
+            return $"audio.{DetectAudioExtension(header.AsSpan(0, bytesRead))}";
+        }
+
+        // Non-seekable stream: peeking would consume data, so use the default name.
+        return "audio.mp3";
+    }
+
+    /// <summary>Detects the audio format extension from the leading bytes of the audio data.</summary>
+    /// <param name="header">The first bytes of the audio data; may be shorter than 12 bytes or empty.</param>
+    /// <returns>
+    /// One of "wav", "webm", "m4a", "flac" or "ogg" when the corresponding signature is present;
+    /// otherwise "mp3", which is also the fallback for unrecognized data.
+    /// </returns>
+    private static string DetectAudioExtension(ReadOnlySpan<byte> header)
+    {
+        // WAV: "RIFF" at offset 0 and "WAVE" at offset 8.
+        if (header.Length >= 12 &&
+            header.StartsWith("RIFF"u8) &&
+            header.Slice(8, 4).SequenceEqual("WAVE"u8))
+        {
+            return "wav";
+        }
+
+        // WebM/Matroska: EBML header ID at offset 0.
+        if (header.StartsWith((ReadOnlySpan<byte>)[0x1A, 0x45, 0xDF, 0xA3]))
+        {
+            return "webm";
+        }
+
+        // M4A/MP4: ISO BMFF "ftyp" box type at offset 4.
+        if (header.Length >= 8 &&
+            header.Slice(4, 4).SequenceEqual("ftyp"u8))
+        {
+            return "m4a";
+        }
+
+        // FLAC: "fLaC" stream marker at offset 0.
+        if (header.StartsWith("fLaC"u8))
+        {
+            return "flac";
+        }
+
+        // Ogg: "OggS" page capture pattern at offset 0.
+        if (header.StartsWith("OggS"u8))
+        {
+            return "ogg";
+        }
+
+        // MP3 (ID3v2 tag "ID3", or an MPEG frame sync of 0xFF followed by three set bits) needs
+        // no dedicated check because it is also the fallback for anything unrecognized.
+        return "mp3";
+    }
+
/// Converts an extensions options instance to an OpenAI transcription options instance.
private AudioTranscriptionOptions ToOpenAITranscriptionOptions(SpeechToTextOptions? options)
{
diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Microsoft.Extensions.AI.Integration.Tests.csproj b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Microsoft.Extensions.AI.Integration.Tests.csproj
index ed657f5e2b6..0c6f1dba503 100644
--- a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Microsoft.Extensions.AI.Integration.Tests.csproj
+++ b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Microsoft.Extensions.AI.Integration.Tests.csproj
@@ -20,12 +20,28 @@
+
+
+
+
Never
+
+ Never
+
+
+ Never
+
+
+ Never
+
+
+ Never
+
diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.m4a b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.m4a
new file mode 100644
index 00000000000..a082110afce
Binary files /dev/null and b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.m4a differ
diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.wav b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.wav
new file mode 100644
index 00000000000..9b655bb88e1
Binary files /dev/null and b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.wav differ
diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.webm b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.webm
new file mode 100644
index 00000000000..98309fa200b
Binary files /dev/null and b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001.webm differ
diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001_noid3.mp3 b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001_noid3.mp3
new file mode 100644
index 00000000000..9ef9e9378dc
Binary files /dev/null and b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/Resources/audio001_noid3.mp3 differ
diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/SpeechToTextClientIntegrationTests.cs b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/SpeechToTextClientIntegrationTests.cs
index f0ea6c1790e..4d48e67c4da 100644
--- a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/SpeechToTextClientIntegrationTests.cs
+++ b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/SpeechToTextClientIntegrationTests.cs
@@ -59,6 +59,23 @@ public virtual async Task GetStreamingTextAsync_SingleStreamingResponseChoice()
Assert.Contains("gym", responseText, StringComparison.OrdinalIgnoreCase);
}
+    [ConditionalTheory]
+    [InlineData("audio001.mp3")]
+    [InlineData("audio001_noid3.mp3")]
+    [InlineData("audio001.wav")]
+    [InlineData("audio001.m4a")]
+    [InlineData("audio001.webm")]
+    public virtual async Task GetTextAsync_AutoDetectsAudioFormat(string fileName)
+    {
+        SkipIfNotEnabled();
+
+        // The stream is handed over without any format hint; every listed resource is
+        // expected to come back as text that mentions "gym".
+        using Stream audio = GetAudioStream(fileName);
+
+        var transcription = await _client.GetTextAsync(audio);
+
+        Assert.NotNull(transcription);
+        string text = transcription.Text;
+        Assert.Contains("gym", text, StringComparison.OrdinalIgnoreCase);
+    }
+
private static Stream GetAudioStream(string fileName)
{
using Stream? s = typeof(SpeechToTextClientIntegrationTests).Assembly.GetManifestResourceStream($"Microsoft.Extensions.AI.Resources.{fileName}");
diff --git a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/VerbatimMultiPartHttpHandler.cs b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/VerbatimMultiPartHttpHandler.cs
index 6b0374d70cd..77267476e28 100644
--- a/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/VerbatimMultiPartHttpHandler.cs
+++ b/test/Libraries/Microsoft.Extensions.AI.Integration.Tests/VerbatimMultiPartHttpHandler.cs
@@ -41,6 +41,8 @@ public class VerbatimMultiPartHttpHandler(string expectedInput, string sentJsonO
{
public string? ExpectedRequestUriContains { get; init; }
+ /// <summary>
+ /// Optional filename expected on the multipart "file" part. When non-null, SendAsync asserts that
+ /// the filename extracted from that part's headers equals this value; when null, it is not checked.
+ /// </summary>
+ public string? ExpectedAudioFilename { get; init; }
+
protected override async Task SendAsync(
HttpRequestMessage request,
CancellationToken cancellationToken)
@@ -111,8 +113,16 @@ protected override async Task SendAsync(
// Text field
string name = ExtractNameFromHeaders(headers);
- // Skip file fields
- if (!name.StartsWith("file"))
+ // For file fields, optionally check the filename
+ if (name.StartsWith("file"))
+ {
+ if (ExpectedAudioFilename is not null)
+ {
+ string? actualFilename = ExtractFilenameFromHeaders(headers);
+ Assert.Equal(ExpectedAudioFilename, actualFilename);
+ }
+ }
+ else
{
if (parameters.ContainsKey(name))
{
@@ -185,6 +195,25 @@ private static string ExtractNameFromHeaders(string headers)
return headers.Substring(start, end - start).Trim('"');
}
+    /// <summary>Extracts the value of the <c>filename=</c> parameter from the raw headers of a multipart part.</summary>
+    /// <param name="headers">The header text of a single multipart part.</param>
+    /// <returns>
+    /// The filename with surrounding whitespace and double quotes removed, or <see langword="null"/>
+    /// when the headers contain no <c>filename=</c> parameter.
+    /// </returns>
+    private static string? ExtractFilenameFromHeaders(string headers)
+    {
+        const string FilenamePrefix = "filename=";
+
+        // Ordinal search: header parameter names are protocol tokens, not culture-sensitive text.
+        int start = headers.IndexOf(FilenamePrefix, StringComparison.Ordinal);
+        if (start < 0)
+        {
+            return null;
+        }
+
+        start += FilenamePrefix.Length;
+
+        // The value ends at the next parameter separator or at the end of the header line,
+        // whichever comes first, so a following header line is never swallowed into the name.
+        int end = headers.IndexOfAny(new[] { ';', '\r', '\n' }, start);
+        if (end < 0)
+        {
+            end = headers.Length;
+        }
+
+        return headers.Substring(start, end - start).Trim().Trim('"');
+    }
+
public static string? RemoveWhiteSpace(string? text) =>
text is null ? null :
Regex.Replace(text, @"\s*", string.Empty);
diff --git a/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientTests.cs b/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientTests.cs
index 0605a3ed655..af640b3b598 100644
--- a/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientTests.cs
+++ b/test/Libraries/Microsoft.Extensions.AI.OpenAI.Tests/OpenAISpeechToTextClientTests.cs
@@ -352,4 +352,87 @@ private static ISpeechToTextClient CreateSpeechToTextClient(HttpClient httpClien
new OpenAIClient(new ApiKeyCredential("apikey"), new OpenAIClientOptions { Transport = new HttpClientPipelineTransport(httpClient) })
.GetAudioClient(modelId)
.AsISpeechToTextClient();
+
+    /// <summary>
+    /// Pairs of (leading audio bytes, filename expected in the multipart request) used by the
+    /// magic-byte detection theories.
+    /// </summary>
+    public static TheoryData<byte[], string> AudioFormatDetectionData => new()
+    {
+        // WAV: RIFF____WAVE ("\0" avoids the variable-length "\x" escape swallowing following hex-like characters)
+        { "RIFF\0\0\0\0WAVE"u8.ToArray(), "audio.wav" },
+
+        // MP3: ID3v2 tag
+        { new byte[] { (byte)'I', (byte)'D', (byte)'3', 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, "audio.mp3" },
+
+        // MP3: MPEG sync word (0xFF 0xFB)
+        { new byte[] { 0xFF, 0xFB, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, "audio.mp3" },
+
+        // WebM/Matroska: EBML header
+        { new byte[] { 0x1A, 0x45, 0xDF, 0xA3, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, "audio.webm" },
+
+        // M4A/MP4: ISO BMFF ftyp box
+        { new byte[] { 0x00, 0x00, 0x00, 0x20, (byte)'f', (byte)'t', (byte)'y', (byte)'p', (byte)'M', (byte)'4', (byte)'A', (byte)' ' }, "audio.m4a" },
+
+        // Unknown bytes: defaults to mp3
+        { new byte[] { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C }, "audio.mp3" },
+
+        // Truncated header: "RIFF" alone is too short to confirm WAV, so it defaults to mp3
+        { "RIFF"u8.ToArray(), "audio.mp3" },
+    };
+
+    [Theory]
+    [MemberData(nameof(AudioFormatDetectionData))]
+    public async Task GetTextAsync_DetectsAudioFormatFromMagicBytes(byte[] header, string expectedFilename)
+    {
+        // Form fields the handler expects to receive alongside the file part.
+        const string ExpectedRequest = """
+            {
+                "model": "gpt-4o-transcribe"
+            }
+            """;
+
+        // Canned body the handler sends back.
+        const string CannedResponse = """
+            {
+                "text":"Hello."
+            }
+            """;
+
+        // A MemoryStream is seekable and is not a FileStream, so the filename has to come from the bytes.
+        using MemoryStream sniffedAudio = new(header);
+
+        using VerbatimMultiPartHttpHandler filenameCheckingHandler = new(ExpectedRequest, CannedResponse)
+        {
+            ExpectedAudioFilename = expectedFilename,
+        };
+        using HttpClient http = new(filenameCheckingHandler);
+        using ISpeechToTextClient speechClient = CreateSpeechToTextClient(http, "gpt-4o-transcribe");
+
+        var result = await speechClient.GetTextAsync(sniffedAudio);
+
+        Assert.NotNull(result);
+    }
+
+    [Theory]
+    [MemberData(nameof(AudioFormatDetectionData))]
+    public async Task GetStreamingTextAsync_DetectsAudioFormatFromMagicBytes(byte[] header, string expectedFilename)
+    {
+        // Form fields the handler expects to receive alongside the file part.
+        const string ExpectedRequest = """
+            {
+                "model": "gpt-4o-transcribe",
+                "stream":true
+            }
+            """;
+
+        // Canned body the handler sends back.
+        const string CannedResponse = """
+            {
+                "text":"Hello."
+            }
+            """;
+
+        // A MemoryStream is seekable and is not a FileStream, so the filename has to come from the bytes.
+        using MemoryStream sniffedAudio = new(header);
+
+        using VerbatimMultiPartHttpHandler filenameCheckingHandler = new(ExpectedRequest, CannedResponse)
+        {
+            ExpectedRequestUriContains = "audio/transcriptions",
+            ExpectedAudioFilename = expectedFilename,
+        };
+        using HttpClient http = new(filenameCheckingHandler);
+        using ISpeechToTextClient speechClient = CreateSpeechToTextClient(http, "gpt-4o-transcribe");
+
+        // Enumerating the updates drives the request; the handler performs the filename assertion.
+        await foreach (var chunk in speechClient.GetStreamingTextAsync(sniffedAudio))
+        {
+            Assert.NotNull(chunk);
+        }
+    }
}