dotnet · jozkee · Jun 16, 2026 · jozkee · Jun 16, 2026
@@ -24,10 +24,6 @@ namespace Microsoft.Extensions.AI;
 [Experimental(DiagnosticIds.Experiments.AISpeechToText, UrlFormat = DiagnosticIds.UrlFormat)]
 internal sealed class OpenAISpeechToTextClient : ISpeechToTextClient
 {
-    /// <summary>Filename to use when audio lacks a name.</summary>
-    /// <remarks>This information internally is required but is only being used to create a header name in the multipart request.</remarks>
-    private const string Filename = "audio.mp3";
-
     /// <summary>Metadata about the client.</summary>
     private readonly SpeechToTextClientMetadata _metadata;
 
@@ -64,9 +60,7 @@ public async Task<SpeechToTextResponse> GetTextAsync(
 
         SpeechToTextResponse response = new();
 
-        string filename = audioSpeechStream is FileStream fileStream ?
-            Path.GetFileName(fileStream.Name) : // Use the file name if we can get one from the stream.
-            Filename; // Otherwise, use a default name; this is only used to create a header name in the multipart request.
+        string filename = ResolveFilename(audioSpeechStream);
 
         if (IsTranslationRequest(options))
         {
@@ -120,9 +114,7 @@ public async IAsyncEnumerable<SpeechToTextResponseUpdate> GetStreamingTextAsync(
     {
         _ = Throw.IfNull(audioSpeechStream);
 
-        string filename = audioSpeechStream is FileStream fileStream ?
-            Path.GetFileName(fileStream.Name) : // Use the file name if we can get one from the stream.
-            Filename; // Otherwise, use a default name; this is only used to create a header name in the multipart request.
+        string filename = ResolveFilename(audioSpeechStream);
 
         if (IsTranslationRequest(options))
         {
@@ -185,6 +177,84 @@ options is not null &&
         options.TextLanguage is not null &&
         (options.SpeechLanguage is null || options.SpeechLanguage != options.TextLanguage);
 
+    /// <summary>
+    /// Resolves the filename to use for the audio stream in the multipart request.
+    /// Priority: <see cref="FileStream"/> name (when it has an extension), then magic-byte detection
+    /// (seekable streams only), then the <c>audio.mp3</c> default.
+    /// </summary>
+    /// <param name="audioSpeechStream">
+    /// The audio stream that is about to be uploaded. When it is seekable, up to 12 bytes are read from the
+    /// current position and the position is moved back by the number of bytes read.
+    /// </param>
+    /// <returns>A file name whose extension hints at the audio format.</returns>
+    private static string ResolveFilename(Stream audioSpeechStream)
+    {
+        const int FormatDetectionByteCount = 12;
+
+        if (audioSpeechStream is FileStream fileStream)
+        {
+            string name = Path.GetFileName(fileStream.Name);
+
+            // A name without an extension (for example when the FileStream does not report a real path)
+            // carries no format hint, so only use it when an extension is present and otherwise fall
+            // through to content-based detection below.
+            if (Path.HasExtension(name))
+            {
+                return name;
+            }
+        }
+
+        // For seekable streams, peek at the header to detect audio format, then rewind.
+        if (audioSpeechStream.CanSeek)
+        {
+            byte[] header = new byte[FormatDetectionByteCount];
+            int bytesRead = 0;
+
+            // Read may return fewer bytes than requested, so loop until the buffer is full or the stream ends.
+            while (bytesRead < header.Length)
+            {
+                int n = audioSpeechStream.Read(header, bytesRead, header.Length - bytesRead);
+                if (n <= 0)
+                {
+                    break;
+                }
+
+                bytesRead += n;
+            }
+
+            // Undo the peek so the upload starts where the caller left the stream.
+            audioSpeechStream.Position -= bytesRead;
+            return $"audio.{DetectAudioExtension(header.AsSpan(0, bytesRead))}";
+        }
+
+        return "audio.mp3";
+    }
+
+    /// <summary>Detects the audio format extension from the leading bytes of the audio data.</summary>
+    /// <param name="header">The leading bytes of the audio data; may be empty or shorter than a signature.</param>
+    /// <returns>
+    /// The extension, without a leading dot, for the recognized signature: <c>wav</c>, <c>webm</c>, <c>m4a</c>,
+    /// <c>flac</c>, <c>ogg</c> or <c>mp3</c>. Data that matches no signature also yields <c>mp3</c>.
+    /// </returns>
+    private static string DetectAudioExtension(ReadOnlySpan<byte> header)
+    {
+        // WAV: "RIFF" at offset 0 and "WAVE" at offset 8.
+        if (header.Length >= 12 &&
+            header.Slice(0, 4).SequenceEqual("RIFF"u8) &&
+            header.Slice(8, 4).SequenceEqual("WAVE"u8))
+        {
+            return "wav";
+        }
+
+        // WebM/Matroska: EBML header ID at offset 0.
+        if (header.Length >= 4 &&
+            header.Slice(0, 4).SequenceEqual((ReadOnlySpan<byte>)[0x1A, 0x45, 0xDF, 0xA3]))
+        {
+            return "webm";
+        }
+
+        // M4A/MP4: ISO BMFF "ftyp" box type at offset 4.
+        if (header.Length >= 8 &&
+            header.Slice(4, 4).SequenceEqual("ftyp"u8))
+        {
+            return "m4a";
+        }
+
+        // FLAC: "fLaC" stream marker at offset 0.
+        if (header.Length >= 4 &&
+            header.Slice(0, 4).SequenceEqual("fLaC"u8))
+        {
+            return "flac";
+        }
+
+        // Ogg: "OggS" page capture pattern at offset 0.
+        if (header.Length >= 4 &&
+            header.Slice(0, 4).SequenceEqual("OggS"u8))
+        {
+            return "ogg";
+        }
+
+        // MP3: ID3v2 tag at offset 0.
+        if (header.Length >= 3 &&
+            header.Slice(0, 3).SequenceEqual("ID3"u8))
+        {
+            return "mp3";
+        }
+
+        // MP3: MPEG frame sync word (11 set bits).
+        if (header.Length >= 2 &&
+            header[0] == 0xFF && (header[1] & 0xE0) == 0xE0)
+        {
+            return "mp3";
+        }
+
+        // Nothing matched: fall back to mp3.
+        return "mp3";
+    }
+
     /// <summary>Converts an extensions options instance to an OpenAI transcription options instance.</summary>
     private AudioTranscriptionOptions ToOpenAITranscriptionOptions(SpeechToTextOptions? options)
     {

@@ -20,12 +20,28 @@
 
   <ItemGroup>
+    <!-- Take the audio samples out of the None item list; the item group below includes them as EmbeddedResource items. -->
     <None Remove="Resources\audio001.mp3" />
+    <None Remove="Resources\audio001.wav" />
+    <None Remove="Resources\audio001.m4a" />
+    <None Remove="Resources\audio001.webm" />
+    <None Remove="Resources\audio001_noid3.mp3" />
   </ItemGroup>
 
   <ItemGroup>
+    <!-- Audio samples are embedded into the assembly and are never copied to the output directory. -->
     <EmbeddedResource Include="Resources\audio001.mp3">
       <CopyToOutputDirectory>Never</CopyToOutputDirectory>
     </EmbeddedResource>
+    <EmbeddedResource Include="Resources\audio001.wav">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </EmbeddedResource>
+    <EmbeddedResource Include="Resources\audio001.m4a">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </EmbeddedResource>
+    <EmbeddedResource Include="Resources\audio001.webm">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </EmbeddedResource>
+    <EmbeddedResource Include="Resources\audio001_noid3.mp3">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </EmbeddedResource>
     <EmbeddedResource Include="..\..\Shared\ImageDataUri\dotnet.png" Link="Resources\dotnet.png" />
   </ItemGroup>
 

@@ -59,6 +59,23 @@ public virtual async Task GetStreamingTextAsync_SingleStreamingResponseChoice()
         Assert.Contains("gym", responseText, StringComparison.OrdinalIgnoreCase);
     }
 
+    /// <summary>
+    /// Transcribes each audio sample returned by <c>GetAudioStream</c> and checks that the
+    /// transcript contains the expected word, whatever the sample's container format.
+    /// </summary>
+    /// <param name="fileName">Name of the embedded audio resource to transcribe.</param>
+    [ConditionalTheory]
+    [InlineData("audio001.mp3")]
+    [InlineData("audio001_noid3.mp3")]
+    [InlineData("audio001.wav")]
+    [InlineData("audio001.m4a")]
+    [InlineData("audio001.webm")]
+    public virtual async Task GetTextAsync_AutoDetectsAudioFormat(string fileName)
+    {
+        SkipIfNotEnabled();
+
+        using Stream audio = GetAudioStream(fileName);
+
+        var transcription = await _client.GetTextAsync(audio);
+        Assert.NotNull(transcription);
+
+        // The expected word must appear in the transcript for every sample.
+        string transcript = transcription.Text;
+        Assert.Contains("gym", transcript, StringComparison.OrdinalIgnoreCase);
+    }
+
     private static Stream GetAudioStream(string fileName)
     {
         using Stream? s = typeof(SpeechToTextClientIntegrationTests).Assembly.GetManifestResourceStream($"Microsoft.Extensions.AI.Resources.{fileName}");

@@ -41,6 +41,8 @@ public class VerbatimMultiPartHttpHandler(string expectedInput, string sentJsonO
 {
     public string? ExpectedRequestUriContains { get; init; }
 
+    /// <summary>
+    /// Gets the filename expected on multipart fields whose name starts with "file".
+    /// When non-null, the filename extracted from such a field's headers is asserted to equal this value;
+    /// when null, the filename is not checked.
+    /// </summary>
+    public string? ExpectedAudioFilename { get; init; }
+
     protected override async Task<HttpResponseMessage> SendAsync(
         HttpRequestMessage request,
         CancellationToken cancellationToken)
@@ -111,8 +113,16 @@ protected override async Task<HttpResponseMessage> SendAsync(
                 // Text field
                 string name = ExtractNameFromHeaders(headers);
 
-                // Skip file fields
-                if (!name.StartsWith("file"))
+                // For file fields, optionally check the filename
+                if (name.StartsWith("file"))
+                {
+                    if (ExpectedAudioFilename is not null)
+                    {
+                        string? actualFilename = ExtractFilenameFromHeaders(headers);
+                        Assert.Equal(ExpectedAudioFilename, actualFilename);
+                    }
+                }
+                else
                 {
                     if (parameters.ContainsKey(name))
                     {
@@ -185,6 +195,25 @@ private static string ExtractNameFromHeaders(string headers)
         return headers.Substring(start, end - start).Trim('"');
     }
 
+    /// <summary>Extracts the value of the first <c>filename=</c> parameter from a part's raw header text.</summary>
+    /// <param name="headers">The raw header text of a single multipart part.</param>
+    /// <returns>
+    /// The parameter value with surrounding double quotes removed, or <see langword="null"/> when the
+    /// headers contain no <c>filename=</c> parameter.
+    /// </returns>
+    private static string? ExtractFilenameFromHeaders(string headers)
+    {
+        const string FilenamePrefix = "filename=";
+
+        // Header parameter names are plain ASCII, so search ordinally rather than with culture rules.
+        int start = headers.IndexOf(FilenamePrefix, System.StringComparison.Ordinal);
+        if (start < 0)
+        {
+            return null;
+        }
+
+        start += FilenamePrefix.Length;
+
+        // The value ends at the next parameter separator or at the end of its header line, whichever comes
+        // first; without the line-end check a trailing filename would swallow the following header lines.
+        int end = headers.IndexOfAny(new[] { ';', '\r', '\n' }, start);
+        if (end == -1)
+        {
+            end = headers.Length;
+        }
+
+        return headers.Substring(start, end - start).Trim('"');
+    }
+
     public static string? RemoveWhiteSpace(string? text) =>
         text is null ? null :
         Regex.Replace(text, @"\s*", string.Empty);

@@ -352,4 +352,87 @@ private static ISpeechToTextClient CreateSpeechToTextClient(HttpClient httpClien
         new OpenAIClient(new ApiKeyCredential("apikey"), new OpenAIClientOptions { Transport = new HttpClientPipelineTransport(httpClient) })
             .GetAudioClient(modelId)
             .AsISpeechToTextClient();
+
+    /// <summary>
+    /// Theory rows pairing a 12-byte stream header with the multipart filename expected for it.
+    /// </summary>
+    public static TheoryData<byte[], string> AudioFormatDetectionData
+    {
+        get
+        {
+            var rows = new TheoryData<byte[], string>();
+
+            // WAV: "RIFF", four size bytes, then "WAVE".
+            rows.Add(
+                new byte[] { (byte)'R', (byte)'I', (byte)'F', (byte)'F', 0x00, 0x00, 0x00, 0x00, (byte)'W', (byte)'A', (byte)'V', (byte)'E' },
+                "audio.wav");
+
+            // MP3 that starts with an ID3v2 tag.
+            rows.Add(
+                new byte[] { 0x49, 0x44, 0x33, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+                "audio.mp3");
+
+            // MP3 that starts directly with an MPEG frame sync word (0xFF 0xFB).
+            rows.Add(
+                new byte[] { 0xFF, 0xFB, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+                "audio.mp3");
+
+            // WebM/Matroska: EBML header ID.
+            rows.Add(
+                new byte[] { 0x1A, 0x45, 0xDF, 0xA3, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+                "audio.webm");
+
+            // M4A/MP4: box size, "ftyp", then the "M4A " brand.
+            rows.Add(
+                new byte[] { 0x00, 0x00, 0x00, 0x20, (byte)'f', (byte)'t', (byte)'y', (byte)'p', (byte)'M', (byte)'4', (byte)'A', (byte)' ' },
+                "audio.m4a");
+
+            // Bytes matching no known signature: the default mp3 name is expected.
+            rows.Add(
+                new byte[] { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C },
+                "audio.mp3");
+
+            return rows;
+        }
+    }
+
+    /// <summary>
+    /// Sends a non-file stream that starts with a known signature through <c>GetTextAsync</c> and lets the
+    /// HTTP handler assert the filename used for the file field of the multipart request.
+    /// </summary>
+    /// <param name="header">Leading bytes placed in the audio stream.</param>
+    /// <param name="expectedFilename">Filename the handler expects on the file field.</param>
+    [Theory]
+    [MemberData(nameof(AudioFormatDetectionData))]
+    public async Task GetTextAsync_DetectsAudioFormatFromMagicBytes(byte[] header, string expectedFilename)
+    {
+        const string ExpectedRequest = """
+                {
+                    "model": "gpt-4o-transcribe"
+                }
+                """;
+
+        const string CannedResponse = """
+                {
+                    "text":"Hello."
+                }
+                """;
+
+        using var audio = new MemoryStream(header);
+
+        // The filename check itself happens inside the handler while it processes the request.
+        using var verifyingHandler = new VerbatimMultiPartHttpHandler(ExpectedRequest, CannedResponse)
+        {
+            ExpectedAudioFilename = expectedFilename,
+        };
+        using var http = new HttpClient(verifyingHandler);
+        using ISpeechToTextClient speechClient = CreateSpeechToTextClient(http, "gpt-4o-transcribe");
+
+        var result = await speechClient.GetTextAsync(audio);
+
+        Assert.NotNull(result);
+    }
+
+    /// <summary>
+    /// Sends a non-file stream that starts with a known signature through <c>GetStreamingTextAsync</c> and
+    /// lets the HTTP handler assert the filename used for the file field of the multipart request.
+    /// </summary>
+    /// <param name="header">Leading bytes placed in the audio stream.</param>
+    /// <param name="expectedFilename">Filename the handler expects on the file field.</param>
+    [Theory]
+    [MemberData(nameof(AudioFormatDetectionData))]
+    public async Task GetStreamingTextAsync_DetectsAudioFormatFromMagicBytes(byte[] header, string expectedFilename)
+    {
+        const string ExpectedRequest = """
+                {
+                    "model": "gpt-4o-transcribe",
+                    "stream":true
+                }
+                """;
+
+        const string CannedResponse = """
+                {
+                    "text":"Hello."
+                }
+                """;
+
+        using var audio = new MemoryStream(header);
+
+        // The filename check itself happens inside the handler while it processes the request.
+        using var verifyingHandler = new VerbatimMultiPartHttpHandler(ExpectedRequest, CannedResponse)
+        {
+            ExpectedRequestUriContains = "audio/transcriptions",
+            ExpectedAudioFilename = expectedFilename,
+        };
+        using var http = new HttpClient(verifyingHandler);
+        using ISpeechToTextClient speechClient = CreateSpeechToTextClient(http, "gpt-4o-transcribe");
+
+        // Drain the stream of updates; each yielded item must be non-null.
+        await foreach (var chunk in speechClient.GetStreamingTextAsync(audio))
+        {
+            Assert.NotNull(chunk);
+        }
+    }
 }