Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,6 @@ namespace Microsoft.Extensions.AI;
[Experimental(DiagnosticIds.Experiments.AISpeechToText, UrlFormat = DiagnosticIds.UrlFormat)]
internal sealed class OpenAISpeechToTextClient : ISpeechToTextClient
{
/// <summary>Filename to use when audio lacks a name.</summary>
/// <remarks>This information internally is required but is only being used to create a header name in the multipart request.</remarks>
private const string Filename = "audio.mp3";

/// <summary>Metadata about the client.</summary>
private readonly SpeechToTextClientMetadata _metadata;

Expand Down Expand Up @@ -64,9 +60,7 @@ public async Task<SpeechToTextResponse> GetTextAsync(

SpeechToTextResponse response = new();

string filename = audioSpeechStream is FileStream fileStream ?
Path.GetFileName(fileStream.Name) : // Use the file name if we can get one from the stream.
Filename; // Otherwise, use a default name; this is only used to create a header name in the multipart request.
string filename = ResolveFilename(audioSpeechStream);

if (IsTranslationRequest(options))
{
Expand Down Expand Up @@ -120,9 +114,7 @@ public async IAsyncEnumerable<SpeechToTextResponseUpdate> GetStreamingTextAsync(
{
_ = Throw.IfNull(audioSpeechStream);

string filename = audioSpeechStream is FileStream fileStream ?
Path.GetFileName(fileStream.Name) : // Use the file name if we can get one from the stream.
Filename; // Otherwise, use a default name; this is only used to create a header name in the multipart request.
string filename = ResolveFilename(audioSpeechStream);

if (IsTranslationRequest(options))
{
Expand Down Expand Up @@ -185,6 +177,84 @@ options is not null &&
options.TextLanguage is not null &&
(options.SpeechLanguage is null || options.SpeechLanguage != options.TextLanguage);

/// <summary>
/// Resolves the filename to use for the audio stream in the multipart request.
/// Priority: <see cref="FileStream"/> name (when it carries an extension), then magic-byte detection
/// (seekable streams only), then the default <c>audio.mp3</c>.
/// </summary>
/// <param name="audioSpeechStream">The audio stream that is about to be uploaded.</param>
/// <returns>
/// A filename for the multipart file part. For seekable streams the position is the same on return
/// as it was on entry.
/// </returns>
private static string ResolveFilename(Stream audioSpeechStream)
{
    const int FormatDetectionByteCount = 12;

    if (audioSpeechStream is FileStream fileStream)
    {
        string name = Path.GetFileName(fileStream.Name);

        // A name without an extension (an extensionless file, or presumably the placeholder name of a
        // handle-based FileStream) carries no format hint, so fall through to content sniffing instead.
        if (Path.HasExtension(name))
        {
            return name;
        }
    }

    // Non-seekable streams cannot be rewound after peeking, so they get the default name.
    if (!audioSpeechStream.CanSeek)
    {
        return "audio.mp3";
    }

    // Peek at the header to detect the audio format, then rewind.
    byte[] header = new byte[FormatDetectionByteCount];
    long startPosition = audioSpeechStream.Position;
    int bytesRead = 0;
    try
    {
        // Read may return fewer bytes than requested; loop until the buffer is full or the stream ends.
        while (bytesRead < header.Length)
        {
            int n = audioSpeechStream.Read(header, bytesRead, header.Length - bytesRead);
            if (n <= 0)
            {
                break;
            }

            bytesRead += n;
        }
    }
    finally
    {
        // Rewind even when a read throws, so the caller's stream is not left part-way through the header.
        audioSpeechStream.Position = startPosition;
    }

    return $"audio.{DetectAudioExtension(header.AsSpan(0, bytesRead))}";
}

/// <summary>Detects the audio format extension from the leading bytes of the audio data.</summary>
private static string DetectAudioExtension(ReadOnlySpan<byte> header)

@jozkee jozkee Jun 16, 2026

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For reference, the formats OpenAI supports are mp3, mp4, mpeg, mpga, m4a, wav, and webm. Below are quotes from the specs that back the signature matching performed in this method:

  1. WAV — RIFF at offset 0, WAVE at offset 8
    Source: Microsoft Multimedia Programming Interface and Data Specifications 1.0 (August 1991), referenced from:
    https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html

Field Length Contents
ckID 4 Chunk ID: "RIFF"
cksize 4 Chunk size: 4+n
WAVEID 4 WAVE ID: "WAVE"

And later under Examples, the full structure shows bytes 0–3 = RIFF, bytes 4–7 = size, and the WAVEID field at bytes 8–11 is WAVE.

  2. MP3 / MPEG / MPGA — ID3 at offset 0, or frame sync 0xFF 0xE_
    Source: http://www.mp3-tech.org/programmer/frame_header.html (authoritative MP3 technical reference, derived from ISO/IEC 11172-3)

Verified citation (exact text):

The first twelve bits (or first eleven bits in the case of the MPEG 2.5 extension) of a frame header are always set to 1 and are called "frame sync".

And the header table shows:

Sign Length (bits) Position (bits) Description
A 11 (31-21) Frame sync (all bits must be set)
11 bits set = bytes 0xFF + top 3 bits of next byte set = (header[1] & 0xE0) == 0xE0

For ID3v2 tags preceding MP3 data:
Source: https://id3.org/id3v2.3.0 — Section 3.1 "ID3v2 header"

"The first three bytes of the tag are always "ID3" to indicate that this is an ID3v2 tag"

  3. MP4 / M4A — ftyp at offset 4
    Source: W3C Note "ISO BMFF Byte Stream Format" (referencing ISO/IEC 14496-12 "ISO Base Media File Format"):
    https://www.w3.org/TR/mse-byte-stream-format-isobmff/

Verified citation (exact text):

An ISO BMFF initialization segment is defined in this specification as a single File Type Box (ftyp) followed by a single Movie Box (moov).

Per ISO 14496-12 box format: bytes 0–3 = box size (uint32 big-endian), bytes 4–7 = box type (FourCC). The first box MUST be ftyp.

  4. WebM — 0x1A 0x45 0xDF 0xA3 at offset 0
    Source: RFC 8794 — "Extensible Binary Meta Language" (IETF Standards Track), Section 8.1 "EBML Header":
    https://www.rfc-editor.org/rfc/rfc8794.txt

Verified citation (exact text from Section 8.1):

The EBML Header MUST contain a single Master Element with an Element Name of "EBML" and Element ID of "0x1A45DFA3" (see Section 11.2.1)

WebM is a profile of Matroska (RFC 9559), which is an EBML Document Type. Every WebM file begins with the EBML Header whose first element has ID 0x1A45DFA3.

{
    // Each check guards on the available length first, so short or empty headers simply fall through
    // to the default at the bottom.

    // WAV: "RIFF" at offset 0 and "WAVE" at offset 8.
    if (header.Length >= 12 &&
        header.Slice(0, 4).SequenceEqual("RIFF"u8) &&
        header.Slice(8, 4).SequenceEqual("WAVE"u8))
    {
        return "wav";
    }

    // WebM/Matroska: EBML header ID at offset 0.
    if (header.Length >= 4 &&
        header.Slice(0, 4).SequenceEqual((ReadOnlySpan<byte>)[0x1A, 0x45, 0xDF, 0xA3]))
    {
        return "webm";
    }

    // M4A/MP4: ISO BMFF "ftyp" box type at offset 4.
    if (header.Length >= 8 &&
        header.Slice(4, 4).SequenceEqual("ftyp"u8))
    {
        return "m4a";
    }

    // FLAC: "fLaC" stream marker at offset 0.
    if (header.Length >= 4 &&
        header.Slice(0, 4).SequenceEqual("fLaC"u8))
    {
        return "flac";
    }

    // Ogg: "OggS" page capture pattern at offset 0.
    if (header.Length >= 4 &&
        header.Slice(0, 4).SequenceEqual("OggS"u8))
    {
        return "ogg";
    }

    // MP3: ID3v2 tag at offset 0.
    if (header.Length >= 3 &&
        header.Slice(0, 3).SequenceEqual("ID3"u8))
    {
        return "mp3";
    }

    // MP3: MPEG frame sync word (11 set bits).
    if (header.Length >= 2 &&
        header[0] == 0xFF && (header[1] & 0xE0) == 0xE0)
    {
        return "mp3";
    }

    // Unrecognized or too-short header: keep the historical default.
    return "mp3";
}

/// <summary>Converts an extensions options instance to an OpenAI transcription options instance.</summary>
private AudioTranscriptionOptions ToOpenAITranscriptionOptions(SpeechToTextOptions? options)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,28 @@

<!-- Takes the audio sample files out of the None item group; the same files are declared as EmbeddedResource items in the ItemGroup that follows. -->
<ItemGroup>
<None Remove="Resources\audio001.mp3" />
<None Remove="Resources\audio001.wav" />
<None Remove="Resources\audio001.m4a" />
<None Remove="Resources\audio001.webm" />
<None Remove="Resources\audio001_noid3.mp3" />
</ItemGroup>

<!--
Embeds the audio samples (one per container format, plus an MP3 named "noid3") into the assembly.
CopyToOutputDirectory=Never keeps the files themselves out of the build output.
NOTE(review): presumably these are the resources the integration tests load via GetManifestResourceStream; confirm against the test project.
-->
<ItemGroup>
<EmbeddedResource Include="Resources\audio001.mp3">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</EmbeddedResource>
<EmbeddedResource Include="Resources\audio001.wav">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</EmbeddedResource>
<EmbeddedResource Include="Resources\audio001.m4a">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</EmbeddedResource>
<EmbeddedResource Include="Resources\audio001.webm">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</EmbeddedResource>
<EmbeddedResource Include="Resources\audio001_noid3.mp3">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</EmbeddedResource>
<!-- Shared image pulled in from outside the project folder and linked under Resources\dotnet.png. -->
<EmbeddedResource Include="..\..\Shared\ImageDataUri\dotnet.png" Link="Resources\dotnet.png" />
</ItemGroup>

Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,23 @@ public virtual async Task GetStreamingTextAsync_SingleStreamingResponseChoice()
Assert.Contains("gym", responseText, StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Sends each embedded audio sample through <c>GetTextAsync</c> as a plain stream and checks
/// that the transcription still mentions the expected word.
/// </summary>
/// <param name="fileName">Name of the embedded audio resource to transcribe.</param>
[ConditionalTheory]
[InlineData("audio001.mp3")]
[InlineData("audio001_noid3.mp3")]
[InlineData("audio001.wav")]
[InlineData("audio001.m4a")]
[InlineData("audio001.webm")]
public virtual async Task GetTextAsync_AutoDetectsAudioFormat(string fileName)
{
    SkipIfNotEnabled();

    // Arrange: the sample is handed over as a stream obtained from the embedded resources.
    using Stream sample = GetAudioStream(fileName);

    // Act
    SpeechToTextResponse transcription = await _client.GetTextAsync(sample);

    // Assert
    Assert.NotNull(transcription);
    Assert.Contains("gym", transcription.Text, StringComparison.OrdinalIgnoreCase);
}

private static Stream GetAudioStream(string fileName)
{
using Stream? s = typeof(SpeechToTextClientIntegrationTests).Assembly.GetManifestResourceStream($"Microsoft.Extensions.AI.Resources.{fileName}");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ public class VerbatimMultiPartHttpHandler(string expectedInput, string sentJsonO
{
public string? ExpectedRequestUriContains { get; init; }

/// <summary>
/// Gets the filename the file part of the multipart request is expected to carry.
/// When <see langword="null"/>, the filename is not checked.
/// </summary>
public string? ExpectedAudioFilename { get; init; }

protected override async Task<HttpResponseMessage> SendAsync(
HttpRequestMessage request,
CancellationToken cancellationToken)
Expand Down Expand Up @@ -111,8 +113,16 @@ protected override async Task<HttpResponseMessage> SendAsync(
// Text field
string name = ExtractNameFromHeaders(headers);

// Skip file fields
if (!name.StartsWith("file"))
// For file fields, optionally check the filename
if (name.StartsWith("file"))
{
if (ExpectedAudioFilename is not null)
{
string? actualFilename = ExtractFilenameFromHeaders(headers);
Assert.Equal(ExpectedAudioFilename, actualFilename);
}
}
else
{
if (parameters.ContainsKey(name))
{
Expand Down Expand Up @@ -185,6 +195,25 @@ private static string ExtractNameFromHeaders(string headers)
return headers.Substring(start, end - start).Trim('"');
}

/// <summary>Extracts the value of the <c>filename=</c> parameter from the header text of a multipart part.</summary>
/// <param name="headers">The header text of a single multipart part.</param>
/// <returns>
/// The filename with surrounding double quotes removed, or <see langword="null"/> when the
/// headers contain no <c>filename=</c> parameter.
/// </returns>
private static string? ExtractFilenameFromHeaders(string headers)
{
    const string FilenamePrefix = "filename=";

    // Ordinal search: header parameter names are protocol tokens, not culture-sensitive text.
    int start = headers.IndexOf(FilenamePrefix, System.StringComparison.Ordinal);
    if (start < 0)
    {
        return null;
    }

    start += FilenamePrefix.Length;

    // The value ends at the next parameter separator or at the end of the header line, whichever
    // comes first; stopping only at ';' would swallow any header lines that follow the filename.
    int end = headers.IndexOfAny(new[] { ';', '\r', '\n' }, start);
    if (end < 0)
    {
        end = headers.Length;
    }

    return headers.Substring(start, end - start).Trim('"');
}
Comment thread
jozkee marked this conversation as resolved.

/// <summary>Removes every whitespace character from <paramref name="text"/>.</summary>
/// <param name="text">The text to strip; may be <see langword="null"/>.</param>
/// <returns><see langword="null"/> when <paramref name="text"/> is null; otherwise the text with all whitespace removed.</returns>
public static string? RemoveWhiteSpace(string? text) =>
    text is null ? null :

    // "\s+" rather than "\s*": the latter also matches the empty string at every position,
    // producing no-op replacements without changing the result.
    Regex.Replace(text, @"\s+", string.Empty);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -352,4 +352,87 @@ private static ISpeechToTextClient CreateSpeechToTextClient(HttpClient httpClien
new OpenAIClient(new ApiKeyCredential("apikey"), new OpenAIClientOptions { Transport = new HttpClientPipelineTransport(httpClient) })
.GetAudioClient(modelId)
.AsISpeechToTextClient();

/// <summary>
/// Twelve-byte stream headers paired with the filename the client is expected to send for them.
/// </summary>
public static TheoryData<byte[], string> AudioFormatDetectionData
{
    get
    {
        TheoryData<byte[], string> cases = new();

        // WAV: "RIFF", four size bytes, then "WAVE".
        cases.Add(
            new byte[] { (byte)'R', (byte)'I', (byte)'F', (byte)'F', 0x00, 0x00, 0x00, 0x00, (byte)'W', (byte)'A', (byte)'V', (byte)'E' },
            "audio.wav");

        // MP3 carrying an ID3v2 tag.
        cases.Add(
            new byte[] { 0x49, 0x44, 0x33, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
            "audio.mp3");

        // MP3 starting directly with an MPEG sync word (0xFF 0xFB).
        cases.Add(
            new byte[] { 0xFF, 0xFB, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
            "audio.mp3");

        // WebM/Matroska: EBML header ID.
        cases.Add(
            new byte[] { 0x1A, 0x45, 0xDF, 0xA3, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
            "audio.webm");

        // M4A/MP4: ISO BMFF box size followed by "ftyp" and the "M4A " brand.
        cases.Add(
            new byte[] { 0x00, 0x00, 0x00, 0x20, 0x66, 0x74, 0x79, 0x70, 0x4D, 0x34, 0x41, 0x20 },
            "audio.m4a");

        // Bytes matching no known signature fall back to mp3.
        cases.Add(
            new byte[] { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C },
            "audio.mp3");

        return cases;
    }
}

/// <summary>
/// Checks that the non-streaming call names the uploaded file part after the format
/// detected from the stream's leading bytes.
/// </summary>
/// <param name="header">Leading bytes served by the in-memory audio stream.</param>
/// <param name="expectedFilename">Filename the handler asserts on the file part.</param>
[Theory]
[MemberData(nameof(AudioFormatDetectionData))]
public async Task GetTextAsync_DetectsAudioFormatFromMagicBytes(byte[] header, string expectedFilename)
{
    // Passed to the handler as its expected input.
    const string ExpectedRequest = """
        {
        "model": "gpt-4o-transcribe"
        }
        """;

    // Passed to the handler as the output it sends back.
    const string CannedResponse = """
        {
        "text":"Hello."
        }
        """;

    // Arrange: a MemoryStream is not a FileStream, so no file name is available to the client.
    using MemoryStream audio = new(header);

    using VerbatimMultiPartHttpHandler multipartHandler = new(ExpectedRequest, CannedResponse)
    {
        ExpectedAudioFilename = expectedFilename,
    };
    using HttpClient http = new(multipartHandler);
    using ISpeechToTextClient speechClient = CreateSpeechToTextClient(http, "gpt-4o-transcribe");

    // Act: the filename assertion itself runs inside the handler while the request is sent.
    SpeechToTextResponse result = await speechClient.GetTextAsync(audio);

    // Assert
    Assert.NotNull(result);
}

/// <summary>
/// Checks that the streaming call names the uploaded file part after the format
/// detected from the stream's leading bytes.
/// </summary>
/// <param name="header">Leading bytes served by the in-memory audio stream.</param>
/// <param name="expectedFilename">Filename the handler asserts on the file part.</param>
[Theory]
[MemberData(nameof(AudioFormatDetectionData))]
public async Task GetStreamingTextAsync_DetectsAudioFormatFromMagicBytes(byte[] header, string expectedFilename)
{
    // Passed to the handler as its expected input; the streaming path adds the "stream" flag.
    const string ExpectedRequest = """
        {
        "model": "gpt-4o-transcribe",
        "stream":true
        }
        """;

    // Passed to the handler as the output it sends back.
    const string CannedResponse = """
        {
        "text":"Hello."
        }
        """;

    // Arrange: a MemoryStream is not a FileStream, so no file name is available to the client.
    using MemoryStream audio = new(header);

    using VerbatimMultiPartHttpHandler multipartHandler = new(ExpectedRequest, CannedResponse)
    {
        ExpectedRequestUriContains = "audio/transcriptions",
        ExpectedAudioFilename = expectedFilename,
    };
    using HttpClient http = new(multipartHandler);
    using ISpeechToTextClient speechClient = CreateSpeechToTextClient(http, "gpt-4o-transcribe");

    // Act + Assert: enumerate the updates; the filename assertion runs inside the handler.
    await foreach (SpeechToTextResponseUpdate chunk in speechClient.GetStreamingTextAsync(audio))
    {
        Assert.NotNull(chunk);
    }
}
}
Loading