Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 176 additions & 2 deletions Sources/Fluid/Networking/ModelDownloader.swift
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,13 @@ final class HuggingFaceModelDownloader {
let files = try await listFilesRecursively(relativePath: item.path)
for rel in files {
let dest = targetRoot.appendingPathComponent(rel)
if FileManager.default.fileExists(atPath: dest.path) == false {
if self.needsDownload(relativePath: rel, at: dest) {
pendingFiles.append(rel)
Comment on lines +107 to 108

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Validate caches before provider-level skips

For Nemotron/external CoreML caches that already look complete, this new cached-markup check is never reached: NemotronProvider.prepare returns after a file-existence-only modelsExistOnDisk() check, and ExternalCoreMLTranscriptionProvider.ensureArtifactsPresent can return after validation that only proves required entries/manifest metadata exist. A cache with a valid manifest or .mlpackage directory but an HTML page saved inside a binary therefore still skips ensureModelsPresent and remains permanently corrupted; move this validation into those preflight checks or force the downloader to run before returning.

Useful? React with 👍 / 👎.

}
}
} else {
let dest = targetRoot.appendingPathComponent(item.path)
if FileManager.default.fileExists(atPath: dest.path) == false {
if self.needsDownload(relativePath: item.path, at: dest) {
pendingFiles.append(item.path)
}
}
Expand Down Expand Up @@ -170,6 +170,36 @@ final class HuggingFaceModelDownloader {
self.requiredItemsList
}

/// Decides whether `relativePath` needs to be (re)downloaded into `destination`.
///
/// A file is pending when it is missing, OR when it is present but its cached content
/// looks like an HTML/markup payload — a corrupt artifact cached before download-time
/// content validation existed (see #353). `fileExists` alone would leave such a payload
/// stuck forever, because `downloadFile` (and its validator) only runs for pending files.
/// A present markup file is deleted here so a clean copy is fetched; on a read error the
/// file is left in place and treated as valid, so we never delete on uncertainty.
private func needsDownload(relativePath: String, at destination: URL) -> Bool {
guard FileManager.default.fileExists(atPath: destination.path) else {
return true
}
guard Self.cachedFileIsMarkup(at: destination) else {
return false
}
DebugLogger.shared.warning(
"[ModelDL] Cached file is an HTML/markup page, not model data; deleting to re-download: \(relativePath)",
source: "ModelDownloader"
)
do {
try FileManager.default.removeItem(at: destination)
} catch {
DebugLogger.shared.error(
"[ModelDL] Failed to delete corrupt cached file \(relativePath): \(error.localizedDescription)",
source: "ModelDownloader"
)
}
return true
}

private func downloadDirectory(relativePath: String, to destination: URL) async throws {
try FileManager.default.createDirectory(at: destination, withIntermediateDirectories: true)

Expand All @@ -196,13 +226,19 @@ final class HuggingFaceModelDownloader {
continuation.resume(throwing: NSError(domain: "HF", code: http.statusCode))
return
}
// Reject HTML error/block pages (e.g. a corporate proxy returning its
// notification page with HTTP 200) before persisting them as a model
// file, otherwise a corrupt payload is cached permanently. See #353.
try Self.validateDownloadedFile(at: tempUrl, response: response, relativePath: relativePath)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Validate cached artifacts before skipping downloads

This validation only runs for freshly downloaded temp files, but ensureModelsPresent builds pendingFiles using a plain fileExists check before downloadFile is called. For users who already hit #353 and have an HTML block page cached at a model path, that file is still treated as present and this new validator is never reached, so the corrupt model remains permanently stuck until they manually clear the cache. Please validate existing artifacts or force a re-download when a present file is detected as markup.

Useful? React with 👍 / 👎.

try FileManager.default.createDirectory(at: destination.deletingLastPathComponent(), withIntermediateDirectories: true)
if FileManager.default.fileExists(atPath: destination.path) {
try FileManager.default.removeItem(at: destination)
}
try FileManager.default.moveItem(at: tempUrl, to: destination)
continuation.resume()
} catch {
// Never leave a rejected/partial payload behind.
try? FileManager.default.removeItem(at: tempUrl)
continuation.resume(throwing: error)
}
}
Expand All @@ -213,6 +249,144 @@ final class HuggingFaceModelDownloader {
}
}

// MARK: - Content Validation

/// Validates a freshly-downloaded artifact before it is persisted as a model file.
///
/// A network proxy / secure web gateway can return an HTML (or XML) block page with
/// HTTP 200 in place of the real file. Persisting that markup (e.g. as `coremldata.bin`)
/// permanently caches a corrupt model. We reject any payload that looks like HTML/XML
/// markup — by its `Content-Type` or by its leading bytes — since no model artifact
/// (CoreML binary, JSON vocab, `.mil`) is a markup document. See issue #353.
static func validateDownloadedFile(at fileURL: URL, response: URLResponse?, relativePath: String) throws {
if let http = response as? HTTPURLResponse,
let contentType = http.value(forHTTPHeaderField: "Content-Type") {
let lowered = contentType.lowercased()
if lowered.contains("text/html") || lowered.contains("text/xml") || lowered.contains("application/xml") {
throw Self.invalidContentError(
relativePath: relativePath,
detail: "the server returned a markup page (Content-Type: \(contentType))"
)
}
}

// Sniff the leading bytes in case markup was returned without a markup Content-Type.
// Read a small prefix only — model files can be gigabytes.
let handle = try FileHandle(forReadingFrom: fileURL)
defer { try? handle.close() }
let prefix = (try? handle.read(upToCount: 512)) ?? Data()
if Self.looksLikeHTML(prefix) {
throw Self.invalidContentError(
relativePath: relativePath,
detail: "the downloaded file is an HTML/markup document, not the expected model data"
)
}
}

/// Returns `true` if a file already on disk is an HTML/markup payload rather than real
/// model data — a corrupt artifact cached before download-time validation existed (#353).
///
/// This is the cached-file analog of `validateDownloadedFile`'s byte-sniff: it reuses the
/// same `looksLikeHTML` check on a small leading prefix (model files can be gigabytes, so
/// only 512 bytes are read). There is no `URLResponse` for a cached file, so only the
/// content is inspected, not a `Content-Type`. Returns `false` (treat as valid) on any
/// read error, so an unreadable file is never deleted on uncertainty.
static func cachedFileIsMarkup(at fileURL: URL) -> Bool {
guard let handle = try? FileHandle(forReadingFrom: fileURL) else {
return false
}
defer { try? handle.close() }
let prefix = (try? handle.read(upToCount: 512)) ?? Data()
return Self.looksLikeHTML(prefix)
}

/// Returns `true` if any cached payload under `relativePaths` (each resolved against `root`)
/// is an HTML/markup document rather than real model data — the cached-*tree* analog of
/// `cachedFileIsMarkup`, intended for a provider preflight to call before trusting a present
/// cache and skipping the downloader. The downloader itself already re-validates each file via
/// `needsDownload`, but a preflight that returns on file-existence alone never reaches it, so a
/// corrupt-but-present cache would slip through (see #353).
///
/// Each relative path may be a regular file or a directory (e.g. a `.mlpackage` bundle).
/// Directories are scanned recursively and every regular file inside is byte-sniffed with
/// `cachedFileIsMarkup`, reusing the single `looksLikeHTML` detector — there is no second
/// markup heuristic. Conservative on uncertainty, mirroring `cachedFileIsMarkup`: a path that
/// does not exist, a file that cannot be read, or a directory that cannot be enumerated is
/// skipped (treated as non-markup), so a valid cache is never reported corrupt. An empty
/// required directory therefore yields `false` here — its incompleteness is the existence
/// check's concern, not this markup check's.
static func cachedPayloadContainsMarkup(root: URL, relativePaths: [String]) -> Bool {
let fileManager = FileManager.default
for relativePath in relativePaths {
let url = root.appendingPathComponent(relativePath)
var isDirectory: ObjCBool = false
guard fileManager.fileExists(atPath: url.path, isDirectory: &isDirectory) else {
continue
}
if isDirectory.boolValue {
guard let enumerator = fileManager.enumerator(
at: url,
includingPropertiesForKeys: [.isRegularFileKey],
options: [.skipsHiddenFiles]
) else {
continue
}
for case let fileURL as URL in enumerator {
let isRegularFile = (try? fileURL.resourceValues(forKeys: [.isRegularFileKey]))?.isRegularFile ?? false
guard isRegularFile else { continue }
if Self.cachedFileIsMarkup(at: fileURL) {
return true
}
}
} else if Self.cachedFileIsMarkup(at: url) {
return true
}
}
return false
}

/// Returns `true` if `data` begins with an HTML / XML markup marker, ignoring a leading
/// UTF-8 BOM and ASCII whitespace.
///
/// No artifact this downloader fetches legitimately begins with `<`: CoreML compiled
/// `.mlmodelc` / `.mlpackage` payloads are binary (`coremldata.bin`, `weights/weight.bin`,
/// `model.mlmodel` protobuf) or JSON (`metadata.json`, `Manifest.json`) starting with
/// `{` / `[`; the MIL program text (`model.mil`) starts with `program`; the vocab JSON
/// starts with `{`; and `tokenizer.model` is a SentencePiece binary. So any payload that,
/// after BOM + whitespace stripping, starts with `<` followed by a markup-ish byte is a
/// proxy/block page or a markup document standing in for the real file — reject it. This
/// catches `<!doctype`, `<html`, `<head>`, `<body>`, `<script>`, `<meta>`, comments
/// (`<!-- -->`) and XML / `<?xml` declarations, not just the two prefixes we used to
/// match. See issue #353.
static func looksLikeHTML(_ data: Data) -> Bool {
var bytes = [UInt8](data.prefix(512))
if bytes.starts(with: [0xEF, 0xBB, 0xBF]) {
bytes.removeFirst(3)
}
while let first = bytes.first,
first == 0x20 || first == 0x09 || first == 0x0A || first == 0x0D {
bytes.removeFirst()
}
// Must begin with `<` (0x3C)…
guard bytes.first == 0x3C, bytes.count >= 2 else {
return false
}
// …immediately followed by a markup-ish byte: an ASCII letter (a tag such as
// `<html`), `!` (0x21 — `<!doctype`, `<!--`), `?` (0x3F — `<?xml`), or `/` (0x2F —
// a stray closing tag). Requiring this second byte avoids over-rejecting a
// hypothetical text artifact that merely contains a stray `<` not followed by markup.
let second = bytes[1]
let isAsciiLetter = (second >= 0x41 && second <= 0x5A) || (second >= 0x61 && second <= 0x7A)
return isAsciiLetter || second == 0x21 || second == 0x3F || second == 0x2F
}

private static func invalidContentError(relativePath: String, detail: String) -> NSError {
NSError(domain: "HF", code: -3, userInfo: [
NSLocalizedDescriptionKey:
"Could not download \(relativePath): \(detail). A network proxy or firewall may be blocking model downloads.",
])
}

private final class DownloadProgressDelegate: NSObject, URLSessionDownloadDelegate {
private let onProgress: ((Double) -> Void)?
var onFinish: ((URL, URLResponse) -> Void)?
Expand Down
29 changes: 28 additions & 1 deletion Sources/Fluid/Services/ExternalCoreMLTranscriptionProvider.swift
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,14 @@ final class ExternalCoreMLTranscriptionProvider: TranscriptionProvider {
try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true)
let isManagedDirectory = Self.isAppManagedArtifactsDirectory(directory, spec: spec)

// `validateArtifacts` proves the required entries exist and the manifest JSON decodes, but
// it does NOT byte-check the `.mlpackage` binaries — a network proxy can have returned an
// HTML block page (HTTP 200) in place of one, persisting markup as a model file. Re-sniff
// the present artifacts so such a payload forces the downloader to run (it then deletes +
// re-fetches the corrupt files via `needsDownload`) instead of being trusted forever. The
// outdated-bundle-stamp refresh below is preserved and takes precedence: a stamp-stale
// managed cache is still fully removed even if it is also markup-corrupt, so the bundle
// is wholly refreshed rather than only the corrupt files re-fetched. See #353.
if spec.validateArtifacts(at: directory) {
if isManagedDirectory, self.artifactBundleStampMatches(spec: spec, directory: directory) == false {
DebugLogger.shared.warning(
Expand All @@ -238,6 +246,11 @@ final class ExternalCoreMLTranscriptionProvider: TranscriptionProvider {
)
try FileManager.default.removeItem(at: directory)
try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true)
} else if Self.cachedArtifactsAreMarkupCorrupt(spec: spec, directory: directory) {
DebugLogger.shared.warning(
"ExternalCoreML: cached artifacts for \(directory.lastPathComponent) contain an HTML/markup payload (corrupt); re-downloading",
source: "ExternalCoreML"
)
} else {
DebugLogger.shared.info(
"ExternalCoreML: artifact validation passed for \(directory.lastPathComponent)",
Expand All @@ -248,7 +261,8 @@ final class ExternalCoreMLTranscriptionProvider: TranscriptionProvider {
}
}

if spec.validateArtifacts(at: directory) {
if spec.validateArtifacts(at: directory),
!Self.cachedArtifactsAreMarkupCorrupt(spec: spec, directory: directory) {
DebugLogger.shared.info(
"ExternalCoreML: artifact validation passed for \(directory.lastPathComponent)",
source: "ExternalCoreML"
Expand Down Expand Up @@ -307,6 +321,19 @@ final class ExternalCoreMLTranscriptionProvider: TranscriptionProvider {
return directory.standardizedFileURL.path == defaultCacheDirectory.standardizedFileURL.path
}

/// `true` if any required cached artifact is an HTML/markup payload instead of model data — a
/// corrupt cache a markup-blind `validateArtifacts` check would otherwise trust (#353). Reuses
/// the downloader's shared byte-sniff; conservative on read errors (never flags on uncertainty).
private static func cachedArtifactsAreMarkupCorrupt(
spec: ExternalCoreMLASRModelSpec,
directory: URL
) -> Bool {
HuggingFaceModelDownloader.cachedPayloadContainsMarkup(
root: directory,
relativePaths: spec.requiredEntries
)
}

private static func makeError(_ description: String) -> NSError {
NSError(
domain: "ExternalCoreMLTranscriptionProvider",
Expand Down
16 changes: 15 additions & 1 deletion Sources/Fluid/Services/NemotronProvider.swift
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,27 @@ final class NemotronProvider: TranscriptionProvider {
}
try FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)

if self.modelsExistOnDisk() {
// A file-existence check alone would trust a corrupt cache: a network proxy can have
// returned an HTML block page (HTTP 200) in place of a model file, persisting markup as
// e.g. a `.mlpackage` binary or `tokenizer.model`. Re-sniff the present artifacts so such
// a payload forces a re-download (the downloader then deletes + re-fetches the bad files
// via `needsDownload`) instead of being loaded as a model forever. See #353.
let modelsPresent = self.modelsExistOnDisk()
let cachedArtifactsCorrupt = modelsPresent
&& HuggingFaceModelDownloader.cachedPayloadContainsMarkup(root: dir, relativePaths: self.requiredFiles)
if modelsPresent && !cachedArtifactsCorrupt {
DebugLogger.shared.info(
"Nemotron: artifacts present at \(dir.path); skipping download",
source: "Nemotron"
)
progressHandler?(0.8)
} else {
if cachedArtifactsCorrupt {
DebugLogger.shared.warning(
"Nemotron: cached artifacts at \(dir.path) contain an HTML/markup payload (corrupt); re-downloading",
source: "Nemotron"
)
}
DebugLogger.shared.info(
"Nemotron: artifacts missing; downloading from \(self.repositoryOwner)/\(self.repositoryName)",
source: "Nemotron"
Expand Down
6 changes: 6 additions & 0 deletions Sources/Fluid/UI/WelcomeView.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1903,6 +1903,12 @@ struct OnboardingFlowView: View {
try await self.asr.ensureAsrReady()
} catch {
DebugLogger.shared.error("Failed to prepare onboarding voice model \(route.model.displayName): \(error)", source: "OnboardingFlowView")
// Surface the failure in the UI instead of only logging it, so the user
// isn't stuck at a disabled button. The shared ContentView alert (bound to
// asr.showError) presents this during onboarding. See #355.
self.asr.errorTitle = "Voice Model Setup Failed"
self.asr.errorMessage = error.localizedDescription
self.asr.showError = true
}
await self.asr.checkIfModelsExistAsync()
}
Expand Down
Loading
Loading