altic-dev · altic-dev · Jun 26, 2026 · Jun 19, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/Sources/Fluid/Networking/ModelDownloader.swift b/Sources/Fluid/Networking/ModelDownloader.swift
@@ -104,13 +104,13 @@ final class HuggingFaceModelDownloader {
                 let files = try await listFilesRecursively(relativePath: item.path)
                 for rel in files {
                     let dest = targetRoot.appendingPathComponent(rel)
-                    if FileManager.default.fileExists(atPath: dest.path) == false {
+                    if self.needsDownload(relativePath: rel, at: dest) {
                         pendingFiles.append(rel)
                     }
                 }
             } else {
                 let dest = targetRoot.appendingPathComponent(item.path)
-                if FileManager.default.fileExists(atPath: dest.path) == false {
+                if self.needsDownload(relativePath: item.path, at: dest) {
                     pendingFiles.append(item.path)
                 }
             }
@@ -170,6 +170,36 @@ final class HuggingFaceModelDownloader {
         self.requiredItemsList
     }
 
+    /// Decides whether `relativePath` needs to be (re)downloaded into `destination`.
+    ///
+    /// A file is pending when it is missing, OR when it is present but its cached content
+    /// looks like an HTML/markup payload — a corrupt artifact cached before download-time
+    /// content validation existed (see #353). `fileExists` alone would leave such a payload
+    /// stuck forever, because `downloadFile` (and its validator) only runs for pending files.
+    /// A present markup file is deleted here so a clean copy is fetched; on a read error the
+    /// file is left in place and treated as valid, so we never delete on uncertainty.
+    private func needsDownload(relativePath: String, at destination: URL) -> Bool {
+        guard FileManager.default.fileExists(atPath: destination.path) else {
+            return true
+        }
+        guard Self.cachedFileIsMarkup(at: destination) else {
+            return false
+        }
+        DebugLogger.shared.warning(
+            "[ModelDL] Cached file is an HTML/markup page, not model data; deleting to re-download: \(relativePath)",
+            source: "ModelDownloader"
+        )
+        do {
+            try FileManager.default.removeItem(at: destination)
+        } catch {
+            DebugLogger.shared.error(
+                "[ModelDL] Failed to delete corrupt cached file \(relativePath): \(error.localizedDescription)",
+                source: "ModelDownloader"
+            )
+        }
+        return true
+    }
+
     private func downloadDirectory(relativePath: String, to destination: URL) async throws {
         try FileManager.default.createDirectory(at: destination, withIntermediateDirectories: true)
 
@@ -196,13 +226,19 @@ final class HuggingFaceModelDownloader {
                         continuation.resume(throwing: NSError(domain: "HF", code: http.statusCode))
                         return
                     }
+                    // Reject HTML error/block pages (e.g. a corporate proxy returning its
+                    // notification page with HTTP 200) before persisting them as a model
+                    // file, otherwise a corrupt payload is cached permanently. See #353.
+                    try Self.validateDownloadedFile(at: tempUrl, response: response, relativePath: relativePath)
                     try FileManager.default.createDirectory(at: destination.deletingLastPathComponent(), withIntermediateDirectories: true)
                     if FileManager.default.fileExists(atPath: destination.path) {
                         try FileManager.default.removeItem(at: destination)
                     }
                     try FileManager.default.moveItem(at: tempUrl, to: destination)
                     continuation.resume()
                 } catch {
+                    // Never leave a rejected/partial payload behind.
+                    try? FileManager.default.removeItem(at: tempUrl)
                     continuation.resume(throwing: error)
                 }
             }
@@ -213,6 +249,144 @@ final class HuggingFaceModelDownloader {
         }
     }
 
+    // MARK: - Content Validation
+
+    /// Validates a freshly-downloaded artifact before it is persisted as a model file.
+    ///
+    /// A network proxy / secure web gateway can return an HTML (or XML) block page with
+    /// HTTP 200 in place of the real file. Persisting that markup (e.g. as `coremldata.bin`)
+    /// permanently caches a corrupt model. We reject any payload that looks like HTML/XML
+    /// markup — by its `Content-Type` or by its leading bytes — since no model artifact
+    /// (CoreML binary, JSON vocab, `.mil`) is a markup document. See issue #353.
+    static func validateDownloadedFile(at fileURL: URL, response: URLResponse?, relativePath: String) throws {
+        if let http = response as? HTTPURLResponse,
+           let contentType = http.value(forHTTPHeaderField: "Content-Type") {
+            let lowered = contentType.lowercased()
+            if lowered.contains("text/html") || lowered.contains("text/xml") || lowered.contains("application/xml") {
+                throw Self.invalidContentError(
+                    relativePath: relativePath,
+                    detail: "the server returned a markup page (Content-Type: \(contentType))"
+                )
+            }
+        }
+
+        // Sniff the leading bytes in case markup was returned without a markup Content-Type.
+        // Read a small prefix only — model files can be gigabytes.
+        let handle = try FileHandle(forReadingFrom: fileURL)
+        defer { try? handle.close() }
+        let prefix = (try? handle.read(upToCount: 512)) ?? Data()
+        if Self.looksLikeHTML(prefix) {
+            throw Self.invalidContentError(
+                relativePath: relativePath,
+                detail: "the downloaded file is an HTML/markup document, not the expected model data"
+            )
+        }
+    }
+
+    /// Returns `true` if a file already on disk is an HTML/markup payload rather than real
+    /// model data — a corrupt artifact cached before download-time validation existed (#353).
+    ///
+    /// This is the cached-file analog of `validateDownloadedFile`'s byte-sniff: it reuses the
+    /// same `looksLikeHTML` check on a small leading prefix (model files can be gigabytes, so
+    /// only 512 bytes are read). There is no `URLResponse` for a cached file, so only the
+    /// content is inspected, not a `Content-Type`. Returns `false` (treat as valid) on any
+    /// read error, so an unreadable file is never deleted on uncertainty.
+    static func cachedFileIsMarkup(at fileURL: URL) -> Bool {
+        guard let handle = try? FileHandle(forReadingFrom: fileURL) else {
+            return false
+        }
+        defer { try? handle.close() }
+        let prefix = (try? handle.read(upToCount: 512)) ?? Data()
+        return Self.looksLikeHTML(prefix)
+    }
+
+    /// Returns `true` if any cached payload under `relativePaths` (each resolved against `root`)
+    /// is an HTML/markup document rather than real model data — the cached-*tree* analog of
+    /// `cachedFileIsMarkup`, intended for a provider preflight to call before trusting a present
+    /// cache and skipping the downloader. The downloader itself already re-validates each file via
+    /// `needsDownload`, but a preflight that returns on file-existence alone never reaches it, so a
+    /// corrupt-but-present cache would slip through (see #353).
+    ///
+    /// Each relative path may be a regular file or a directory (e.g. a `.mlpackage` bundle).
+    /// Directories are scanned recursively and every regular file inside is byte-sniffed with
+    /// `cachedFileIsMarkup`, reusing the single `looksLikeHTML` detector — there is no second
+    /// markup heuristic. Conservative on uncertainty, mirroring `cachedFileIsMarkup`: a path that
+    /// does not exist, a file that cannot be read, or a directory that cannot be enumerated is
+    /// skipped (treated as non-markup), so a valid cache is never reported corrupt. An empty
+    /// required directory therefore yields `false` here — its incompleteness is the existence
+    /// check's concern, not this markup check's.
+    static func cachedPayloadContainsMarkup(root: URL, relativePaths: [String]) -> Bool {
+        let fileManager = FileManager.default
+        for relativePath in relativePaths {
+            let url = root.appendingPathComponent(relativePath)
+            var isDirectory: ObjCBool = false
+            guard fileManager.fileExists(atPath: url.path, isDirectory: &isDirectory) else {
+                continue
+            }
+            if isDirectory.boolValue {
+                guard let enumerator = fileManager.enumerator(
+                    at: url,
+                    includingPropertiesForKeys: [.isRegularFileKey],
+                    options: [.skipsHiddenFiles]
+                ) else {
+                    continue
+                }
+                for case let fileURL as URL in enumerator {
+                    let isRegularFile = (try? fileURL.resourceValues(forKeys: [.isRegularFileKey]))?.isRegularFile ?? false
+                    guard isRegularFile else { continue }
+                    if Self.cachedFileIsMarkup(at: fileURL) {
+                        return true
+                    }
+                }
+            } else if Self.cachedFileIsMarkup(at: url) {
+                return true
+            }
+        }
+        return false
+    }
+
+    /// Returns `true` if `data` begins with an HTML / XML markup marker, ignoring a leading
+    /// UTF-8 BOM and ASCII whitespace.
+    ///
+    /// No artifact this downloader fetches legitimately begins with `<`: CoreML compiled
+    /// `.mlmodelc` / `.mlpackage` payloads are binary (`coremldata.bin`, `weights/weight.bin`,
+    /// `model.mlmodel` protobuf) or JSON (`metadata.json`, `Manifest.json`) starting with
+    /// `{` / `[`; the MIL program text (`model.mil`) starts with `program`; the vocab JSON
+    /// starts with `{`; and `tokenizer.model` is a SentencePiece binary. So any payload that,
+    /// after BOM + whitespace stripping, starts with `<` followed by a markup-ish byte is a
+    /// proxy/block page or a markup document standing in for the real file — reject it. This
+    /// catches `<!doctype`, `<html`, `<head>`, `<body>`, `<script>`, `<meta>`, comments
+    /// (`<!-- -->`) and XML / `<?xml` declarations, not just the two prefixes we used to
+    /// match. See issue #353.
+    static func looksLikeHTML(_ data: Data) -> Bool {
+        var bytes = [UInt8](data.prefix(512))
+        if bytes.starts(with: [0xEF, 0xBB, 0xBF]) {
+            bytes.removeFirst(3)
+        }
+        while let first = bytes.first,
+              first == 0x20 || first == 0x09 || first == 0x0A || first == 0x0D {
+            bytes.removeFirst()
+        }
+        // Must begin with `<` (0x3C)…
+        guard bytes.first == 0x3C, bytes.count >= 2 else {
+            return false
+        }
+        // …immediately followed by a markup-ish byte: an ASCII letter (a tag such as
+        // `<html`), `!` (0x21 — `<!doctype`, `<!--`), `?` (0x3F — `<?xml`), or `/` (0x2F —
+        // a stray closing tag). Requiring this second byte avoids over-rejecting a
+        // hypothetical text artifact that merely contains a stray `<` not followed by markup.
+        let second = bytes[1]
+        let isAsciiLetter = (second >= 0x41 && second <= 0x5A) || (second >= 0x61 && second <= 0x7A)
+        return isAsciiLetter || second == 0x21 || second == 0x3F || second == 0x2F
+    }
+
+    private static func invalidContentError(relativePath: String, detail: String) -> NSError {
+        NSError(domain: "HF", code: -3, userInfo: [
+            NSLocalizedDescriptionKey:
+                "Could not download \(relativePath): \(detail). A network proxy or firewall may be blocking model downloads.",
+        ])
+    }
+
     private final class DownloadProgressDelegate: NSObject, URLSessionDownloadDelegate {
         private let onProgress: ((Double) -> Void)?
         var onFinish: ((URL, URLResponse) -> Void)?

diff --git a/Sources/Fluid/Services/ExternalCoreMLTranscriptionProvider.swift b/Sources/Fluid/Services/ExternalCoreMLTranscriptionProvider.swift
@@ -230,6 +230,14 @@ final class ExternalCoreMLTranscriptionProvider: TranscriptionProvider {
         try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true)
         let isManagedDirectory = Self.isAppManagedArtifactsDirectory(directory, spec: spec)
 
+        // `validateArtifacts` proves the required entries exist and the manifest JSON decodes, but
+        // it does NOT byte-check the `.mlpackage` binaries — a network proxy can have returned an
+        // HTML block page (HTTP 200) in place of one, persisting markup as a model file. Re-sniff
+        // the present artifacts so such a payload forces the downloader to run (it then deletes +
+        // re-fetches the corrupt files via `needsDownload`) instead of being trusted forever. The
+        // outdated-bundle-stamp refresh below is preserved and takes precedence: a stamp-stale
+        // managed cache is still fully removed even if it is also markup-corrupt, so the bundle
+        // is wholly refreshed rather than only the corrupt files re-fetched. See #353.
         if spec.validateArtifacts(at: directory) {
             if isManagedDirectory, self.artifactBundleStampMatches(spec: spec, directory: directory) == false {
                 DebugLogger.shared.warning(
@@ -238,6 +246,11 @@ final class ExternalCoreMLTranscriptionProvider: TranscriptionProvider {
                 )
                 try FileManager.default.removeItem(at: directory)
                 try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true)
+            } else if Self.cachedArtifactsAreMarkupCorrupt(spec: spec, directory: directory) {
+                DebugLogger.shared.warning(
+                    "ExternalCoreML: cached artifacts for \(directory.lastPathComponent) contain an HTML/markup payload (corrupt); re-downloading",
+                    source: "ExternalCoreML"
+                )
             } else {
                 DebugLogger.shared.info(
                     "ExternalCoreML: artifact validation passed for \(directory.lastPathComponent)",
@@ -248,7 +261,8 @@ final class ExternalCoreMLTranscriptionProvider: TranscriptionProvider {
             }
         }
 
-        if spec.validateArtifacts(at: directory) {
+        if spec.validateArtifacts(at: directory),
+           !Self.cachedArtifactsAreMarkupCorrupt(spec: spec, directory: directory) {
             DebugLogger.shared.info(
                 "ExternalCoreML: artifact validation passed for \(directory.lastPathComponent)",
                 source: "ExternalCoreML"
@@ -307,6 +321,19 @@ final class ExternalCoreMLTranscriptionProvider: TranscriptionProvider {
         return directory.standardizedFileURL.path == defaultCacheDirectory.standardizedFileURL.path
     }
 
+    /// `true` if any required cached artifact is an HTML/markup payload instead of model data — a
+    /// corrupt cache a markup-blind `validateArtifacts` check would otherwise trust (#353). Reuses
+    /// the downloader's shared byte-sniff; conservative on read errors (never flags on uncertainty).
+    private static func cachedArtifactsAreMarkupCorrupt(
+        spec: ExternalCoreMLASRModelSpec,
+        directory: URL
+    ) -> Bool {
+        HuggingFaceModelDownloader.cachedPayloadContainsMarkup(
+            root: directory,
+            relativePaths: spec.requiredEntries
+        )
+    }
+
     private static func makeError(_ description: String) -> NSError {
         NSError(
             domain: "ExternalCoreMLTranscriptionProvider",

diff --git a/Sources/Fluid/Services/NemotronProvider.swift b/Sources/Fluid/Services/NemotronProvider.swift
@@ -88,13 +88,27 @@ final class NemotronProvider: TranscriptionProvider {
         }
         try FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)
 
-        if self.modelsExistOnDisk() {
+        // A file-existence check alone would trust a corrupt cache: a network proxy can have
+        // returned an HTML block page (HTTP 200) in place of a model file, persisting markup as
+        // e.g. a `.mlpackage` binary or `tokenizer.model`. Re-sniff the present artifacts so such
+        // a payload forces a re-download (the downloader then deletes + re-fetches the bad files
+        // via `needsDownload`) instead of being loaded as a model forever. See #353.
+        let modelsPresent = self.modelsExistOnDisk()
+        let cachedArtifactsCorrupt = modelsPresent
+            && HuggingFaceModelDownloader.cachedPayloadContainsMarkup(root: dir, relativePaths: self.requiredFiles)
+        if modelsPresent && !cachedArtifactsCorrupt {
             DebugLogger.shared.info(
                 "Nemotron: artifacts present at \(dir.path); skipping download",
                 source: "Nemotron"
             )
             progressHandler?(0.8)
         } else {
+            if cachedArtifactsCorrupt {
+                DebugLogger.shared.warning(
+                    "Nemotron: cached artifacts at \(dir.path) contain an HTML/markup payload (corrupt); re-downloading",
+                    source: "Nemotron"
+                )
+            }
             DebugLogger.shared.info(
                 "Nemotron: artifacts missing; downloading from \(self.repositoryOwner)/\(self.repositoryName)",
                 source: "Nemotron"

diff --git a/Sources/Fluid/UI/WelcomeView.swift b/Sources/Fluid/UI/WelcomeView.swift
@@ -1903,6 +1903,12 @@ struct OnboardingFlowView: View {
                 try await self.asr.ensureAsrReady()
             } catch {
                 DebugLogger.shared.error("Failed to prepare onboarding voice model \(route.model.displayName): \(error)", source: "OnboardingFlowView")
+                // Surface the failure in the UI instead of only logging it, so the user
+                // isn't stuck at a disabled button. The shared ContentView alert (bound to
+                // asr.showError) presents this during onboarding. See #355.
+                self.asr.errorTitle = "Voice Model Setup Failed"
+                self.asr.errorMessage = error.localizedDescription
+                self.asr.showError = true
             }
             await self.asr.checkIfModelsExistAsync()
         }