diff --git a/Sources/Fluid/Networking/ModelDownloader.swift b/Sources/Fluid/Networking/ModelDownloader.swift index e7b067a9..8c817e19 100644 --- a/Sources/Fluid/Networking/ModelDownloader.swift +++ b/Sources/Fluid/Networking/ModelDownloader.swift @@ -104,13 +104,13 @@ final class HuggingFaceModelDownloader { let files = try await listFilesRecursively(relativePath: item.path) for rel in files { let dest = targetRoot.appendingPathComponent(rel) - if FileManager.default.fileExists(atPath: dest.path) == false { + if self.needsDownload(relativePath: rel, at: dest) { pendingFiles.append(rel) } } } else { let dest = targetRoot.appendingPathComponent(item.path) - if FileManager.default.fileExists(atPath: dest.path) == false { + if self.needsDownload(relativePath: item.path, at: dest) { pendingFiles.append(item.path) } } @@ -170,6 +170,36 @@ final class HuggingFaceModelDownloader { self.requiredItemsList } + /// Decides whether `relativePath` needs to be (re)downloaded into `destination`. + /// + /// A file is pending when it is missing, OR when it is present but its cached content + /// looks like an HTML/markup payload — a corrupt artifact cached before download-time + /// content validation existed (see #353). `fileExists` alone would leave such a payload + /// stuck forever, because `downloadFile` (and its validator) only runs for pending files. + /// A present markup file is deleted here so a clean copy is fetched; on a read error the + /// file is left in place and treated as valid, so we never delete on uncertainty. + private func needsDownload(relativePath: String, at destination: URL) -> Bool { + guard FileManager.default.fileExists(atPath: destination.path) else { + return true + } + guard Self.cachedFileIsMarkup(at: destination) else { + return false + } + DebugLogger.shared.warning( + "[ModelDL] Cached file is an HTML/markup page, not model data; deleting to re-download: \(relativePath)", + source: "ModelDownloader" + ) + do { + try FileManager.default.removeItem(at: destination) + } catch { + DebugLogger.shared.error( + "[ModelDL] Failed to delete corrupt cached file \(relativePath): \(error.localizedDescription)", + source: "ModelDownloader" + ) + } + return true + } + private func downloadDirectory(relativePath: String, to destination: URL) async throws { try FileManager.default.createDirectory(at: destination, withIntermediateDirectories: true) @@ -196,6 +226,10 @@ final class HuggingFaceModelDownloader { continuation.resume(throwing: NSError(domain: "HF", code: http.statusCode)) return } + // Reject HTML error/block pages (e.g. a corporate proxy returning its + // notification page with HTTP 200) before persisting them as a model + // file, otherwise a corrupt payload is cached permanently. See #353. + try Self.validateDownloadedFile(at: tempUrl, response: response, relativePath: relativePath) try FileManager.default.createDirectory(at: destination.deletingLastPathComponent(), withIntermediateDirectories: true) if FileManager.default.fileExists(atPath: destination.path) { try FileManager.default.removeItem(at: destination) @@ -203,6 +237,8 @@ final class HuggingFaceModelDownloader { try FileManager.default.moveItem(at: tempUrl, to: destination) continuation.resume() } catch { + // Never leave a rejected/partial payload behind. + try? FileManager.default.removeItem(at: tempUrl) continuation.resume(throwing: error) } } @@ -213,6 +249,144 @@ final class HuggingFaceModelDownloader { } } + // MARK: - Content Validation + + /// Validates a freshly-downloaded artifact before it is persisted as a model file. + /// + /// A network proxy / secure web gateway can return an HTML (or XML) block page with + /// HTTP 200 in place of the real file. Persisting that markup (e.g. as `coremldata.bin`) + /// permanently caches a corrupt model. We reject any payload that looks like HTML/XML + /// markup — by its `Content-Type` or by its leading bytes — since no model artifact + /// (CoreML binary, JSON vocab, `.mil`) is a markup document. See issue #353. + static func validateDownloadedFile(at fileURL: URL, response: URLResponse?, relativePath: String) throws { + if let http = response as? HTTPURLResponse, + let contentType = http.value(forHTTPHeaderField: "Content-Type") { + let lowered = contentType.lowercased() + if lowered.contains("text/html") || lowered.contains("text/xml") || lowered.contains("application/xml") { + throw Self.invalidContentError( + relativePath: relativePath, + detail: "the server returned a markup page (Content-Type: \(contentType))" + ) + } + } + + // Sniff the leading bytes in case markup was returned without a markup Content-Type. + // Read a small prefix only — model files can be gigabytes. + let handle = try FileHandle(forReadingFrom: fileURL) + defer { try? handle.close() } + let prefix = (try? handle.read(upToCount: 512)) ?? Data() + if Self.looksLikeHTML(prefix) { + throw Self.invalidContentError( + relativePath: relativePath, + detail: "the downloaded file is an HTML/markup document, not the expected model data" + ) + } + } + + /// Returns `true` if a file already on disk is an HTML/markup payload rather than real + /// model data — a corrupt artifact cached before download-time validation existed (#353). + /// + /// This is the cached-file analog of `validateDownloadedFile`'s byte-sniff: it reuses the + /// same `looksLikeHTML` check on a small leading prefix (model files can be gigabytes, so + /// only 512 bytes are read). There is no `URLResponse` for a cached file, so only the + /// content is inspected, not a `Content-Type`. Returns `false` (treat as valid) on any + /// read error, so an unreadable file is never deleted on uncertainty. + static func cachedFileIsMarkup(at fileURL: URL) -> Bool { + guard let handle = try? FileHandle(forReadingFrom: fileURL) else { + return false + } + defer { try? handle.close() } + let prefix = (try? handle.read(upToCount: 512)) ?? Data() + return Self.looksLikeHTML(prefix) + } + + /// Returns `true` if any cached payload under `relativePaths` (each resolved against `root`) + /// is an HTML/markup document rather than real model data — the cached-*tree* analog of + /// `cachedFileIsMarkup`, intended for a provider preflight to call before trusting a present + /// cache and skipping the downloader. The downloader itself already re-validates each file via + /// `needsDownload`, but a preflight that returns on file-existence alone never reaches it, so a + /// corrupt-but-present cache would slip through (see #353). + /// + /// Each relative path may be a regular file or a directory (e.g. a `.mlpackage` bundle). + /// Directories are scanned recursively and every regular file inside is byte-sniffed with + /// `cachedFileIsMarkup`, reusing the single `looksLikeHTML` detector — there is no second + /// markup heuristic. Conservative on uncertainty, mirroring `cachedFileIsMarkup`: a path that + /// does not exist, a file that cannot be read, or a directory that cannot be enumerated is + /// skipped (treated as non-markup), so a valid cache is never reported corrupt. An empty + /// required directory therefore yields `false` here — its incompleteness is the existence + /// check's concern, not this markup check's. + static func cachedPayloadContainsMarkup(root: URL, relativePaths: [String]) -> Bool { + let fileManager = FileManager.default + for relativePath in relativePaths { + let url = root.appendingPathComponent(relativePath) + var isDirectory: ObjCBool = false + guard fileManager.fileExists(atPath: url.path, isDirectory: &isDirectory) else { + continue + } + if isDirectory.boolValue { + guard let enumerator = fileManager.enumerator( + at: url, + includingPropertiesForKeys: [.isRegularFileKey], + options: [.skipsHiddenFiles] + ) else { + continue + } + for case let fileURL as URL in enumerator { + let isRegularFile = (try? fileURL.resourceValues(forKeys: [.isRegularFileKey]))?.isRegularFile ?? false + guard isRegularFile else { continue } + if Self.cachedFileIsMarkup(at: fileURL) { + return true + } + } + } else if Self.cachedFileIsMarkup(at: url) { + return true + } + } + return false + } + + /// Returns `true` if `data` begins with an HTML / XML markup marker, ignoring a leading + /// UTF-8 BOM and ASCII whitespace. + /// + /// No artifact this downloader fetches legitimately begins with `<`: CoreML compiled + /// `.mlmodelc` / `.mlpackage` payloads are binary (`coremldata.bin`, `weights/weight.bin`, + /// `model.mlmodel` protobuf) or JSON (`metadata.json`, `Manifest.json`) starting with + /// `{` / `[`; the MIL program text (`model.mil`) starts with `program`; the vocab JSON + /// starts with `{`; and `tokenizer.model` is a SentencePiece binary. So any payload that, + /// after BOM + whitespace stripping, starts with `<` followed by a markup-ish byte is a + /// proxy/block page or a markup document standing in for the real file — reject it. This + /// catches ``, ``, `