-
-
Notifications
You must be signed in to change notification settings - Fork 193
fix: validate downloaded model content + surface onboarding errors (#353, #355) #386
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -104,13 +104,13 @@ final class HuggingFaceModelDownloader { | |
| let files = try await listFilesRecursively(relativePath: item.path) | ||
| for rel in files { | ||
| let dest = targetRoot.appendingPathComponent(rel) | ||
| if FileManager.default.fileExists(atPath: dest.path) == false { | ||
| if self.needsDownload(relativePath: rel, at: dest) { | ||
| pendingFiles.append(rel) | ||
| } | ||
| } | ||
| } else { | ||
| let dest = targetRoot.appendingPathComponent(item.path) | ||
| if FileManager.default.fileExists(atPath: dest.path) == false { | ||
| if self.needsDownload(relativePath: item.path, at: dest) { | ||
| pendingFiles.append(item.path) | ||
| } | ||
| } | ||
|
|
@@ -170,6 +170,36 @@ final class HuggingFaceModelDownloader { | |
| self.requiredItemsList | ||
| } | ||
|
|
||
| /// Decides whether `relativePath` needs to be (re)downloaded into `destination`. | ||
| /// | ||
| /// A file is pending when it is missing, OR when it is present but its cached content | ||
| /// looks like an HTML/markup payload — a corrupt artifact cached before download-time | ||
| /// content validation existed (see #353). `fileExists` alone would leave such a payload | ||
| /// stuck forever, because `downloadFile` (and its validator) only runs for pending files. | ||
| /// A present markup file is deleted here so a clean copy is fetched; on a read error the | ||
| /// file is left in place and treated as valid, so we never delete on uncertainty. | ||
| private func needsDownload(relativePath: String, at destination: URL) -> Bool { | ||
| guard FileManager.default.fileExists(atPath: destination.path) else { | ||
| return true | ||
| } | ||
| guard Self.cachedFileIsMarkup(at: destination) else { | ||
| return false | ||
| } | ||
| DebugLogger.shared.warning( | ||
| "[ModelDL] Cached file is an HTML/markup page, not model data; deleting to re-download: \(relativePath)", | ||
| source: "ModelDownloader" | ||
| ) | ||
| do { | ||
| try FileManager.default.removeItem(at: destination) | ||
| } catch { | ||
| DebugLogger.shared.error( | ||
| "[ModelDL] Failed to delete corrupt cached file \(relativePath): \(error.localizedDescription)", | ||
| source: "ModelDownloader" | ||
| ) | ||
| } | ||
| return true | ||
| } | ||
|
|
||
| private func downloadDirectory(relativePath: String, to destination: URL) async throws { | ||
| try FileManager.default.createDirectory(at: destination, withIntermediateDirectories: true) | ||
|
|
||
|
|
@@ -196,13 +226,19 @@ final class HuggingFaceModelDownloader { | |
| continuation.resume(throwing: NSError(domain: "HF", code: http.statusCode)) | ||
| return | ||
| } | ||
| // Reject HTML error/block pages (e.g. a corporate proxy returning its | ||
| // notification page with HTTP 200) before persisting them as a model | ||
| // file, otherwise a corrupt payload is cached permanently. See #353. | ||
| try Self.validateDownloadedFile(at: tempUrl, response: response, relativePath: relativePath) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This validation only runs for freshly downloaded temp files, but Useful? React with 👍 / 👎. |
||
| try FileManager.default.createDirectory(at: destination.deletingLastPathComponent(), withIntermediateDirectories: true) | ||
| if FileManager.default.fileExists(atPath: destination.path) { | ||
| try FileManager.default.removeItem(at: destination) | ||
| } | ||
| try FileManager.default.moveItem(at: tempUrl, to: destination) | ||
| continuation.resume() | ||
| } catch { | ||
| // Never leave a rejected/partial payload behind. | ||
| try? FileManager.default.removeItem(at: tempUrl) | ||
| continuation.resume(throwing: error) | ||
| } | ||
| } | ||
|
|
@@ -213,6 +249,144 @@ final class HuggingFaceModelDownloader { | |
| } | ||
| } | ||
|
|
||
| // MARK: - Content Validation | ||
|
|
||
| /// Validates a freshly-downloaded artifact before it is persisted as a model file. | ||
| /// | ||
| /// A network proxy / secure web gateway can return an HTML (or XML) block page with | ||
| /// HTTP 200 in place of the real file. Persisting that markup (e.g. as `coremldata.bin`) | ||
| /// permanently caches a corrupt model. We reject any payload that looks like HTML/XML | ||
| /// markup — by its `Content-Type` or by its leading bytes — since no model artifact | ||
| /// (CoreML binary, JSON vocab, `.mil`) is a markup document. See issue #353. | ||
| static func validateDownloadedFile(at fileURL: URL, response: URLResponse?, relativePath: String) throws { | ||
| if let http = response as? HTTPURLResponse, | ||
| let contentType = http.value(forHTTPHeaderField: "Content-Type") { | ||
| let lowered = contentType.lowercased() | ||
| if lowered.contains("text/html") || lowered.contains("text/xml") || lowered.contains("application/xml") { | ||
| throw Self.invalidContentError( | ||
| relativePath: relativePath, | ||
| detail: "the server returned a markup page (Content-Type: \(contentType))" | ||
| ) | ||
| } | ||
| } | ||
|
|
||
| // Sniff the leading bytes in case markup was returned without a markup Content-Type. | ||
| // Read a small prefix only — model files can be gigabytes. | ||
| let handle = try FileHandle(forReadingFrom: fileURL) | ||
| defer { try? handle.close() } | ||
| let prefix = (try? handle.read(upToCount: 512)) ?? Data() | ||
| if Self.looksLikeHTML(prefix) { | ||
| throw Self.invalidContentError( | ||
| relativePath: relativePath, | ||
| detail: "the downloaded file is an HTML/markup document, not the expected model data" | ||
| ) | ||
| } | ||
| } | ||
|
|
||
| /// Returns `true` if a file already on disk is an HTML/markup payload rather than real | ||
| /// model data — a corrupt artifact cached before download-time validation existed (#353). | ||
| /// | ||
| /// This is the cached-file analog of `validateDownloadedFile`'s byte-sniff: it reuses the | ||
| /// same `looksLikeHTML` check on a small leading prefix (model files can be gigabytes, so | ||
| /// only 512 bytes are read). There is no `URLResponse` for a cached file, so only the | ||
| /// content is inspected, not a `Content-Type`. Returns `false` (treat as valid) on any | ||
| /// read error, so an unreadable file is never deleted on uncertainty. | ||
| static func cachedFileIsMarkup(at fileURL: URL) -> Bool { | ||
| guard let handle = try? FileHandle(forReadingFrom: fileURL) else { | ||
| return false | ||
| } | ||
| defer { try? handle.close() } | ||
| let prefix = (try? handle.read(upToCount: 512)) ?? Data() | ||
| return Self.looksLikeHTML(prefix) | ||
| } | ||
|
|
||
| /// Returns `true` if any cached payload under `relativePaths` (each resolved against `root`) | ||
| /// is an HTML/markup document rather than real model data — the cached-*tree* analog of | ||
| /// `cachedFileIsMarkup`, intended for a provider preflight to call before trusting a present | ||
| /// cache and skipping the downloader. The downloader itself already re-validates each file via | ||
| /// `needsDownload`, but a preflight that returns on file-existence alone never reaches it, so a | ||
| /// corrupt-but-present cache would slip through (see #353). | ||
| /// | ||
| /// Each relative path may be a regular file or a directory (e.g. a `.mlpackage` bundle). | ||
| /// Directories are scanned recursively and every regular file inside is byte-sniffed with | ||
| /// `cachedFileIsMarkup`, reusing the single `looksLikeHTML` detector — there is no second | ||
| /// markup heuristic. Conservative on uncertainty, mirroring `cachedFileIsMarkup`: a path that | ||
| /// does not exist, a file that cannot be read, or a directory that cannot be enumerated is | ||
| /// skipped (treated as non-markup), so a valid cache is never reported corrupt. An empty | ||
| /// required directory therefore yields `false` here — its incompleteness is the existence | ||
| /// check's concern, not this markup check's. | ||
| static func cachedPayloadContainsMarkup(root: URL, relativePaths: [String]) -> Bool { | ||
| let fileManager = FileManager.default | ||
| for relativePath in relativePaths { | ||
| let url = root.appendingPathComponent(relativePath) | ||
| var isDirectory: ObjCBool = false | ||
| guard fileManager.fileExists(atPath: url.path, isDirectory: &isDirectory) else { | ||
| continue | ||
| } | ||
| if isDirectory.boolValue { | ||
| guard let enumerator = fileManager.enumerator( | ||
| at: url, | ||
| includingPropertiesForKeys: [.isRegularFileKey], | ||
| options: [.skipsHiddenFiles] | ||
| ) else { | ||
| continue | ||
| } | ||
| for case let fileURL as URL in enumerator { | ||
| let isRegularFile = (try? fileURL.resourceValues(forKeys: [.isRegularFileKey]))?.isRegularFile ?? false | ||
| guard isRegularFile else { continue } | ||
| if Self.cachedFileIsMarkup(at: fileURL) { | ||
| return true | ||
| } | ||
| } | ||
| } else if Self.cachedFileIsMarkup(at: url) { | ||
| return true | ||
| } | ||
| } | ||
| return false | ||
| } | ||
|
|
||
| /// Returns `true` if `data` begins with an HTML / XML markup marker, ignoring a leading | ||
| /// UTF-8 BOM and ASCII whitespace. | ||
| /// | ||
| /// No artifact this downloader fetches legitimately begins with `<`: CoreML compiled | ||
| /// `.mlmodelc` / `.mlpackage` payloads are binary (`coremldata.bin`, `weights/weight.bin`, | ||
| /// `model.mlmodel` protobuf) or JSON (`metadata.json`, `Manifest.json`) starting with | ||
| /// `{` / `[`; the MIL program text (`model.mil`) starts with `program`; the vocab JSON | ||
| /// starts with `{`; and `tokenizer.model` is a SentencePiece binary. So any payload that, | ||
| /// after BOM + whitespace stripping, starts with `<` followed by a markup-ish byte is a | ||
| /// proxy/block page or a markup document standing in for the real file — reject it. This | ||
| /// catches `<!doctype`, `<html`, `<head>`, `<body>`, `<script>`, `<meta>`, comments | ||
| /// (`<!-- -->`) and XML / `<?xml` declarations, not just the two prefixes we used to | ||
| /// match. See issue #353. | ||
| static func looksLikeHTML(_ data: Data) -> Bool { | ||
| var bytes = [UInt8](data.prefix(512)) | ||
| if bytes.starts(with: [0xEF, 0xBB, 0xBF]) { | ||
| bytes.removeFirst(3) | ||
| } | ||
| while let first = bytes.first, | ||
| first == 0x20 || first == 0x09 || first == 0x0A || first == 0x0D { | ||
| bytes.removeFirst() | ||
| } | ||
| // Must begin with `<` (0x3C)… | ||
| guard bytes.first == 0x3C, bytes.count >= 2 else { | ||
| return false | ||
| } | ||
| // …immediately followed by a markup-ish byte: an ASCII letter (a tag such as | ||
| // `<html`), `!` (0x21 — `<!doctype`, `<!--`), `?` (0x3F — `<?xml`), or `/` (0x2F — | ||
| // a stray closing tag). Requiring this second byte avoids over-rejecting a | ||
| // hypothetical text artifact that merely contains a stray `<` not followed by markup. | ||
| let second = bytes[1] | ||
| let isAsciiLetter = (second >= 0x41 && second <= 0x5A) || (second >= 0x61 && second <= 0x7A) | ||
| return isAsciiLetter || second == 0x21 || second == 0x3F || second == 0x2F | ||
| } | ||
|
|
||
| private static func invalidContentError(relativePath: String, detail: String) -> NSError { | ||
| NSError(domain: "HF", code: -3, userInfo: [ | ||
| NSLocalizedDescriptionKey: | ||
| "Could not download \(relativePath): \(detail). A network proxy or firewall may be blocking model downloads.", | ||
| ]) | ||
| } | ||
|
|
||
| private final class DownloadProgressDelegate: NSObject, URLSessionDownloadDelegate { | ||
| private let onProgress: ((Double) -> Void)? | ||
| var onFinish: ((URL, URLResponse) -> Void)? | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For Nemotron/external CoreML caches that already look complete, this new cached-markup check is never reached:
NemotronProvider.preparereturns after a file-existence-onlymodelsExistOnDisk()check, andExternalCoreMLTranscriptionProvider.ensureArtifactsPresentcan return after validation that only proves required entries/manifest metadata exist. A cache with a valid manifest or.mlpackagedirectory but an HTML page saved inside a binary therefore still skipsensureModelsPresentand remains permanently corrupted; move this validation into those preflight checks or force the downloader to run before returning.Useful? React with 👍 / 👎.