From 69f422ff3e8b8306a09643c5effc09064271b75f Mon Sep 17 00:00:00 2001
From: rtibblesbot <richard+githubagent@learningequality.org>
Date: Fri, 13 Feb 2026 00:41:53 -0800
Subject: [PATCH 1/4] Add shared URL extraction and rewriting utilities for
 archive processing

Extract URL detection and rewriting logic into standalone, testable utility
functions in url_utils.py. These operate on file contents (strings) rather
than requiring HTTP sessions, addressing the testability concerns in #303.

Supported reference types:
- HTML/XML: src, href, srcset attributes; inline style; <style> blocks
- CSS: url() references; @import (both url() and bare string forms)
- H5P JSON: path attributes in content structures

Includes path traversal protection in derive_local_filename() to prevent
downloaded resources from escaping the target directory.

Closes #303

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ricecooker/utils/url_utils.py | 345 +++++++++++++++++++++
 tests/test_url_utils.py       | 550 ++++++++++++++++++++++++++++++++++
 2 files changed, 895 insertions(+)
 create mode 100644 ricecooker/utils/url_utils.py
 create mode 100644 tests/test_url_utils.py
diff --git a/ricecooker/utils/url_utils.py b/ricecooker/utils/url_utils.py
new file mode 100644
index 00000000..b2aa913b
--- /dev/null
+++ b/ricecooker/utils/url_utils.py
@@ -0,0 +1,345 @@
+"""
+Shared URL extraction and rewriting utilities for archive processing.
+
+These functions operate on content strings (HTML, CSS, JSON) — no HTTP,
+no filesystem, no platform-specific paths. They can be used by both
+ricecooker's pipeline and Studio's upload processing.
+
+Supersedes issue #303 by making URL detection/rewriting independently
+unit-testable.
+"""
+
+import json
+import os
+import re
+from dataclasses import dataclass
+from typing import Optional
+from urllib.parse import urlparse
+
+from bs4 import BeautifulSoup
+
+
+@dataclass
+class ExtractedURL:
+    """One URL reference found in archive content, plus where it was found."""
+
+    url: str  # The URL exactly as written in the source content
+    source_file: str  # Caller-supplied name of the file the content came from
+    context: str  # 'html_attr', 'css_url', 'css_import', 'h5p_json', 'html_srcset'
+    tag: Optional[str] = None  # HTML tag, e.g. 'img', 'link', 'script'; else None
+    attr: Optional[str] = None  # HTML attribute, e.g. 'src', 'href'; else None
+
+
+# Regex patterns for CSS URL extraction
+# url('...'), url("...") and url(...); whitespace inside the parens is allowed
+_CSS_URL_RE = re.compile(r"url\(\s*['\"]?(.*?)['\"]?\s*\)")
+# Matches @import '...' and @import "..." (bare string form, not url() form)
+_CSS_IMPORT_RE = re.compile(r"@import\s+['\"]([^'\"]+)['\"]")
+
+
+def is_external_url(url):
+    """Return True only for http(s) URLs that have a host; all else is internal."""
+    try:
+        parsed = urlparse(url)
+    except ValueError:  # malformed authority, e.g. "http://[::1" (unclosed "[")
+        return False  # unparseable references cannot be fetched: not external
+    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
+
+
+def derive_local_filename(url):
+    """
+    Derive a deterministic "/"-separated local filename from an external URL,
+    e.g. 'https://fonts.example.com/font.woff2'
+    -> '_external/fonts.example.com/font.woff2'.
+
+    The result never contains a ".." segment and is never absolute.
+    """
+    import posixpath  # archive member names use "/" on every platform
+
+    parsed = urlparse(url)
+    tail = parsed.path + ("?" + parsed.query if parsed.query else "")
+    # Filter after appending the query (it may hold "/../"), then re-strip "/":
+    # dropping the ".." from "..//etc" would otherwise leave absolute "/etc".
+    tail = "/".join(p for p in tail.split("/") if p != "..").lstrip("/")
+    host = "" if parsed.netloc in (".", "..") else parsed.netloc
+    return posixpath.join("_external", host, tail)
+
+
+def extract_urls_from_css(css_content, source_file=""):
+    """
+    Collect the external (http/https) URLs referenced by a piece of CSS.
+
+    Two syntaxes are recognised:
+    - url('...'), url("..."), url(...)       -> context "css_url"
+    - @import '...' / @import "..." strings  -> context "css_import"
+
+    data: URIs and relative URLs are skipped, and a URL that occurs more
+    than once is reported only for its first match. All url() matches are
+    listed before any bare @import match.
+    Returns a list of ExtractedURL instances tagged with source_file.
+    """
+    found = []
+    already_reported = set()
+
+    # url() is scanned first, so "@import url(...)" is reported as "css_url".
+    for pattern, context in (
+        (_CSS_URL_RE, "css_url"),
+        (_CSS_IMPORT_RE, "css_import"),
+    ):
+        for hit in pattern.finditer(css_content):
+            candidate = hit.group(1).strip()
+            if not candidate or not is_external_url(candidate):
+                continue
+            if candidate in already_reported:
+                continue
+            already_reported.add(candidate)
+            found.append(
+                ExtractedURL(url=candidate, source_file=source_file, context=context)
+            )
+
+    return found
+
+
+def _parse_srcset(srcset_value):
+    """
+    Return the URL of every candidate in an HTML srcset attribute value.
+
+    Candidates are comma-separated "url descriptor" pairs such as
+    "img.jpg 300w"; only the first whitespace-delimited token is kept.
+    """
+    # str.split() with no argument already ignores surrounding whitespace,
+    # so an empty or blank candidate yields no tokens and is skipped.
+    tokenised = (candidate.split() for candidate in srcset_value.split(","))
+    return [tokens[0] for tokens in tokenised if tokens]
+
+
+def extract_urls_from_html(html_content, source_file=""):
+    """
+    Collect the external (http/https) URLs referenced by an HTML document.
+
+    The following places are inspected, in this order:
+    - src on <img>, <script> and <source>
+    - srcset on <img> and <source>
+    - href on <link> elements whose rel contains "stylesheet"
+    - url() / @import references inside inline style attributes
+    - url() / @import references inside <style> blocks
+
+    Attribute hits carry context "html_attr" or "html_srcset" plus the tag
+    and attribute name; CSS hits keep the context assigned by
+    extract_urls_from_css ("css_url" or "css_import") and have no tag/attr.
+
+    data: URIs and relative URLs are skipped. A URL that appears in several
+    places is reported once, for the first place in which it was found.
+
+    Empty or whitespace-only input yields an empty list; anything else is
+    parsed with BeautifulSoup's "html.parser".
+
+    Returns a list of ExtractedURL instances tagged with source_file.
+    """
+    if not html_content or not html_content.strip():
+        return []
+
+    document = BeautifulSoup(html_content, "html.parser")
+    # Results are appended in discovery order.
+    collected = []
+    # Shared by every pass below, so de-duplication is document-wide.
+    reported = set()
+
+    def _record_attr(url, context, tag, attr):
+        # Keep an attribute value only if it is external and not yet reported.
+        if not is_external_url(url):
+            return
+        if url in reported:
+            return
+        reported.add(url)
+        collected.append(
+            ExtractedURL(
+                url=url,
+                source_file=source_file,
+                context=context,
+                tag=tag,
+                attr=attr,
+            )
+        )
+
+    def _record_css(css_text):
+        # The CSS extractor de-duplicates within css_text; this loop also
+        # de-duplicates against everything already found in the document.
+        for reference in extract_urls_from_css(css_text, source_file):
+            if reference.url in reported:
+                continue
+            reported.add(reference.url)
+            collected.append(reference)
+
+    # 1. Plain src attributes.
+    for tag_name in ("img", "script", "source"):
+        for element in document.find_all(tag_name, src=True):
+            _record_attr(element["src"], "html_attr", tag_name, "src")
+
+    # 2. srcset attributes; each one may list several candidate URLs, and
+    #    relative candidates are dropped individually.
+    for tag_name in ("img", "source"):
+        for element in document.find_all(tag_name, srcset=True):
+            for candidate in _parse_srcset(element["srcset"]):
+                _record_attr(candidate, "html_srcset", tag_name, "srcset")
+
+    # 3. Stylesheet links. The explicit "rel" membership test is the guard
+    #    attributed to PR #636; links without a rel attribute are skipped.
+    for element in document.find_all("link", href=True):
+        if "rel" in element.attrs and "stylesheet" in element.get("rel", []):
+            _record_attr(element["href"], "html_attr", "link", "href")
+
+    # 4. Inline style attributes on any element.
+    for element in document.find_all(style=True):
+        _record_css(element.get("style", ""))
+
+    # 5. <style> blocks; blocks whose .string is empty or None are skipped.
+    for style_element in document.find_all("style"):
+        if style_element.string:
+            _record_css(style_element.string)
+
+    return collected
+
+
+def extract_urls_from_h5p_json(json_content, source_file=""):
+    """
+    Collect the external URLs stored under "path" keys in H5P JSON content.
+
+    The decoded document is walked depth-first through nested objects and
+    arrays. A "path" entry is reported when its value is a string accepted
+    by is_external_url(); any other value is descended into instead.
+    Duplicates are kept. Returns a list of ExtractedURL instances with
+    context "h5p_json", or [] when json_content cannot be decoded.
+    """
+    try:
+        document = json.loads(json_content)
+    except (json.JSONDecodeError, TypeError):
+        return []
+
+    def _external_paths(node):
+        # Yield matching "path" values in document order.
+        if isinstance(node, list):
+            for child in node:
+                yield from _external_paths(child)
+        elif isinstance(node, dict):
+            for key, value in node.items():
+                if key == "path" and isinstance(value, str) and is_external_url(value):
+                    yield value
+                else:
+                    yield from _external_paths(value)
+
+    return [
+        ExtractedURL(url=found, source_file=source_file, context="h5p_json")
+        for found in _external_paths(document)
+    ]
+
+
+def rewrite_urls_in_css(css_content, url_map):
+    """
+    Return css_content with every mapped URL reference replaced.
+
+    url_map maps an original URL to its replacement. url() references are
+    rewritten first and re-emitted as url('<new>'); bare @import strings
+    are rewritten second and re-emitted as @import '<new>'. In both cases
+    single quotes are used regardless of the original quoting.
+
+    References whose URL is not a key of url_map are left untouched.
+    """
+
+    def _substituter(template):
+        # Build a re.sub callback that fills `template` with the mapped URL.
+        def _swap(hit):
+            key = hit.group(1).strip()
+            if key not in url_map:
+                return hit.group(0)
+            return template.format(url_map[key])
+
+        return _swap
+
+    # The second pass runs on the output of the first.
+    after_url_pass = _CSS_URL_RE.sub(_substituter("url('{}')"), css_content)
+    return _CSS_IMPORT_RE.sub(_substituter("@import '{}'"), after_url_pass)
+
+
+def rewrite_urls_in_html(html_content, url_map):
+    """
+    Return html_content with every mapped URL reference replaced.
+
+    The same locations as extract_urls_from_html are covered: src on
+    img/script/source, srcset on img/source, href on stylesheet links,
+    inline style attributes and <style> blocks. URLs that are not keys of
+    url_map are left unchanged.
+
+    Empty or whitespace-only input is returned as-is. Anything else is
+    parsed with BeautifulSoup's "html.parser" and re-serialised with str().
+    """
+    if not html_content or not html_content.strip():
+        return html_content
+
+    document = BeautifulSoup(html_content, "html.parser")
+
+    def _swap_attr(element, attr):
+        # Replace a whole attribute value when it is a key of url_map.
+        current = element[attr]
+        if current in url_map:
+            element[attr] = url_map[current]
+
+    def _swap_srcset_candidate(candidate):
+        # Only the URL (first token) is mapped; descriptors are kept and the
+        # tokens are re-joined with single spaces.
+        tokens = candidate.split()
+        if tokens[0] in url_map:
+            tokens[0] = url_map[tokens[0]]
+        return " ".join(tokens)
+
+    for tag_name in ("img", "script", "source"):
+        for element in document.find_all(tag_name, src=True):
+            _swap_attr(element, "src")
+
+    # srcset is always rebuilt as ", "-joined candidates, mapped or not.
+    for tag_name in ("img", "source"):
+        for element in document.find_all(tag_name, srcset=True):
+            candidates = (c.strip() for c in element["srcset"].split(","))
+            element["srcset"] = ", ".join(
+                _swap_srcset_candidate(c) for c in candidates if c
+            )
+
+    for element in document.find_all("link", href=True):
+        if "rel" in element.attrs and "stylesheet" in element.get("rel", []):
+            _swap_attr(element, "href")
+
+    for element in document.find_all(style=True):
+        element["style"] = rewrite_urls_in_css(element.get("style", ""), url_map)
+
+    for style_element in document.find_all("style"):
+        if style_element.string:
+            style_element.string = rewrite_urls_in_css(style_element.string, url_map)
+
+    return str(document)
+
+
+def rewrite_urls_in_h5p_json(json_content, url_map):
+    """
+    Return H5P JSON content with mapped "path" values replaced.
+
+    Every object and array in the decoded document is visited. A "path"
+    entry whose string value is a key of url_map gets the mapped value;
+    everything else is kept. The result is re-serialised with json.dumps().
+    Input that cannot be decoded is returned unchanged.
+    """
+    try:
+        document = json.loads(json_content)
+    except (json.JSONDecodeError, TypeError):
+        return json_content
+
+    def _rewritten(node, key=None):
+        # `key` is the object key this node is stored under (None otherwise).
+        if key == "path" and isinstance(node, str) and node in url_map:
+            return url_map[node]
+        if isinstance(node, dict):
+            return {k: _rewritten(v, k) for k, v in node.items()}
+        if isinstance(node, list):
+            return [_rewritten(item) for item in node]
+        return node
+
+    return json.dumps(_rewritten(document))
diff --git a/tests/test_url_utils.py b/tests/test_url_utils.py
new file mode 100644
index 00000000..411f72f1
--- /dev/null
+++ b/tests/test_url_utils.py
@@ -0,0 +1,550 @@
+"""
+Tests for ricecooker.utils.url_utils — shared URL extraction and rewriting.
+
+All tests operate on plain strings. No HTTP, no filesystem, no archives.
+"""
+
+import json
+
+from ricecooker.utils.url_utils import (
+    derive_local_filename,
+    extract_urls_from_css,
+    extract_urls_from_h5p_json,
+    extract_urls_from_html,
+    is_external_url,
+    rewrite_urls_in_css,
+    rewrite_urls_in_h5p_json,
+    rewrite_urls_in_html,
+)
+
+
+# ---------------------------------------------------------------------------
+# is_external_url tests: only http/https URLs with a host count as external
+# ---------------------------------------------------------------------------
+
+
+class TestIsExternalURL:
+    def test_http_url(self):
+        assert is_external_url("http://example.com/file.js") is True
+
+    def test_https_url(self):
+        assert is_external_url("https://example.com/file.js") is True
+
+    def test_relative_path(self):
+        assert is_external_url("images/photo.jpg") is False
+
+    def test_data_uri(self):
+        assert is_external_url("data:image/png;base64,abc123") is False
+
+    def test_fragment_only(self):
+        assert is_external_url("#section") is False
+
+    def test_empty_string(self):
+        assert is_external_url("") is False
+
+    def test_protocol_relative(self):
+        # Protocol-relative URL: it has a host but no scheme, so it is not external
+        assert is_external_url("//cdn.example.com/file.js") is False
+
+    def test_mailto(self):
+        assert is_external_url("mailto:user@example.com") is False
+
+    def test_javascript_uri(self):
+        assert is_external_url("javascript:void(0)") is False
+
+
+# ---------------------------------------------------------------------------
+# derive_local_filename tests
+# ---------------------------------------------------------------------------
+
+
+class TestDeriveLocalFilename:
+    def test_simple_url(self):
+        local_name = derive_local_filename("https://cdn.example.com/image.png")
+        assert local_name == "_external/cdn.example.com/image.png"
+
+    def test_url_with_subdirs(self):
+        local_name = derive_local_filename(
+            "https://fonts.example.com/v1/fonts/roboto.woff2"
+        )
+        assert local_name == "_external/fonts.example.com/v1/fonts/roboto.woff2"
+
+    def test_url_with_query(self):
+        local_name = derive_local_filename(
+            "https://fonts.googleapis.com/css?family=Roboto"
+        )
+        assert local_name == "_external/fonts.googleapis.com/css?family=Roboto"
+
+    def test_url_root_path(self):
+        local_name = derive_local_filename("https://example.com/")
+        assert local_name == "_external/example.com/"
+
+    def test_starts_with_external_prefix(self):
+        local_name = derive_local_filename("https://example.com/anything")
+        assert local_name.startswith("_external/")
+
+    def test_path_traversal_stripped(self):
+        local_name = derive_local_filename("https://evil.com/../../../etc/passwd")
+        assert ".." not in local_name
+        assert local_name.startswith("_external/")
+        assert "etc/passwd" in local_name
+
+    def test_path_traversal_deep(self):
+        local_name = derive_local_filename(
+            "https://evil.com/a/../../b/../../../etc/passwd"
+        )
+        assert ".." not in local_name
+        assert local_name.startswith("_external/")
+
+
+# ---------------------------------------------------------------------------
+# extract_urls_from_css tests
+# ---------------------------------------------------------------------------
+
+
+class TestExtractUrlsFromCSS:
+    def test_extract_css_url_single_quotes(self):
+        stylesheet = "body { background: url('https://example.com/bg.png') }"
+        found = extract_urls_from_css(stylesheet, "style.css")
+        assert len(found) == 1
+        assert found[0].url == "https://example.com/bg.png"
+        assert found[0].context == "css_url"
+        assert found[0].source_file == "style.css"
+
+    def test_extract_css_url_double_quotes(self):
+        stylesheet = 'body { background: url("https://example.com/bg.png") }'
+        found = extract_urls_from_css(stylesheet, "style.css")
+        assert len(found) == 1
+        assert found[0].url == "https://example.com/bg.png"
+
+    def test_extract_css_url_no_quotes(self):
+        stylesheet = "body { background: url(https://example.com/bg.png) }"
+        found = extract_urls_from_css(stylesheet, "style.css")
+        assert len(found) == 1
+        assert found[0].url == "https://example.com/bg.png"
+
+    def test_extract_css_import_bare_string_single_quotes(self):
+        stylesheet = "@import 'https://fonts.googleapis.com/css?family=Roboto';"
+        found = extract_urls_from_css(stylesheet, "style.css")
+        assert len(found) == 1
+        assert found[0].url == "https://fonts.googleapis.com/css?family=Roboto"
+        assert found[0].context == "css_import"
+
+    def test_extract_css_import_bare_string_double_quotes(self):
+        stylesheet = '@import "https://fonts.googleapis.com/css?family=Roboto";'
+        found = extract_urls_from_css(stylesheet, "style.css")
+        assert len(found) == 1
+        assert found[0].url == "https://fonts.googleapis.com/css?family=Roboto"
+
+    def test_extract_css_import_url_form(self):
+        """The url() pass, not the bare-string pass, reports @import url('...')."""
+        stylesheet = "@import url('https://fonts.googleapis.com/css');"
+        found = extract_urls_from_css(stylesheet, "style.css")
+        assert len(found) == 1
+        assert found[0].url == "https://fonts.googleapis.com/css"
+        assert found[0].context == "css_url"
+
+    def test_extract_css_font_face(self):
+        stylesheet = """
+        @font-face {
+            font-family: 'Roboto';
+            src: url('https://fonts.example.com/roboto.woff2') format('woff2');
+        }
+        """
+        found = extract_urls_from_css(stylesheet, "style.css")
+        assert len(found) == 1
+        assert found[0].url == "https://fonts.example.com/roboto.woff2"
+
+    def test_css_data_urls_ignored(self):
+        stylesheet = "body { background: url(data:image/png;base64,abc123) }"
+        found = extract_urls_from_css(stylesheet, "style.css")
+        assert not found
+
+    def test_css_relative_urls_ignored(self):
+        stylesheet = "body { background: url('../images/bg.png') }"
+        found = extract_urls_from_css(stylesheet, "style.css")
+        assert not found
+
+    def test_multiple_urls(self):
+        stylesheet = """
+        body { background: url('https://example.com/bg1.png') }
+        .header { background: url('https://example.com/bg2.png') }
+        """
+        found = extract_urls_from_css(stylesheet, "style.css")
+        assert len(found) == 2
+        found_urls = {hit.url for hit in found}
+        assert "https://example.com/bg1.png" in found_urls
+        assert "https://example.com/bg2.png" in found_urls
+
+    def test_empty_css(self):
+        found = extract_urls_from_css("", "style.css")
+        assert not found
+
+    def test_no_duplicate_for_import_url_form(self):
+        """A single @import url('...') must be reported exactly once."""
+        stylesheet = "@import url('https://fonts.googleapis.com/css');"
+        found = extract_urls_from_css(stylesheet, "style.css")
+        assert len(found) == 1
+
+
+# ---------------------------------------------------------------------------
+# extract_urls_from_html tests
+# ---------------------------------------------------------------------------
+
+
+class TestExtractUrlsFromHTML:
+    def test_extract_img_src(self):
+        markup = '<img src="https://cdn.example.com/photo.jpg">'
+        found = extract_urls_from_html(markup, "index.html")
+        assert len(found) == 1
+        assert found[0].url == "https://cdn.example.com/photo.jpg"
+        assert found[0].context == "html_attr"
+        assert found[0].tag == "img"
+        assert found[0].attr == "src"
+        assert found[0].source_file == "index.html"
+
+    def test_extract_img_srcset(self):
+        markup = '<img srcset="https://cdn.example.com/img-300.jpg 300w, https://cdn.example.com/img-600.jpg 600w">'
+        found = extract_urls_from_html(markup, "index.html")
+        srcset_hits = [hit for hit in found if hit.context == "html_srcset"]
+        assert len(srcset_hits) == 2
+        found_urls = {hit.url for hit in srcset_hits}
+        assert "https://cdn.example.com/img-300.jpg" in found_urls
+        assert "https://cdn.example.com/img-600.jpg" in found_urls
+
+    def test_extract_img_srcset_mixed_relative_external(self):
+        markup = '<img srcset="img-300.jpg 300w, https://cdn.example.com/img-600.jpg 600w">'
+        found = extract_urls_from_html(markup, "index.html")
+        srcset_hits = [hit for hit in found if hit.context == "html_srcset"]
+        assert len(srcset_hits) == 1
+        assert srcset_hits[0].url == "https://cdn.example.com/img-600.jpg"
+
+    def test_extract_link_stylesheet(self):
+        markup = '<link rel="stylesheet" href="https://fonts.googleapis.com/css">'
+        found = extract_urls_from_html(markup, "index.html")
+        assert len(found) == 1
+        assert found[0].url == "https://fonts.googleapis.com/css"
+        assert found[0].tag == "link"
+        assert found[0].attr == "href"
+
+    def test_extract_link_non_stylesheet_ignored(self):
+        markup = '<link rel="icon" href="https://example.com/favicon.ico">'
+        found = extract_urls_from_html(markup, "index.html")
+        assert not found
+
+    def test_extract_script_src(self):
+        markup = '<script src="https://cdn.example.com/lib.js"></script>'
+        found = extract_urls_from_html(markup, "index.html")
+        assert len(found) == 1
+        assert found[0].url == "https://cdn.example.com/lib.js"
+        assert found[0].tag == "script"
+
+    def test_extract_source_src(self):
+        markup = '<source src="https://example.com/video.mp4" type="video/mp4">'
+        found = extract_urls_from_html(markup, "index.html")
+        assert len(found) == 1
+        assert found[0].url == "https://example.com/video.mp4"
+        assert found[0].tag == "source"
+
+    def test_extract_source_srcset(self):
+        markup = '<source srcset="https://example.com/img-lg.jpg 1024w">'
+        found = extract_urls_from_html(markup, "index.html")
+        srcset_hits = [hit for hit in found if hit.context == "html_srcset"]
+        assert len(srcset_hits) == 1
+        assert srcset_hits[0].url == "https://example.com/img-lg.jpg"
+
+    def test_extract_background_image(self):
+        markup = """<div style="background-image: url('https://example.com/bg.png')">text</div>"""
+        found = extract_urls_from_html(markup, "index.html")
+        css_hits = [hit for hit in found if hit.context == "css_url"]
+        assert len(css_hits) == 1
+        assert css_hits[0].url == "https://example.com/bg.png"
+
+    def test_extract_style_block(self):
+        markup = """
+        <html><head>
+        <style>body { background: url('https://example.com/bg.png') }</style>
+        </head><body></body></html>
+        """
+        found = extract_urls_from_html(markup, "index.html")
+        assert len(found) == 1
+        assert found[0].url == "https://example.com/bg.png"
+
+    def test_relative_urls_ignored(self):
+        markup = '<img src="images/photo.jpg"><script src="js/app.js"></script>'
+        found = extract_urls_from_html(markup, "index.html")
+        assert not found
+
+    def test_data_urls_ignored(self):
+        markup = '<img src="data:image/png;base64,abc123">'
+        found = extract_urls_from_html(markup, "index.html")
+        assert not found
+
+    def test_empty_html(self):
+        found = extract_urls_from_html("", "index.html")
+        assert not found
+
+    def test_minimal_html(self):
+        found = extract_urls_from_html("<html><body></body></html>", "index.html")
+        assert not found
+
+    def test_multiple_elements(self):
+        markup = """
+        <html><body>
+        <img src="https://cdn.example.com/img1.jpg">
+        <img src="https://cdn.example.com/img2.jpg">
+        <script src="https://cdn.example.com/app.js"></script>
+        </body></html>
+        """
+        found = extract_urls_from_html(markup, "index.html")
+        assert len(found) == 3
+        found_urls = {hit.url for hit in found}
+        assert "https://cdn.example.com/img1.jpg" in found_urls
+        assert "https://cdn.example.com/img2.jpg" in found_urls
+        assert "https://cdn.example.com/app.js" in found_urls
+
+
+# ---------------------------------------------------------------------------
+# extract_urls_from_h5p_json tests
+# ---------------------------------------------------------------------------
+
+
+class TestExtractUrlsFromH5PJSON:
+    def test_extract_external_path(self):
+        payload = json.dumps(
+            {
+                "video": {
+                    "files": [
+                        {"path": "https://h5p.org/sites/default/files/h5p/iv.mp4"}
+                    ]
+                }
+            }
+        )
+        found = extract_urls_from_h5p_json(payload, "content.json")
+        assert len(found) == 1
+        assert found[0].url == "https://h5p.org/sites/default/files/h5p/iv.mp4"
+        assert found[0].context == "h5p_json"
+        assert found[0].source_file == "content.json"
+
+    def test_relative_path_ignored(self):
+        payload = json.dumps({"image": {"path": "images/photo.jpg"}})
+        found = extract_urls_from_h5p_json(payload, "content.json")
+        assert not found
+
+    def test_deeply_nested(self):
+        payload = json.dumps(
+            {
+                "level1": {
+                    "level2": {
+                        "level3": [
+                            {
+                                "path": "https://cdn.example.com/deep/resource.mp4"
+                            }
+                        ]
+                    }
+                }
+            }
+        )
+        found = extract_urls_from_h5p_json(payload, "content.json")
+        assert len(found) == 1
+        assert found[0].url == "https://cdn.example.com/deep/resource.mp4"
+
+    def test_multiple_paths(self):
+        payload = json.dumps(
+            {
+                "video": {"path": "https://example.com/video.mp4"},
+                "image": {"path": "https://example.com/image.jpg"},
+                "local": {"path": "images/local.jpg"},
+            }
+        )
+        found = extract_urls_from_h5p_json(payload, "content.json")
+        assert len(found) == 2
+        found_urls = {hit.url for hit in found}
+        assert "https://example.com/video.mp4" in found_urls
+        assert "https://example.com/image.jpg" in found_urls
+
+    def test_empty_json(self):
+        found = extract_urls_from_h5p_json("{}", "content.json")
+        assert not found
+
+    def test_non_string_path_ignored(self):
+        payload = json.dumps({"path": 42})
+        found = extract_urls_from_h5p_json(payload, "content.json")
+        assert not found
+
+
+# ---------------------------------------------------------------------------
+# rewrite_urls_in_css tests
+# ---------------------------------------------------------------------------
+
+
+class TestRewriteUrlsInCSS:
+    def test_rewrite_url(self):
+        stylesheet = "body { background: url('https://example.com/bg.png') }"
+        mapping = {"https://example.com/bg.png": "_external/example.com/bg.png"}
+        rewritten = rewrite_urls_in_css(stylesheet, mapping)
+        assert "url('_external/example.com/bg.png')" in rewritten
+        assert "https://example.com/bg.png" not in rewritten
+
+    def test_rewrite_import(self):
+        stylesheet = "@import 'https://fonts.googleapis.com/css?family=Roboto';"
+        mapping = {
+            "https://fonts.googleapis.com/css?family=Roboto": "_external/fonts.googleapis.com/css"
+        }
+        rewritten = rewrite_urls_in_css(stylesheet, mapping)
+        assert "_external/fonts.googleapis.com/css" in rewritten
+        assert "https://fonts.googleapis.com/css?family=Roboto" not in rewritten
+
+    def test_rewrite_preserves_unmapped(self):
+        stylesheet = """
+        body { background: url('https://example.com/bg.png') }
+        .other { background: url('https://other.com/bg.png') }
+        """
+        mapping = {"https://example.com/bg.png": "_external/example.com/bg.png"}
+        rewritten = rewrite_urls_in_css(stylesheet, mapping)
+        assert "_external/example.com/bg.png" in rewritten
+        assert "https://other.com/bg.png" in rewritten
+
+    def test_rewrite_url_no_quotes(self):
+        stylesheet = "body { background: url(https://example.com/bg.png) }"
+        mapping = {"https://example.com/bg.png": "_external/example.com/bg.png"}
+        rewritten = rewrite_urls_in_css(stylesheet, mapping)
+        assert "_external/example.com/bg.png" in rewritten
+
+    def test_empty_map(self):
+        stylesheet = "body { background: url('https://example.com/bg.png') }"
+        rewritten = rewrite_urls_in_css(stylesheet, {})
+        assert "https://example.com/bg.png" in rewritten
+
+
+# ---------------------------------------------------------------------------
+# rewrite_urls_in_html tests
+# ---------------------------------------------------------------------------
+
+
+class TestRewriteUrlsInHTML:
+    def test_rewrite_img_src(self):
+        markup = '<img src="https://cdn.example.com/photo.jpg">'
+        mapping = {
+            "https://cdn.example.com/photo.jpg": "_external/cdn.example.com/photo.jpg"
+        }
+        rewritten = rewrite_urls_in_html(markup, mapping)
+        assert "_external/cdn.example.com/photo.jpg" in rewritten
+        assert "https://cdn.example.com/photo.jpg" not in rewritten
+
+    def test_rewrite_srcset(self):
+        markup = '<img srcset="https://cdn.example.com/img-300.jpg 300w, https://cdn.example.com/img-600.jpg 600w">'
+        mapping = {
+            "https://cdn.example.com/img-300.jpg": "_external/cdn.example.com/img-300.jpg",
+            "https://cdn.example.com/img-600.jpg": "_external/cdn.example.com/img-600.jpg",
+        }
+        rewritten = rewrite_urls_in_html(markup, mapping)
+        assert "_external/cdn.example.com/img-300.jpg" in rewritten
+        assert "_external/cdn.example.com/img-600.jpg" in rewritten
+
+    def test_rewrite_style_block(self):
+        markup = "<style>body { background: url('https://example.com/bg.png') }</style>"
+        mapping = {"https://example.com/bg.png": "_external/example.com/bg.png"}
+        rewritten = rewrite_urls_in_html(markup, mapping)
+        assert "_external/example.com/bg.png" in rewritten
+        assert "https://example.com/bg.png" not in rewritten
+
+    def test_rewrite_inline_style(self):
+        markup = """<div style="background-image: url('https://example.com/bg.png')">text</div>"""
+        mapping = {"https://example.com/bg.png": "_external/example.com/bg.png"}
+        rewritten = rewrite_urls_in_html(markup, mapping)
+        assert "_external/example.com/bg.png" in rewritten
+
+    def test_rewrite_preserves_unmapped(self):
+        markup = """
+        <img src="https://cdn.example.com/img1.jpg">
+        <img src="https://cdn.example.com/img2.jpg">
+        """
+        mapping = {
+            "https://cdn.example.com/img1.jpg": "_external/cdn.example.com/img1.jpg"
+        }
+        rewritten = rewrite_urls_in_html(markup, mapping)
+        assert "_external/cdn.example.com/img1.jpg" in rewritten
+        assert "https://cdn.example.com/img2.jpg" in rewritten
+
+    def test_rewrite_link_href(self):
+        markup = '<link rel="stylesheet" href="https://fonts.googleapis.com/css">'
+        mapping = {
+            "https://fonts.googleapis.com/css": "_external/fonts.googleapis.com/css"
+        }
+        rewritten = rewrite_urls_in_html(markup, mapping)
+        assert "_external/fonts.googleapis.com/css" in rewritten
+
+    def test_rewrite_script_src(self):
+        markup = '<script src="https://cdn.example.com/lib.js"></script>'
+        mapping = {
+            "https://cdn.example.com/lib.js": "_external/cdn.example.com/lib.js"
+        }
+        rewritten = rewrite_urls_in_html(markup, mapping)
+        assert "_external/cdn.example.com/lib.js" in rewritten
+
+
+# ---------------------------------------------------------------------------
+# rewrite_urls_in_h5p_json tests
+# ---------------------------------------------------------------------------
+
+
+class TestRewriteUrlsInH5PJSON:
+    def test_rewrite_path(self):
+        data = json.dumps(
+            {"video": {"path": "https://h5p.org/sites/default/files/h5p/iv.mp4"}}
+        )
+        url_map = {
+            "https://h5p.org/sites/default/files/h5p/iv.mp4": "_external/h5p.org/sites/default/files/h5p/iv.mp4"
+        }
+        result = rewrite_urls_in_h5p_json(data, url_map)
+        parsed = json.loads(result)
+        assert (
+            parsed["video"]["path"]
+            == "_external/h5p.org/sites/default/files/h5p/iv.mp4"
+        )
+
+    def test_rewrite_preserves_unmapped(self):
+        data = json.dumps(
+            {
+                "video": {"path": "https://example.com/video.mp4"},
+                "image": {"path": "https://other.com/image.jpg"},
+            }
+        )
+        url_map = {
+            "https://example.com/video.mp4": "_external/example.com/video.mp4"
+        }
+        result = rewrite_urls_in_h5p_json(data, url_map)
+        parsed = json.loads(result)
+        assert parsed["video"]["path"] == "_external/example.com/video.mp4"
+        assert parsed["image"]["path"] == "https://other.com/image.jpg"
+
+    def test_rewrite_deeply_nested(self):
+        data = json.dumps(
+            {
+                "a": {
+                    "b": [{"path": "https://example.com/deep.mp4"}]
+                }
+            }
+        )
+        url_map = {
+            "https://example.com/deep.mp4": "_external/example.com/deep.mp4"
+        }
+        result = rewrite_urls_in_h5p_json(data, url_map)
+        parsed = json.loads(result)
+        assert parsed["a"]["b"][0]["path"] == "_external/example.com/deep.mp4"
+
+    def test_rewrite_relative_path_unchanged(self):
+        data = json.dumps({"image": {"path": "images/photo.jpg"}})
+        url_map = {
+            "https://example.com/video.mp4": "_external/example.com/video.mp4"
+        }
+        result = rewrite_urls_in_h5p_json(data, url_map)
+        parsed = json.loads(result)
+        assert parsed["image"]["path"] == "images/photo.jpg"
+
+    def test_empty_map(self):
+        data = json.dumps({"video": {"path": "https://example.com/video.mp4"}})
+        result = rewrite_urls_in_h5p_json(data, {})
+        parsed = json.loads(result)
+        assert parsed["video"]["path"] == "https://example.com/video.mp4"

From 6e533fd4453d33507275a52cb3cf9f9617c5cd2b Mon Sep 17 00:00:00 2001
From: rtibblesbot <richard+githubagent@learningequality.org>
Date: Fri, 13 Feb 2026 00:42:01 -0800
Subject: [PATCH 2/4] Add archive external reference processor for offline
 content bundling

Build on url_utils to create an archive-level processor that:
1. Extracts archives to a temp directory
2. Scans text-based files (HTML, CSS, JSON) for external URL references
3. Downloads external resources into an _external/ subdirectory
4. Rewrites references to point to local copies
5. Returns the temp directory path for create_predictable_zip()

Handles edge cases including download failures (preserves original URL),
duplicate URL deduplication, recursive CSS downloads (fonts referenced
by stylesheets), configurable URL blacklisting, and loop detection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ricecooker/utils/archive_assets.py | 256 +++++++++++++++
 tests/test_archive_assets.py       | 492 +++++++++++++++++++++++++++++
 2 files changed, 748 insertions(+)
 create mode 100644 ricecooker/utils/archive_assets.py
 create mode 100644 tests/test_archive_assets.py

diff --git a/ricecooker/utils/archive_assets.py b/ricecooker/utils/archive_assets.py
new file mode 100644
index 00000000..1acf2d03
--- /dev/null
+++ b/ricecooker/utils/archive_assets.py
@@ -0,0 +1,256 @@
+"""
+Archive external reference processor.
+
+Opens an archive (ZIP/H5P), scans text-based files for external URL references,
+downloads those resources, bundles them into the archive, and rewrites references
+to point to local copies.
+"""
+
+import logging
+import os
+import tempfile
+import zipfile
+
+from ricecooker.utils.downloader import make_request
+from ricecooker.utils.url_utils import (
+    derive_local_filename,
+    extract_urls_from_css,
+    extract_urls_from_h5p_json,
+    extract_urls_from_html,
+    rewrite_urls_in_css,
+    rewrite_urls_in_h5p_json,
+    rewrite_urls_in_html,
+)
+
+logger = logging.getLogger(__name__)
+
+# Map file extensions to content type for selecting the right extractor/rewriter
+_TEXT_EXTENSIONS = {
+    ".html": "html",
+    ".htm": "html",
+    ".xhtml": "html",
+    ".xml": "html",
+    ".css": "css",
+    ".json": "json",
+}
+
+
+def _is_h5p_content_json(filepath):
+    """Return True if ``filepath`` names an H5P ``content/content.json`` file."""
+    # Use forward slashes throughout; the leading "/" lets a single suffix test
+    # cover both the archive-root case and any nested occurrence.
+    posix_path = "/" + filepath.replace("\\", "/")
+    return posix_path.endswith("/content/content.json")
+
+
+def _detect_content_type(filepath):
+    """Map ``filepath`` to "html", "css" or "json", or None if it is not scanned."""
+    suffix = os.path.splitext(filepath)[1].lower()
+    if suffix != ".json":
+        return _TEXT_EXTENSIONS.get(suffix)
+    # Of the JSON files, only the H5P content/content.json is scanned.
+    is_h5p_content = _is_h5p_content_json(filepath)
+    return "json" if is_h5p_content else None
+
+
+def _compute_relative_path(from_file, to_file):
+    """Return the POSIX-style path from ``from_file``'s directory to ``to_file``."""
+    relative = os.path.relpath(to_file, start=os.path.dirname(from_file))
+    return "/".join(relative.split("\\"))
+
+
+def _is_blacklisted(url, blacklist):
+    """Return True if any ``blacklist`` entry occurs as a substring of ``url``."""
+    # A missing or empty blacklist never matches.
+    candidates = blacklist or ()
+    return any(fragment in url for fragment in candidates)
+
+
+def _download_external_url(url, dest_dir, local_path):
+    """
+    Download a single external URL to ``local_path`` inside ``dest_dir``.
+
+    Returns True on success, False (after logging a warning) on any failure.
+    """
+    full_path = os.path.join(dest_dir, local_path)
+    # Guard against path traversal — resolved path must stay within dest_dir
+    resolved = os.path.realpath(full_path)
+    if not resolved.startswith(os.path.realpath(dest_dir) + os.sep):
+        logger.warning("Path traversal detected for %s, skipping download", url)
+        return False
+    try:
+        # Created inside the try so a file/directory name clash only skips this URL
+        os.makedirs(os.path.dirname(full_path), exist_ok=True)
+        response = make_request(url)
+        if response is None or response.status_code != 200:
+            logger.warning("Failed to download %s (no response or non-200)", url)
+            return False
+        with open(full_path, "wb") as f:
+            f.write(response.content)
+        return True
+    except Exception:
+        logger.warning("Error downloading %s", url, exc_info=True)
+        return False
+
+
+def download_and_rewrite_external_refs(archive_path, url_blacklist=None):
+    """
+    Process an archive to download external URL references and rewrite them
+    to local paths.
+
+    Args:
+        archive_path: Path to the archive file (ZIP or H5P)
+        url_blacklist: Optional list of URL substrings to skip
+
+    Returns:
+        Path to a temporary directory containing the processed archive contents.
+        The caller is responsible for cleaning up this directory.
+
+    If anything fails, the temporary directory is removed before the error
+    propagates, because the caller never receives its path in that case.
+    """
+    import shutil  # local import: only used for cleanup on the failure path
+
+    temp_dir = tempfile.mkdtemp(prefix="ricecooker_archive_")
+    try:
+        with zipfile.ZipFile(archive_path, "r") as zf:
+            zf.extractall(temp_dir)
+
+        # Phase 1: Scan all text files for external URLs
+        all_urls = {}  # url -> derive_local_filename result
+        file_urls = {}  # filepath -> list of extracted URLs
+        for root, _dirs, filenames in os.walk(temp_dir):
+            for filename in filenames:
+                full_path = os.path.join(root, filename)
+                rel_path = os.path.relpath(full_path, temp_dir)
+                content_type = _detect_content_type(rel_path)
+                if content_type is None:
+                    continue
+
+                try:
+                    with open(full_path, "r", encoding="utf-8") as f:
+                        content = f.read()
+                except (UnicodeDecodeError, OSError):
+                    logger.warning("Could not read %s as text, skipping", rel_path)
+                    continue
+
+                if content_type == "html":
+                    extracted = extract_urls_from_html(content, rel_path)
+                elif content_type == "css":
+                    extracted = extract_urls_from_css(content, rel_path)
+                elif content_type == "json":
+                    extracted = extract_urls_from_h5p_json(content, rel_path)
+                else:
+                    continue
+
+                # Filter out blacklisted URLs
+                external = [
+                    e for e in extracted if not _is_blacklisted(e.url, url_blacklist)
+                ]
+                if external:
+                    file_urls[rel_path] = external
+                    for e in external:
+                        if e.url not in all_urls:
+                            all_urls[e.url] = derive_local_filename(e.url)
+
+        if not all_urls:
+            return temp_dir
+
+        # Phase 2: Download all external URLs
+        successful_downloads = set()
+        visited_urls = set()
+        for url, local_path in list(all_urls.items()):
+            if url in visited_urls:
+                continue
+            visited_urls.add(url)
+
+            if _download_external_url(url, temp_dir, local_path):
+                successful_downloads.add(url)
+
+                # CSS recursive download: scan downloaded CSS for more external refs
+                if local_path.endswith(".css") or "css" in local_path.split("?")[0]:
+                    _process_downloaded_css(
+                        temp_dir,
+                        local_path,
+                        all_urls,
+                        successful_downloads,
+                        visited_urls,
+                        url_blacklist,
+                    )
+
+        # Phase 3: Rewrite references in text files
+        for rel_path, extracted_list in file_urls.items():
+            # Failed downloads are left out so their original URL is preserved
+            url_map = {
+                e.url: _compute_relative_path(rel_path, all_urls[e.url])
+                for e in extracted_list
+                if e.url in successful_downloads
+            }
+            if not url_map:
+                continue
+
+            full_path = os.path.join(temp_dir, rel_path)
+            content_type = _detect_content_type(rel_path)
+
+            with open(full_path, "r", encoding="utf-8") as f:
+                content = f.read()
+
+            if content_type == "html":
+                content = rewrite_urls_in_html(content, url_map)
+            elif content_type == "css":
+                content = rewrite_urls_in_css(content, url_map)
+            elif content_type == "json":
+                content = rewrite_urls_in_h5p_json(content, url_map)
+
+            with open(full_path, "w", encoding="utf-8") as f:
+                f.write(content)
+    except BaseException:
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        raise
+
+    return temp_dir
+
+
+def _process_downloaded_css(
+    temp_dir, css_local_path, all_urls, successful_downloads, visited_urls, url_blacklist
+):
+    """
+    Fetch the external references of one downloaded CSS file and rewrite it.
+
+    ``all_urls``, ``successful_downloads`` and ``visited_urls`` are updated in
+    place. Stylesheets fetched from here are not scanned in turn.
+    """
+    full_path = os.path.join(temp_dir, css_local_path)
+    try:
+        with open(full_path, "r", encoding="utf-8") as f:
+            css_content = f.read()
+    except (UnicodeDecodeError, OSError):
+        # Not readable as UTF-8 text: leave the downloaded file untouched
+        return
+
+    found = extract_urls_from_css(css_content, css_local_path)
+    external = [ref for ref in found if not _is_blacklisted(ref.url, url_blacklist)]
+    if not external:
+        return
+
+    # First pass: fetch every URL that has not been attempted before.
+    css_url_map = {}
+    for ref in external:
+        if ref.url in visited_urls:
+            continue
+        visited_urls.add(ref.url)
+        local_path = all_urls[ref.url] = derive_local_filename(ref.url)
+        if _download_external_url(ref.url, temp_dir, local_path):
+            successful_downloads.add(ref.url)
+            css_url_map[ref.url] = _compute_relative_path(css_local_path, local_path)
+
+    # Second pass: also map URLs that an earlier step downloaded successfully.
+    for ref in external:
+        if ref.url in successful_downloads:
+            target = _compute_relative_path(css_local_path, all_urls[ref.url])
+            css_url_map.setdefault(ref.url, target)
+
+    if css_url_map:
+        rewritten = rewrite_urls_in_css(css_content, css_url_map)
+        with open(full_path, "w", encoding="utf-8") as f:
+            f.write(rewritten)
diff --git a/tests/test_archive_assets.py b/tests/test_archive_assets.py
new file mode 100644
index 00000000..0faa30c6
--- /dev/null
+++ b/tests/test_archive_assets.py
@@ -0,0 +1,492 @@
+"""
+Tests for ricecooker.utils.archive_assets — archive external reference processor.
+
+Tests create temporary ZIP archives on disk, call download_and_rewrite_external_refs,
+and verify the output directory contents. HTTP downloads are mocked.
+"""
+
+import json
+import os
+import shutil
+import tempfile
+import zipfile
+from unittest.mock import patch
+
+import pytest
+
+from ricecooker.utils.archive_assets import download_and_rewrite_external_refs
+
+
+class MockResponse:
+    """Mock HTTP response for mocked downloads."""
+
+    def __init__(self, content=b"downloaded content", status_code=200):
+        self.content = content
+        self.status_code = status_code
+
+    def raise_for_status(self):
+        if self.status_code >= 400:
+            from requests.exceptions import HTTPError
+
+            raise HTTPError(response=self)
+
+
+def _create_zip(files_dict):
+    """Create a temporary ZIP file from a dict of {path: content}."""
+    fd, zip_path = tempfile.mkstemp(suffix=".zip")
+    os.close(fd)
+    with zipfile.ZipFile(zip_path, "w") as zf:
+        for path, content in files_dict.items():
+            if isinstance(content, str):
+                content = content.encode("utf-8")
+            zf.writestr(path, content)
+    return zip_path
+
+
+@pytest.fixture
+def mock_download():
+    """Mock make_request to return predictable content."""
+    with patch("ricecooker.utils.archive_assets.make_request") as mock:
+        mock.return_value = MockResponse(content=b"downloaded content")
+        yield mock
+
+
+@pytest.fixture
+def mock_download_css_then_font():
+    """Mock make_request keyed by URL: CSS, font bytes, or 404 for unknown URLs."""
+    css_content = b"@font-face { src: url('https://fonts.example.com/roboto.woff2') }"
+    font_content = b"font-binary-data"
+
+    responses = {
+        "https://fonts.googleapis.com/css": MockResponse(content=css_content),
+        "https://fonts.example.com/roboto.woff2": MockResponse(content=font_content),
+    }
+
+    def side_effect(url, *args, **kwargs):
+        return responses.get(url, MockResponse(content=b"unknown", status_code=404))
+
+    with patch("ricecooker.utils.archive_assets.make_request") as mock:
+        mock.side_effect = side_effect
+        yield mock
+
+
+# ---------------------------------------------------------------------------
+# Basic functionality tests
+# ---------------------------------------------------------------------------
+
+
+class TestBasicFunctionality:
+    def test_html_with_external_img(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>'
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                # Image should be downloaded
+                img_path = os.path.join(
+                    result_dir, "_external", "cdn.example.com", "photo.jpg"
+                )
+                assert os.path.exists(img_path)
+
+                # HTML should be rewritten
+                with open(os.path.join(result_dir, "index.html")) as f:
+                    html = f.read()
+                assert "https://cdn.example.com/photo.jpg" not in html
+                assert "_external/cdn.example.com/photo.jpg" in html
+
+                mock_download.assert_called_once()
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_html_with_external_css(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><head><link rel="stylesheet" href="https://fonts.googleapis.com/css"></head></html>'
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                css_path = os.path.join(
+                    result_dir, "_external", "fonts.googleapis.com", "css"
+                )
+                assert os.path.exists(css_path)
+
+                with open(os.path.join(result_dir, "index.html")) as f:
+                    html = f.read()
+                assert "https://fonts.googleapis.com/css" not in html
+                assert "_external/fonts.googleapis.com/css" in html
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_html_with_external_script(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><head><script src="https://cdn.example.com/lib.js"></script></head></html>'
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                js_path = os.path.join(
+                    result_dir, "_external", "cdn.example.com", "lib.js"
+                )
+                assert os.path.exists(js_path)
+
+                with open(os.path.join(result_dir, "index.html")) as f:
+                    html = f.read()
+                assert "_external/cdn.example.com/lib.js" in html
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_css_with_external_font(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "styles/main.css": "@font-face { src: url('https://fonts.example.com/roboto.woff2') format('woff2'); }"
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                font_path = os.path.join(
+                    result_dir,
+                    "_external",
+                    "fonts.example.com",
+                    "roboto.woff2",
+                )
+                assert os.path.exists(font_path)
+
+                with open(os.path.join(result_dir, "styles", "main.css")) as f:
+                    css = f.read()
+                assert "https://fonts.example.com/roboto.woff2" not in css
+                # Path should be relative from styles/ to _external/
+                assert "../_external/fonts.example.com/roboto.woff2" in css
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_css_with_import(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "style.css": "@import 'https://fonts.googleapis.com/css?family=Roboto';"
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                css_path = os.path.join(
+                    result_dir,
+                    "_external",
+                    "fonts.googleapis.com",
+                    "css?family=Roboto",
+                )
+                assert os.path.exists(css_path)
+
+                with open(os.path.join(result_dir, "style.css")) as f:
+                    css = f.read()
+                assert "https://fonts.googleapis.com" not in css
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_h5p_with_external_video(self, mock_download):
+        content_json = json.dumps(
+            {
+                "video": {
+                    "files": [
+                        {
+                            "path": "https://h5p.org/sites/default/files/h5p/iv.mp4",
+                            "mime": "video/mp4",
+                        }
+                    ]
+                }
+            }
+        )
+        zip_path = _create_zip(
+            {
+                "content/content.json": content_json,
+                "h5p.json": '{"mainLibrary": "H5P.InteractiveVideo"}',
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                video_path = os.path.join(
+                    result_dir,
+                    "_external",
+                    "h5p.org",
+                    "sites",
+                    "default",
+                    "files",
+                    "h5p",
+                    "iv.mp4",
+                )
+                assert os.path.exists(video_path)
+
+                with open(
+                    os.path.join(result_dir, "content", "content.json")
+                ) as f:
+                    data = json.load(f)
+                path_val = data["video"]["files"][0]["path"]
+                assert "https://h5p.org" not in path_val
+                assert "_external/" in path_val
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+
+# ---------------------------------------------------------------------------
+# Edge case tests
+# ---------------------------------------------------------------------------
+
+
+class TestEdgeCases:
+    def test_relative_urls_unchanged(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><body><img src="images/photo.jpg"></body></html>',
+                "images/photo.jpg": b"fake-image-data",
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                with open(os.path.join(result_dir, "index.html")) as f:
+                    html = f.read()
+                assert 'src="images/photo.jpg"' in html
+                mock_download.assert_not_called()
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_data_urls_unchanged(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><body><img src="data:image/png;base64,abc123"></body></html>'
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                with open(os.path.join(result_dir, "index.html")) as f:
+                    html = f.read()
+                assert "data:image/png;base64,abc123" in html
+                mock_download.assert_not_called()
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_download_failure_preserves_original(self):
+        with patch("ricecooker.utils.archive_assets.make_request") as mock:
+            mock.return_value = None  # Simulate failed download
+            zip_path = _create_zip(
+                {
+                    "index.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>'
+                }
+            )
+            try:
+                result_dir = download_and_rewrite_external_refs(zip_path)
+                try:
+                    with open(os.path.join(result_dir, "index.html")) as f:
+                        html = f.read()
+                    # Original URL should be preserved when download fails
+                    assert "https://cdn.example.com/photo.jpg" in html
+                finally:
+                    shutil.rmtree(result_dir, ignore_errors=True)
+            finally:
+                os.unlink(zip_path)
+
+    def test_duplicate_urls_downloaded_once(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "page1.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>',
+                "page2.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>',
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                # Download should only happen once for the same URL
+                mock_download.assert_called_once()
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_css_recursive_download(self, mock_download_css_then_font):
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><head><link rel="stylesheet" href="https://fonts.googleapis.com/css"></head></html>'
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                # Both CSS and font should be downloaded
+                css_path = os.path.join(
+                    result_dir, "_external", "fonts.googleapis.com", "css"
+                )
+                font_path = os.path.join(
+                    result_dir,
+                    "_external",
+                    "fonts.example.com",
+                    "roboto.woff2",
+                )
+                assert os.path.exists(css_path)
+                assert os.path.exists(font_path)
+
+                # The downloaded CSS should have its font URL rewritten too
+                with open(css_path) as f:
+                    css = f.read()
+                assert "https://fonts.example.com/roboto.woff2" not in css
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_empty_archive(self, mock_download):
+        zip_path = _create_zip({"empty.txt": ""})
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                assert os.path.isdir(result_dir)
+                mock_download.assert_not_called()
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_binary_files_untouched(self, mock_download):
+        binary_content = b"\x89PNG\r\n\x1a\n" + b"\x00" * 100
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>',
+                "images/local.png": binary_content,
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                with open(os.path.join(result_dir, "images", "local.png"), "rb") as f:
+                    assert f.read() == binary_content
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_path_traversal_url_stays_in_temp_dir(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><body><img src="https://evil.com/../../../etc/passwd"></body></html>'
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                # The downloaded file must be inside the result directory
+                for root, _dirs, filenames in os.walk(result_dir):
+                    for filename in filenames:
+                        full_path = os.path.join(root, filename)
+                        assert os.path.realpath(full_path).startswith(
+                            os.path.realpath(result_dir)
+                        ), f"File {full_path} escapes result directory"
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_blacklisted_urls_skipped(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "index.html": """<html><body>
+                <img src="https://cdn.example.com/photo.jpg">
+                <img src="https://blocked.example.com/img.jpg">
+                </body></html>"""
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(
+                zip_path, url_blacklist=["blocked.example.com"]
+            )
+            try:
+                with open(os.path.join(result_dir, "index.html")) as f:
+                    html = f.read()
+                # Allowed URL should be downloaded and rewritten
+                assert "_external/cdn.example.com/photo.jpg" in html
+                # Blocked URL should remain unchanged
+                assert "https://blocked.example.com/img.jpg" in html
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+
+# ---------------------------------------------------------------------------
+# Integration shape tests
+# ---------------------------------------------------------------------------
+
+
+class TestIntegrationShape:
+    def test_returns_directory_path(self, mock_download):
+        zip_path = _create_zip({"index.html": "<html><body></body></html>"})
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                assert os.path.isdir(result_dir)
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_original_files_preserved(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>',
+                "images/local.png": b"png-data",
+                "scripts/app.js": "console.log('hello');",
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                assert os.path.exists(os.path.join(result_dir, "index.html"))
+                assert os.path.exists(
+                    os.path.join(result_dir, "images", "local.png")
+                )
+                assert os.path.exists(
+                    os.path.join(result_dir, "scripts", "app.js")
+                )
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_external_files_in_subdirectory(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>'
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                external_dir = os.path.join(result_dir, "_external")
+                assert os.path.isdir(external_dir)
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)

From 1eb128d344b0e7ebff5d5e40a888846103f1da2a Mon Sep 17 00:00:00 2001
From: rtibblesbot <richard+githubagent@learningequality.org>
Date: Fri, 13 Feb 2026 00:42:08 -0800
Subject: [PATCH 3/4] Integrate external reference processing into archive
 pipeline handlers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wire download_and_rewrite_external_refs() into the archive pipeline,
running before create_predictable_zip() in handle_file(). This ensures
external URLs in H5P, HTML5, and IMSCP archives are downloaded and
bundled for offline use in Kolibri.

The processing step runs for all ArchiveProcessingBaseHandler subclasses
(H5P, HTML5, IMSCP) and gracefully handles failures — if processing
raises, the original archive is used unchanged.

Closes #233

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ricecooker/utils/pipeline/convert.py |  26 +++++-
 tests/pipeline/test_convert.py       | 134 +++++++++++++++++++++++++++
 2 files changed, 159 insertions(+), 1 deletion(-)

diff --git a/ricecooker/utils/pipeline/convert.py b/ricecooker/utils/pipeline/convert.py
index 4c9e209b..fef557e7 100644
--- a/ricecooker/utils/pipeline/convert.py
+++ b/ricecooker/utils/pipeline/convert.py
@@ -46,6 +46,7 @@
 from ricecooker.utils.videos import validate_media_file
 from ricecooker.utils.videos import VideoCompressionError
 from ricecooker.utils.youtube import get_language_with_alpha2_fallback
+from ricecooker.utils.archive_assets import download_and_rewrite_external_refs
 from ricecooker.utils.zip import create_predictable_zip
 
 
@@ -194,11 +195,32 @@ def FILE_TYPE(self) -> str:
     def validate_archive(self, path: str):
         pass
 
+    def _process_external_refs(self, path):
+        """
+        Process external URL references in the archive.
+
+        Returns the path to process — either a temp directory with downloaded
+        assets, or the original path if processing fails or finds nothing.
+        """
+        try:
+            return download_and_rewrite_external_refs(path)
+        except Exception as e:
+            config.LOGGER.warning(
+                "Failed to process external references in %s: %s. "
+                "Continuing with original archive.",
+                path,
+                e,
+            )
+            return path
+
     def handle_file(self, path, audio_settings=None, video_settings=None):
         self.validate_archive(path)
 
         ext = extract_path_ext(path)
 
+        # Download external references and get processed directory
+        processed_path = self._process_external_refs(path)
+
         # Create partial for reading & compressing subfiles
         file_converter = partial(
             self._read_and_compress_archive_file,
@@ -208,7 +230,7 @@ def handle_file(self, path, audio_settings=None, video_settings=None):
         )
         # create_predictable_zip will iterate over subfiles, call file_converter
         processed_zip_path = create_predictable_zip(
-            path, file_converter=file_converter if config.COMPRESS else None
+            processed_path, file_converter=file_converter if config.COMPRESS else None
         )
 
         with self.write_file(ext) as fh:
@@ -217,6 +239,8 @@ def handle_file(self, path, audio_settings=None, video_settings=None):
 
         # Clean up
         os.unlink(processed_zip_path)
+        if processed_path != path:
+            shutil.rmtree(processed_path, ignore_errors=True)
 
     @contextmanager
     def open_and_verify_archive(self, path):
diff --git a/tests/pipeline/test_convert.py b/tests/pipeline/test_convert.py
index abbdcf99..41f803f9 100644
--- a/tests/pipeline/test_convert.py
+++ b/tests/pipeline/test_convert.py
@@ -1,13 +1,26 @@
 """Tests for audio and video compression in archive files."""
+import json
 import os
 import tempfile
 import zipfile
 from unittest.mock import patch
 
+from ricecooker import config
 from ricecooker.classes.files import H5PFile
 from ricecooker.classes.files import HTMLZipFile
 
 
+class MockResponse:
+    """Mock HTTP response for archive asset downloads."""
+
+    def __init__(self, content=b"downloaded content", status_code=200):
+        self.content = content
+        self.status_code = status_code
+
+    def raise_for_status(self):
+        pass
+
+
 def test_html5_archive_with_mp4_compression(video_file, audio_file):
     """Test that MP4 and MP3 files within HTML5 archives are compressed when compression is enabled."""
     # Create temporary HTML5 archive with media files
@@ -120,3 +133,124 @@ def test_archive_no_compression_when_disabled(video_file, audio_file):
 
     finally:
         os.unlink(temp_archive.name)
+
+
+def test_html5_archive_external_refs_downloaded():
+    """External URLs in HTML5 archives are downloaded and rewritten."""
+    temp_archive = tempfile.NamedTemporaryFile(suffix=".zip", delete=False)
+    temp_archive.close()
+
+    try:
+        with zipfile.ZipFile(temp_archive.name, "w") as zf:
+            zf.writestr(
+                "index.html",
+                '<html><body>Content here <img src="https://cdn.example.com/photo.jpg"></body></html>',
+            )
+
+        with patch("ricecooker.utils.archive_assets.make_request") as mock_request:
+            mock_request.return_value = MockResponse(content=b"fake-image-data")
+
+            with patch("ricecooker.config.COMPRESS", False):
+                html_file = HTMLZipFile(temp_archive.name)
+                result = html_file.process_file()
+
+        assert result is not None, "Processing should succeed"
+
+        # Verify the output ZIP contains the downloaded file and rewritten HTML
+        result_path = config.get_storage_path(result)
+        with zipfile.ZipFile(result_path, "r") as zf:
+            names = zf.namelist()
+            assert any(
+                "_external/" in n for n in names
+            ), f"Expected _external/ directory in output ZIP, got: {names}"
+
+            html = zf.read("index.html").decode("utf-8")
+            assert (
+                "https://cdn.example.com/photo.jpg" not in html
+            ), "External URL should be rewritten"
+            assert "_external/" in html, "Should reference local _external/ path"
+
+    finally:
+        os.unlink(temp_archive.name)
+
+
+def test_h5p_archive_external_video_downloaded():
+    """External video URLs in H5P content.json are downloaded and rewritten."""
+    temp_archive = tempfile.NamedTemporaryFile(suffix=".h5p", delete=False)
+    temp_archive.close()
+
+    content = json.dumps(
+        {
+            "video": {
+                "files": [
+                    {
+                        "path": "https://h5p.org/sites/default/files/h5p/iv.mp4",
+                        "mime": "video/mp4",
+                    }
+                ]
+            }
+        }
+    )
+
+    try:
+        with zipfile.ZipFile(temp_archive.name, "w") as zf:
+            zf.writestr("h5p.json", '{"mainLibrary": "H5P.InteractiveVideo"}')
+            zf.writestr("content/content.json", content)
+
+        with patch("ricecooker.utils.archive_assets.make_request") as mock_request:
+            mock_request.return_value = MockResponse(content=b"fake-video-data")
+
+            with patch("ricecooker.config.COMPRESS", False):
+                h5p_file = H5PFile(temp_archive.name)
+                result = h5p_file.process_file()
+
+        assert result is not None, "Processing should succeed"
+
+        result_path = config.get_storage_path(result)
+        with zipfile.ZipFile(result_path, "r") as zf:
+            names = zf.namelist()
+            assert any(
+                "_external/" in n for n in names
+            ), f"Expected _external/ directory in output ZIP, got: {names}"
+
+            data = json.loads(zf.read("content/content.json"))
+            video_path = data["video"]["files"][0]["path"]
+            assert "https://h5p.org" not in video_path, "External URL should be rewritten"
+            assert "_external/" in video_path, "Should reference local path"
+
+    finally:
+        os.unlink(temp_archive.name)
+
+
+def test_archive_external_refs_failure_graceful():
+    """If external ref downloading fails, archive still processes successfully."""
+    temp_archive = tempfile.NamedTemporaryFile(suffix=".zip", delete=False)
+    temp_archive.close()
+
+    try:
+        with zipfile.ZipFile(temp_archive.name, "w") as zf:
+            # Use unique content to avoid cache hit from previous test
+            zf.writestr(
+                "index.html",
+                '<html><body>Unique failure test content <img src="https://cdn.example.com/photo.jpg"></body></html>',
+            )
+
+        with patch(
+            "ricecooker.utils.pipeline.convert.download_and_rewrite_external_refs"
+        ) as mock_process:
+            mock_process.side_effect = Exception("Network error")
+
+            with patch("ricecooker.config.COMPRESS", False):
+                html_file = HTMLZipFile(temp_archive.name)
+                result = html_file.process_file()
+
+        assert result is not None, "Processing should succeed even on external ref failure"
+
+        # Original URL should be preserved since processing failed
+        result_path = config.get_storage_path(result)
+        with zipfile.ZipFile(result_path, "r") as zf:
+            html = zf.read("index.html").decode("utf-8")
+            assert "https://cdn.example.com/photo.jpg" in html
+
+    finally:
+        os.unlink(temp_archive.name)

From 8a0641158d136742cfc94818ccb6b2d087cf351a Mon Sep 17 00:00:00 2001
From: rtibblesbot <richard+githubagent@learningequality.org>
Date: Fri, 13 Feb 2026 07:14:01 -0800
Subject: [PATCH 4/4] Address review feedback: fix bare exceptions, linting,
 and formatting

- Replace broad `except Exception` handlers with specific exception types
  (OSError, zipfile.BadZipFile, ValueError, IOError)
- Reduce McCabe complexity by extracting helper functions:
  - archive_assets.py: split scan/download/rewrite phases
  - url_utils.py: split HTML extract/rewrite into per-selector helpers
- Fix import ordering (reorder-python-imports) and formatting (black)
- Update test to raise OSError instead of a generic Exception

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ricecooker/utils/archive_assets.py   | 173 +++++++++++++++------------
 ricecooker/utils/pipeline/convert.py |   6 +-
 ricecooker/utils/url_utils.py        | 121 +++++++++++--------
 tests/pipeline/test_convert.py       |  11 +-
 tests/test_archive_assets.py         |  24 +---
 tests/test_url_utils.py              |  59 +++------
 6 files changed, 202 insertions(+), 192 deletions(-)

diff --git a/ricecooker/utils/archive_assets.py b/ricecooker/utils/archive_assets.py
index 1acf2d03..f464fde4 100644
--- a/ricecooker/utils/archive_assets.py
+++ b/ricecooker/utils/archive_assets.py
@@ -12,15 +12,13 @@
 import zipfile
 
 from ricecooker.utils.downloader import make_request
-from ricecooker.utils.url_utils import (
-    derive_local_filename,
-    extract_urls_from_css,
-    extract_urls_from_h5p_json,
-    extract_urls_from_html,
-    rewrite_urls_in_css,
-    rewrite_urls_in_h5p_json,
-    rewrite_urls_in_html,
-)
+from ricecooker.utils.url_utils import derive_local_filename
+from ricecooker.utils.url_utils import extract_urls_from_css
+from ricecooker.utils.url_utils import extract_urls_from_h5p_json
+from ricecooker.utils.url_utils import extract_urls_from_html
+from ricecooker.utils.url_utils import rewrite_urls_in_css
+from ricecooker.utils.url_utils import rewrite_urls_in_h5p_json
+from ricecooker.utils.url_utils import rewrite_urls_in_html
 
 logger = logging.getLogger(__name__)
 
@@ -88,31 +86,33 @@ def _download_external_url(url, dest_dir, local_path):
         with open(full_path, "wb") as f:
             f.write(response.content)
         return True
-    except Exception:
+    except (OSError, IOError, ValueError):
         logger.warning("Error downloading %s", url, exc_info=True)
         return False
 
 
-def download_and_rewrite_external_refs(archive_path, url_blacklist=None):
-    """
-    Process an archive to download external URL references and rewrite them
-    to local paths.
-
-    Args:
-        archive_path: Path to the archive file (ZIP or H5P)
-        url_blacklist: Optional list of URL substrings to skip
-
-    Returns:
-        Path to a temporary directory containing the processed archive contents.
-        The caller is responsible for cleaning up this directory.
-    """
-    # Extract archive to temp directory
-    temp_dir = tempfile.mkdtemp(prefix="ricecooker_archive_")
-
-    with zipfile.ZipFile(archive_path, "r") as zf:
-        zf.extractall(temp_dir)
-
-    # Phase 1: Scan all text files for external URLs
+def _extract_urls_from_file(full_path, rel_path, content_type):
+    """Extract external URLs from a single file. Returns list or None on error."""
+    try:
+        with open(full_path, "r", encoding="utf-8") as f:
+            content = f.read()
+    except (UnicodeDecodeError, OSError):
+        logger.warning("Could not read %s as text, skipping", rel_path)
+        return None
+
+    extractors = {
+        "html": extract_urls_from_html,
+        "css": extract_urls_from_css,
+        "json": extract_urls_from_h5p_json,
+    }
+    extractor = extractors.get(content_type)
+    if extractor is None:
+        return None
+    return extractor(content, rel_path)
+
+
+def _scan_archive_for_urls(temp_dir, url_blacklist):
+    """Scan all text files in an extracted archive for external URLs."""
     all_urls = {}  # url -> derive_local_filename result
     file_urls = {}  # filepath -> list of extracted URLs
 
@@ -121,43 +121,27 @@ def download_and_rewrite_external_refs(archive_path, url_blacklist=None):
             full_path = os.path.join(root, filename)
             rel_path = os.path.relpath(full_path, temp_dir)
             content_type = _detect_content_type(rel_path)
-
             if content_type is None:
                 continue
 
-            try:
-                with open(full_path, "r", encoding="utf-8") as f:
-                    content = f.read()
-            except (UnicodeDecodeError, OSError):
-                logger.warning("Could not read %s as text, skipping", rel_path)
-                continue
-
-            if content_type == "html":
-                extracted = extract_urls_from_html(content, rel_path)
-            elif content_type == "css":
-                extracted = extract_urls_from_css(content, rel_path)
-            elif content_type == "json":
-                extracted = extract_urls_from_h5p_json(content, rel_path)
-            else:
+            extracted = _extract_urls_from_file(full_path, rel_path, content_type)
+            if extracted is None:
                 continue
 
-            # Filter out blacklisted URLs
             external = [
-                e
-                for e in extracted
-                if not _is_blacklisted(e.url, url_blacklist)
+                e for e in extracted if not _is_blacklisted(e.url, url_blacklist)
             ]
-
             if external:
                 file_urls[rel_path] = external
                 for e in external:
                     if e.url not in all_urls:
                         all_urls[e.url] = derive_local_filename(e.url)
 
-    if not all_urls:
-        return temp_dir
+    return all_urls, file_urls
 
-    # Phase 2: Download all external URLs
+
+def _download_all_urls(temp_dir, all_urls, url_blacklist):
+    """Download all external URLs, including recursive CSS references."""
     successful_downloads = set()
     visited_urls = set()
 
@@ -168,8 +152,6 @@ def download_and_rewrite_external_refs(archive_path, url_blacklist=None):
 
         if _download_external_url(url, temp_dir, local_path):
             successful_downloads.add(url)
-
-            # CSS recursive download: scan downloaded CSS for more external refs
             if local_path.endswith(".css") or "css" in local_path.split("?")[0]:
                 _process_downloaded_css(
                     temp_dir,
@@ -180,8 +162,55 @@ def download_and_rewrite_external_refs(archive_path, url_blacklist=None):
                     url_blacklist,
                 )
 
-    # Phase 3: Rewrite references in text files
-    url_map_by_file = {}
+    return successful_downloads
+
+
+def _rewrite_file(temp_dir, rel_path, url_map):
+    """Rewrite URL references in a single file."""
+    full_path = os.path.join(temp_dir, rel_path)
+    content_type = _detect_content_type(rel_path)
+
+    with open(full_path, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    rewriters = {
+        "html": rewrite_urls_in_html,
+        "css": rewrite_urls_in_css,
+        "json": rewrite_urls_in_h5p_json,
+    }
+    rewriter = rewriters.get(content_type)
+    if rewriter:
+        content = rewriter(content, url_map)
+
+    with open(full_path, "w", encoding="utf-8") as f:
+        f.write(content)
+
+
+def download_and_rewrite_external_refs(archive_path, url_blacklist=None):
+    """
+    Process an archive to download external URL references and rewrite them
+    to local paths.
+
+    Args:
+        archive_path: Path to the archive file (ZIP or H5P)
+        url_blacklist: Optional list of URL substrings to skip
+
+    Returns:
+        Path to a temporary directory containing the processed archive contents.
+        The caller is responsible for cleaning up this directory.
+    """
+    temp_dir = tempfile.mkdtemp(prefix="ricecooker_archive_")
+
+    with zipfile.ZipFile(archive_path, "r") as zf:
+        zf.extractall(temp_dir)
+
+    all_urls, file_urls = _scan_archive_for_urls(temp_dir, url_blacklist)
+
+    if not all_urls:
+        return temp_dir
+
+    successful_downloads = _download_all_urls(temp_dir, all_urls, url_blacklist)
+
     for rel_path, extracted_list in file_urls.items():
         url_map = {}
         for e in extracted_list:
@@ -189,30 +218,18 @@ def download_and_rewrite_external_refs(archive_path, url_blacklist=None):
                 local_path = all_urls[e.url]
                 url_map[e.url] = _compute_relative_path(rel_path, local_path)
         if url_map:
-            url_map_by_file[rel_path] = url_map
-
-    for rel_path, url_map in url_map_by_file.items():
-        full_path = os.path.join(temp_dir, rel_path)
-        content_type = _detect_content_type(rel_path)
-
-        with open(full_path, "r", encoding="utf-8") as f:
-            content = f.read()
-
-        if content_type == "html":
-            content = rewrite_urls_in_html(content, url_map)
-        elif content_type == "css":
-            content = rewrite_urls_in_css(content, url_map)
-        elif content_type == "json":
-            content = rewrite_urls_in_h5p_json(content, url_map)
-
-        with open(full_path, "w", encoding="utf-8") as f:
-            f.write(content)
+            _rewrite_file(temp_dir, rel_path, url_map)
 
     return temp_dir
 
 
 def _process_downloaded_css(
-    temp_dir, css_local_path, all_urls, successful_downloads, visited_urls, url_blacklist
+    temp_dir,
+    css_local_path,
+    all_urls,
+    successful_downloads,
+    visited_urls,
+    url_blacklist,
 ):
     """Scan a downloaded CSS file for additional external references and download them."""
     full_path = os.path.join(temp_dir, css_local_path)
@@ -223,9 +240,7 @@ def _process_downloaded_css(
         return
 
     extracted = extract_urls_from_css(css_content, css_local_path)
-    external = [
-        e for e in extracted if not _is_blacklisted(e.url, url_blacklist)
-    ]
+    external = [e for e in extracted if not _is_blacklisted(e.url, url_blacklist)]
 
     if not external:
         return
diff --git a/ricecooker/utils/pipeline/convert.py b/ricecooker/utils/pipeline/convert.py
index fef557e7..faf533cc 100644
--- a/ricecooker/utils/pipeline/convert.py
+++ b/ricecooker/utils/pipeline/convert.py
@@ -2,6 +2,7 @@
 To avoid making the pipeline overly convoluted, these handlers
 both validate and convert files.
 """
+
 import json
 import os
 import shutil
@@ -31,6 +32,7 @@
 from .file_handler import StageHandler
 from ricecooker import config
 from ricecooker.exceptions import UnknownFileTypeError
+from ricecooker.utils.archive_assets import download_and_rewrite_external_refs
 from ricecooker.utils.audio import AudioCompressionError
 from ricecooker.utils.audio import compress_audio
 from ricecooker.utils.caching import generate_key
@@ -46,10 +48,8 @@
 from ricecooker.utils.videos import validate_media_file
 from ricecooker.utils.videos import VideoCompressionError
 from ricecooker.utils.youtube import get_language_with_alpha2_fallback
-from ricecooker.utils.archive_assets import download_and_rewrite_external_refs
 from ricecooker.utils.zip import create_predictable_zip
 
-
 CONVERTIBLE_FORMATS = {p.id: p.convertible_formats for p in format_presets.PRESETLIST}
 
 
@@ -204,7 +204,7 @@ def _process_external_refs(self, path):
         """
         try:
             return download_and_rewrite_external_refs(path)
-        except Exception as e:
+        except (OSError, zipfile.BadZipFile, ValueError) as e:
             config.LOGGER.warning(
                 "Failed to process external references in %s: %s. "
                 "Continuing with original archive.",
diff --git a/ricecooker/utils/url_utils.py b/ricecooker/utils/url_utils.py
index b2aa913b..62055d66 100644
--- a/ricecooker/utils/url_utils.py
+++ b/ricecooker/utils/url_utils.py
@@ -113,28 +113,8 @@ def _parse_srcset(srcset_value):
     return urls
 
 
-def extract_urls_from_html(html_content, source_file=""):
-    """
-    Extract all external URL references from HTML content.
-
-    Finds URLs in:
-    - img[src], script[src], source[src]
-    - img[srcset], source[srcset]
-    - link[rel=stylesheet][href]
-    - inline style attributes with url()
-    - <style> blocks
-
-    Skips data: URIs and relative URLs.
-    Returns list of ExtractedURL instances.
-    """
-    if not html_content or not html_content.strip():
-        return []
-
-    soup = BeautifulSoup(html_content, "html.parser")
-    results = []
-    seen = set()
-
-    # img[src], script[src], source[src]
+def _extract_src_urls(soup, source_file, seen, results):
+    """Extract external URLs from src attributes on img, script, source tags."""
     for tag_name in ("img", "script", "source"):
         for node in soup.find_all(tag_name, src=True):
             url = node["src"]
@@ -150,7 +130,9 @@ def extract_urls_from_html(html_content, source_file=""):
                     )
                 )
 
-    # img[srcset], source[srcset]
+
+def _extract_srcset_urls(soup, source_file, seen, results):
+    """Extract external URLs from srcset attributes on img, source tags."""
     for tag_name in ("img", "source"):
         for node in soup.find_all(tag_name, srcset=True):
             for url in _parse_srcset(node["srcset"]):
@@ -166,7 +148,9 @@ def extract_urls_from_html(html_content, source_file=""):
                         )
                     )
 
-    # link[rel=stylesheet][href] — use "rel" in node.attrs (PR #636 fix)
+
+def _extract_stylesheet_urls(soup, source_file, seen, results):
+    """Extract external URLs from link[rel=stylesheet] href attributes."""
     for node in soup.find_all("link", href=True):
         if "rel" in node.attrs and "stylesheet" in node.get("rel", []):
             url = node["href"]
@@ -182,7 +166,9 @@ def extract_urls_from_html(html_content, source_file=""):
                     )
                 )
 
-    # Inline style attributes with url()
+
+def _extract_style_urls(soup, source_file, seen, results):
+    """Extract external URLs from inline style attributes and style blocks."""
     for node in soup.find_all(style=True):
         style_val = node.get("style", "")
         for extracted in extract_urls_from_css(style_val, source_file):
@@ -190,7 +176,6 @@ def extract_urls_from_html(html_content, source_file=""):
                 seen.add(extracted.url)
                 results.append(extracted)
 
-    # <style> blocks
     for style_node in soup.find_all("style"):
         if style_node.string:
             for extracted in extract_urls_from_css(style_node.string, source_file):
@@ -198,6 +183,33 @@ def extract_urls_from_html(html_content, source_file=""):
                     seen.add(extracted.url)
                     results.append(extracted)
 
+
+def extract_urls_from_html(html_content, source_file=""):
+    """
+    Extract all external URL references from HTML content.
+
+    Finds URLs in:
+    - img[src], script[src], source[src]
+    - img[srcset], source[srcset]
+    - link[rel=stylesheet][href]
+    - inline style attributes with url()
+    - <style> blocks
+
+    Skips data: URIs and relative URLs.
+    Returns list of ExtractedURL instances.
+    """
+    if not html_content or not html_content.strip():
+        return []
+
+    soup = BeautifulSoup(html_content, "html.parser")
+    results = []
+    seen = set()
+
+    _extract_src_urls(soup, source_file, seen, results)
+    _extract_srcset_urls(soup, source_file, seen, results)
+    _extract_stylesheet_urls(soup, source_file, seen, results)
+    _extract_style_urls(soup, source_file, seen, results)
+
     return results
 
 
@@ -261,29 +273,17 @@ def _repl_import(match):
     return result
 
 
-def rewrite_urls_in_html(html_content, url_map):
-    """
-    Rewrite URL references in HTML content using the provided mapping.
-
-    Handles the same selectors as extract_urls_from_html:
-    img/script/source src, img/source srcset, link[stylesheet] href,
-    inline styles, and <style> blocks.
-
-    URLs not in the map are left unchanged.
-    """
-    if not html_content or not html_content.strip():
-        return html_content
-
-    soup = BeautifulSoup(html_content, "html.parser")
-
-    # img[src], script[src], source[src]
+def _rewrite_src_attrs(soup, url_map):
+    """Rewrite src attributes on img, script, source tags."""
     for tag_name in ("img", "script", "source"):
         for node in soup.find_all(tag_name, src=True):
             url = node["src"]
             if url in url_map:
                 node["src"] = url_map[url]
 
-    # img[srcset], source[srcset]
+
+def _rewrite_srcset_attrs(soup, url_map):
+    """Rewrite srcset attributes on img, source tags."""
     for tag_name in ("img", "source"):
         for node in soup.find_all(tag_name, srcset=True):
             entries = []
@@ -292,29 +292,52 @@ def rewrite_urls_in_html(html_content, url_map):
                 if not entry:
                     continue
                 parts = entry.split()
-                url = parts[0]
-                if url in url_map:
-                    parts[0] = url_map[url]
+                if parts[0] in url_map:
+                    parts[0] = url_map[parts[0]]
                 entries.append(" ".join(parts))
             node["srcset"] = ", ".join(entries)
 
-    # link[rel=stylesheet][href]
+
+def _rewrite_stylesheet_hrefs(soup, url_map):
+    """Rewrite href attributes on link[rel=stylesheet] tags."""
     for node in soup.find_all("link", href=True):
         if "rel" in node.attrs and "stylesheet" in node.get("rel", []):
             url = node["href"]
             if url in url_map:
                 node["href"] = url_map[url]
 
-    # Inline style attributes
+
+def _rewrite_style_content(soup, url_map):
+    """Rewrite URLs in inline style attributes and style blocks."""
     for node in soup.find_all(style=True):
         style_val = node.get("style", "")
         node["style"] = rewrite_urls_in_css(style_val, url_map)
 
-    # <style> blocks
     for style_node in soup.find_all("style"):
         if style_node.string:
             style_node.string = rewrite_urls_in_css(style_node.string, url_map)
 
+
+def rewrite_urls_in_html(html_content, url_map):
+    """
+    Rewrite URL references in HTML content using the provided mapping.
+
+    Handles the same selectors as extract_urls_from_html:
+    img/script/source src, img/source srcset, link[stylesheet] href,
+    inline styles, and <style> blocks.
+
+    URLs not in the map are left unchanged.
+    """
+    if not html_content or not html_content.strip():
+        return html_content
+
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    _rewrite_src_attrs(soup, url_map)
+    _rewrite_srcset_attrs(soup, url_map)
+    _rewrite_stylesheet_hrefs(soup, url_map)
+    _rewrite_style_content(soup, url_map)
+
     return str(soup)
 
 
diff --git a/tests/pipeline/test_convert.py b/tests/pipeline/test_convert.py
index 41f803f9..04043156 100644
--- a/tests/pipeline/test_convert.py
+++ b/tests/pipeline/test_convert.py
@@ -1,4 +1,5 @@
 """Tests for audio and video compression in archive files."""
+
 import json
 import os
 import tempfile
@@ -215,7 +216,9 @@ def test_h5p_archive_external_video_downloaded():
 
             data = json.loads(zf.read("content/content.json"))
             video_path = data["video"]["files"][0]["path"]
-            assert "https://h5p.org" not in video_path, "External URL should be rewritten"
+            assert (
+                "https://h5p.org" not in video_path
+            ), "External URL should be rewritten"
             assert "_external/" in video_path, "Should reference local path"
 
     finally:
@@ -238,13 +241,15 @@ def test_archive_external_refs_failure_graceful():
         with patch(
             "ricecooker.utils.pipeline.convert.download_and_rewrite_external_refs"
         ) as mock_process:
-            mock_process.side_effect = Exception("Network error")
+            mock_process.side_effect = OSError("Network error")
 
             with patch("ricecooker.config.COMPRESS", False):
                 html_file = HTMLZipFile(temp_archive.name)
                 result = html_file.process_file()
 
-        assert result is not None, "Processing should succeed even on external ref failure"
+        assert (
+            result is not None
+        ), "Processing should succeed even on external ref failure"
 
         # Original URL should be preserved since processing failed
         result_path = config.get_storage_path(result)
diff --git a/tests/test_archive_assets.py b/tests/test_archive_assets.py
index 0faa30c6..46e3ddb7 100644
--- a/tests/test_archive_assets.py
+++ b/tests/test_archive_assets.py
@@ -177,9 +177,7 @@ def test_css_with_external_font(self, mock_download):
 
     def test_css_with_import(self, mock_download):
         zip_path = _create_zip(
-            {
-                "style.css": "@import 'https://fonts.googleapis.com/css?family=Roboto';"
-            }
+            {"style.css": "@import 'https://fonts.googleapis.com/css?family=Roboto';"}
         )
         try:
             result_dir = download_and_rewrite_external_refs(zip_path)
@@ -234,9 +232,7 @@ def test_h5p_with_external_video(self, mock_download):
                 )
                 assert os.path.exists(video_path)
 
-                with open(
-                    os.path.join(result_dir, "content", "content.json")
-                ) as f:
+                with open(os.path.join(result_dir, "content", "content.json")) as f:
                     data = json.load(f)
                 path_val = data["video"]["files"][0]["path"]
                 assert "https://h5p.org" not in path_val
@@ -410,14 +406,10 @@ def test_path_traversal_url_stays_in_temp_dir(self, mock_download):
             os.unlink(zip_path)
 
     def test_blacklisted_urls_skipped(self, mock_download):
-        zip_path = _create_zip(
-            {
-                "index.html": """<html><body>
+        zip_path = _create_zip({"index.html": """<html><body>
                 <img src="https://cdn.example.com/photo.jpg">
                 <img src="https://blocked.example.com/img.jpg">
-                </body></html>"""
-            }
-        )
+                </body></html>"""})
         try:
             result_dir = download_and_rewrite_external_refs(
                 zip_path, url_blacklist=["blocked.example.com"]
@@ -464,12 +456,8 @@ def test_original_files_preserved(self, mock_download):
             result_dir = download_and_rewrite_external_refs(zip_path)
             try:
                 assert os.path.exists(os.path.join(result_dir, "index.html"))
-                assert os.path.exists(
-                    os.path.join(result_dir, "images", "local.png")
-                )
-                assert os.path.exists(
-                    os.path.join(result_dir, "scripts", "app.js")
-                )
+                assert os.path.exists(os.path.join(result_dir, "images", "local.png"))
+                assert os.path.exists(os.path.join(result_dir, "scripts", "app.js"))
             finally:
                 shutil.rmtree(result_dir, ignore_errors=True)
         finally:
diff --git a/tests/test_url_utils.py b/tests/test_url_utils.py
index 411f72f1..148f9fb5 100644
--- a/tests/test_url_utils.py
+++ b/tests/test_url_utils.py
@@ -6,17 +6,14 @@
 
 import json
 
-from ricecooker.utils.url_utils import (
-    derive_local_filename,
-    extract_urls_from_css,
-    extract_urls_from_h5p_json,
-    extract_urls_from_html,
-    is_external_url,
-    rewrite_urls_in_css,
-    rewrite_urls_in_h5p_json,
-    rewrite_urls_in_html,
-)
-
+from ricecooker.utils.url_utils import derive_local_filename
+from ricecooker.utils.url_utils import extract_urls_from_css
+from ricecooker.utils.url_utils import extract_urls_from_h5p_json
+from ricecooker.utils.url_utils import extract_urls_from_html
+from ricecooker.utils.url_utils import is_external_url
+from ricecooker.utils.url_utils import rewrite_urls_in_css
+from ricecooker.utils.url_utils import rewrite_urls_in_h5p_json
+from ricecooker.utils.url_utils import rewrite_urls_in_html
 
 # ---------------------------------------------------------------------------
 # is_external_url tests
@@ -70,9 +67,7 @@ def test_url_with_subdirs(self):
         assert result == "_external/fonts.example.com/v1/fonts/roboto.woff2"
 
     def test_url_with_query(self):
-        result = derive_local_filename(
-            "https://fonts.googleapis.com/css?family=Roboto"
-        )
+        result = derive_local_filename("https://fonts.googleapis.com/css?family=Roboto")
         assert result == "_external/fonts.googleapis.com/css?family=Roboto"
 
     def test_url_root_path(self):
@@ -90,9 +85,7 @@ def test_path_traversal_stripped(self):
         assert "etc/passwd" in result
 
     def test_path_traversal_deep(self):
-        result = derive_local_filename(
-            "https://evil.com/a/../../b/../../../etc/passwd"
-        )
+        result = derive_local_filename("https://evil.com/a/../../b/../../../etc/passwd")
         assert ".." not in result
         assert result.startswith("_external/")
 
@@ -213,7 +206,9 @@ def test_extract_img_srcset(self):
         assert "https://cdn.example.com/img-600.jpg" in extracted
 
     def test_extract_img_srcset_mixed_relative_external(self):
-        html = '<img srcset="img-300.jpg 300w, https://cdn.example.com/img-600.jpg 600w">'
+        html = (
+            '<img srcset="img-300.jpg 300w, https://cdn.example.com/img-600.jpg 600w">'
+        )
         urls = extract_urls_from_html(html, "index.html")
         srcset_urls = [u for u in urls if u.context == "html_srcset"]
         assert len(srcset_urls) == 1
@@ -337,9 +332,7 @@ def test_deeply_nested(self):
                 "level1": {
                     "level2": {
                         "level3": [
-                            {
-                                "path": "https://cdn.example.com/deep/resource.mp4"
-                            }
+                            {"path": "https://cdn.example.com/deep/resource.mp4"}
                         ]
                     }
                 }
@@ -477,9 +470,7 @@ def test_rewrite_link_href(self):
 
     def test_rewrite_script_src(self):
         html = '<script src="https://cdn.example.com/lib.js"></script>'
-        url_map = {
-            "https://cdn.example.com/lib.js": "_external/cdn.example.com/lib.js"
-        }
+        url_map = {"https://cdn.example.com/lib.js": "_external/cdn.example.com/lib.js"}
         result = rewrite_urls_in_html(html, url_map)
         assert "_external/cdn.example.com/lib.js" in result
 
@@ -511,34 +502,22 @@ def test_rewrite_preserves_unmapped(self):
                 "image": {"path": "https://other.com/image.jpg"},
             }
         )
-        url_map = {
-            "https://example.com/video.mp4": "_external/example.com/video.mp4"
-        }
+        url_map = {"https://example.com/video.mp4": "_external/example.com/video.mp4"}
         result = rewrite_urls_in_h5p_json(data, url_map)
         parsed = json.loads(result)
         assert parsed["video"]["path"] == "_external/example.com/video.mp4"
         assert parsed["image"]["path"] == "https://other.com/image.jpg"
 
     def test_rewrite_deeply_nested(self):
-        data = json.dumps(
-            {
-                "a": {
-                    "b": [{"path": "https://example.com/deep.mp4"}]
-                }
-            }
-        )
-        url_map = {
-            "https://example.com/deep.mp4": "_external/example.com/deep.mp4"
-        }
+        data = json.dumps({"a": {"b": [{"path": "https://example.com/deep.mp4"}]}})
+        url_map = {"https://example.com/deep.mp4": "_external/example.com/deep.mp4"}
         result = rewrite_urls_in_h5p_json(data, url_map)
         parsed = json.loads(result)
         assert parsed["a"]["b"][0]["path"] == "_external/example.com/deep.mp4"
 
     def test_rewrite_relative_path_unchanged(self):
         data = json.dumps({"image": {"path": "images/photo.jpg"}})
-        url_map = {
-            "https://example.com/video.mp4": "_external/example.com/video.mp4"
-        }
+        url_map = {"https://example.com/video.mp4": "_external/example.com/video.mp4"}
         result = rewrite_urls_in_h5p_json(data, url_map)
         parsed = json.loads(result)
         assert parsed["image"]["path"] == "images/photo.jpg"