diff --git a/ricecooker/utils/archive_assets.py b/ricecooker/utils/archive_assets.py
new file mode 100644
index 00000000..f464fde4
--- /dev/null
+++ b/ricecooker/utils/archive_assets.py
@@ -0,0 +1,271 @@
+"""
+Archive external reference processor.
+
+Opens an archive (ZIP/H5P), scans text-based files for external URL references,
+downloads those resources, bundles them into the archive, and rewrites references
+to point to local copies.
+"""
+
+import logging
+import os
+import tempfile
+import zipfile
+
+from ricecooker.utils.downloader import make_request
+from ricecooker.utils.url_utils import derive_local_filename
+from ricecooker.utils.url_utils import extract_urls_from_css
+from ricecooker.utils.url_utils import extract_urls_from_h5p_json
+from ricecooker.utils.url_utils import extract_urls_from_html
+from ricecooker.utils.url_utils import rewrite_urls_in_css
+from ricecooker.utils.url_utils import rewrite_urls_in_h5p_json
+from ricecooker.utils.url_utils import rewrite_urls_in_html
+
+logger = logging.getLogger(__name__)
+
+# Maps a lower-cased file extension to the content-type key of its extractor/rewriter
+_TEXT_EXTENSIONS = {
+    ".html": "html",
+    ".htm": "html",
+    ".xhtml": "html",
+    ".xml": "html",
+    ".css": "css",
+    ".json": "json",
+}
+
+
+def _is_h5p_content_json(filepath):
+    """Return True when ``filepath`` is an H5P ``content/content.json`` entry."""
+    target = "content/content.json"
+    posix_path = filepath.replace("\\", "/")
+    # Match the file at the archive root or the same file nested under a directory.
+    return posix_path == target or posix_path.endswith("/" + target)
+
+
+def _detect_content_type(filepath):
+    """Return 'html', 'css' or 'json' for scannable files, None for everything else."""
+    suffix = os.path.splitext(filepath)[1].lower()
+    if suffix != ".json":
+        # Non-JSON files are classified by extension alone (unknown ones give None).
+        return _TEXT_EXTENSIONS.get(suffix)
+    # JSON is only scanned when it is an H5P content.json; other JSON is skipped.
+    return "json" if _is_h5p_content_json(filepath) else None
+
+
+def _compute_relative_path(from_file, to_file):
+    """Return the '/'-separated path from ``from_file``'s directory to ``to_file``."""
+    relative = os.path.relpath(to_file, os.path.dirname(from_file))
+    return relative.replace("\\", "/")
+
+
+def _is_blacklisted(url, blacklist):
+    """Return True if any entry of ``blacklist`` occurs as a substring of ``url``."""
+    # None or an empty blacklist never matches anything.
+    candidates = blacklist or ()
+    return any(entry in url for entry in candidates)
+
+
+def _download_external_url(url, dest_dir, local_path):
+    """
+    Download a single external URL to ``local_path`` inside ``dest_dir``.
+
+    Returns True on success, False on failure.
+    """
+    full_path = os.path.join(dest_dir, local_path)
+    # Guard against path traversal — resolved path must stay within dest_dir
+    resolved = os.path.realpath(full_path)
+    if not resolved.startswith(os.path.realpath(dest_dir) + os.sep):
+        logger.warning("Path traversal detected for %s, skipping download", url)
+        return False
+    try:
+        # makedirs raises OSError if a parent exists as a file; fail only this URL.
+        os.makedirs(os.path.dirname(full_path), exist_ok=True)
+        response = make_request(url)
+        if response is None or response.status_code != 200:
+            logger.warning("Failed to download %s (no response or non-200)", url)
+            return False
+        with open(full_path, "wb") as f:
+            f.write(response.content)
+        return True
+    except (OSError, ValueError):
+        logger.warning("Error downloading %s", url, exc_info=True)
+        return False
+
+
+def _extract_urls_from_file(full_path, rel_path, content_type):
+    """Read one file as UTF-8 and return its external URLs; None means skipped."""
+    try:
+        with open(full_path, "r", encoding="utf-8") as handle:
+            text = handle.read()
+    except (UnicodeDecodeError, OSError):
+        logger.warning("Could not read %s as text, skipping", rel_path)
+        return None
+
+    if content_type == "html":
+        extract = extract_urls_from_html
+    elif content_type == "css":
+        extract = extract_urls_from_css
+    elif content_type == "json":
+        extract = extract_urls_from_h5p_json
+    else:
+        return None
+    return extract(text, rel_path)
+
+
+def _scan_archive_for_urls(temp_dir, url_blacklist):
+    """Scan ``temp_dir`` and return (url -> local path, file -> extracted URLs)."""
+    url_to_local = {}  # url -> derive_local_filename result
+    urls_by_file = {}  # path relative to temp_dir -> list of extracted URLs
+
+    for dirpath, _subdirs, names in os.walk(temp_dir):
+        for name in names:
+            abs_path = os.path.join(dirpath, name)
+            archive_path = os.path.relpath(abs_path, temp_dir)
+            kind = _detect_content_type(archive_path)
+            if kind is None:
+                continue
+
+            found = _extract_urls_from_file(abs_path, archive_path, kind)
+            if found is None:
+                continue
+            kept = [
+                ref for ref in found if not _is_blacklisted(ref.url, url_blacklist)
+            ]
+            if not kept:
+                continue
+            urls_by_file[archive_path] = kept
+            for ref in kept:
+                if ref.url not in url_to_local:
+                    url_to_local[ref.url] = derive_local_filename(ref.url)
+
+    return url_to_local, urls_by_file
+
+
+def _download_all_urls(temp_dir, all_urls, url_blacklist):
+    """Download every URL in ``all_urls``; return the set of URLs that succeeded."""
+    done = set()
+    seen = set()
+
+    # Iterate over a snapshot: _process_downloaded_css may add entries to
+    # all_urls while this loop is running.
+    for url, local_path in list(all_urls.items()):
+        if url in seen:
+            continue
+        seen.add(url)
+
+        if not _download_external_url(url, temp_dir, local_path):
+            continue
+        done.add(url)
+        # Loose stylesheet check: a ".css" suffix, or "css" anywhere before a "?"
+        # (the latter also matches extension-less URLs such as ".../css").
+        if local_path.endswith(".css") or "css" in local_path.split("?")[0]:
+            _process_downloaded_css(
+                temp_dir, local_path, all_urls, done, seen, url_blacklist
+            )
+
+    return done
+
+
+def _rewrite_file(temp_dir, rel_path, url_map):
+    """Apply ``url_map`` in place to the file at ``rel_path`` inside ``temp_dir``."""
+    target = os.path.join(temp_dir, rel_path)
+    kind = _detect_content_type(rel_path)
+
+    with open(target, "r", encoding="utf-8") as source:
+        text = source.read()
+
+    # Dispatch on the detected content type; a file of any other type is
+    # written back with its text unchanged.
+    if kind == "html":
+        text = rewrite_urls_in_html(text, url_map)
+    elif kind == "css":
+        text = rewrite_urls_in_css(text, url_map)
+    elif kind == "json":
+        text = rewrite_urls_in_h5p_json(text, url_map)
+
+    with open(target, "w", encoding="utf-8") as sink:
+        sink.write(text)
+
+
+def download_and_rewrite_external_refs(archive_path, url_blacklist=None):
+    """
+    Process an archive to download external URL references and rewrite them
+    to local paths.
+
+    Args:
+        archive_path: Path to the archive file (ZIP or H5P)
+        url_blacklist: Optional list of URL substrings to skip
+
+    Returns:
+        Path to a temporary directory containing the processed archive contents.
+        The caller must clean it up; on error it is removed here before re-raising.
+    """
+    import shutil  # local import: only needed for cleanup on failure
+    temp_dir = tempfile.mkdtemp(prefix="ricecooker_archive_")
+    try:
+        with zipfile.ZipFile(archive_path, "r") as zf:
+            zf.extractall(temp_dir)
+        all_urls, file_urls = _scan_archive_for_urls(temp_dir, url_blacklist)
+        if not all_urls:
+            return temp_dir
+        successful_downloads = _download_all_urls(temp_dir, all_urls, url_blacklist)
+        for rel_path, extracted_list in file_urls.items():
+            url_map = {
+                e.url: _compute_relative_path(rel_path, all_urls[e.url])
+                for e in extracted_list
+                if e.url in successful_downloads
+            }
+            if url_map:
+                _rewrite_file(temp_dir, rel_path, url_map)
+    except BaseException:
+        # The caller never receives temp_dir on failure, so remove it here.
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        raise
+    return temp_dir
+
+
+def _process_downloaded_css(
+    temp_dir,
+    css_local_path,
+    all_urls,
+    successful_downloads,
+    visited_urls,
+    url_blacklist,
+):
+    """Fetch external URLs referenced by a downloaded CSS file, then rewrite that file."""
+    full_path = os.path.join(temp_dir, css_local_path)
+    try:
+        with open(full_path, "r", encoding="utf-8") as f:
+            css_content = f.read()
+    except (UnicodeDecodeError, OSError):
+        return
+
+    extracted = extract_urls_from_css(css_content, css_local_path)
+    external = [e for e in extracted if not _is_blacklisted(e.url, url_blacklist)]
+
+    if not external:
+        return
+
+    # Download URLs not visited before (they are fetched but not scanned themselves)
+    css_url_map = {}
+    for e in external:
+        if e.url in visited_urls:
+            continue
+        visited_urls.add(e.url)
+
+        local_path = derive_local_filename(e.url)
+        all_urls[e.url] = local_path
+
+        if _download_external_url(e.url, temp_dir, local_path):
+            successful_downloads.add(e.url)
+            css_url_map[e.url] = _compute_relative_path(css_local_path, local_path)
+
+    # Also map URLs that an earlier step already downloaded successfully
+    for e in external:
+        if e.url in successful_downloads and e.url not in css_url_map:
+            local_path = all_urls[e.url]
+            css_url_map[e.url] = _compute_relative_path(css_local_path, local_path)
+
+    if css_url_map:
+        rewritten = rewrite_urls_in_css(css_content, css_url_map)
+        with open(full_path, "w", encoding="utf-8") as f:
+            f.write(rewritten)
diff --git a/ricecooker/utils/pipeline/convert.py b/ricecooker/utils/pipeline/convert.py
index 4c9e209b..faf533cc 100644
--- a/ricecooker/utils/pipeline/convert.py
+++ b/ricecooker/utils/pipeline/convert.py
@@ -2,6 +2,7 @@
 To avoid making the pipeline overly convoluted, these handlers
 both validate and convert files.
 """
+
 import json
 import os
 import shutil
@@ -31,6 +32,7 @@
 from .file_handler import StageHandler
 from ricecooker import config
 from ricecooker.exceptions import UnknownFileTypeError
+from ricecooker.utils.archive_assets import download_and_rewrite_external_refs
 from ricecooker.utils.audio import AudioCompressionError
 from ricecooker.utils.audio import compress_audio
 from ricecooker.utils.caching import generate_key
@@ -48,7 +50,6 @@
 from ricecooker.utils.youtube import get_language_with_alpha2_fallback
 from ricecooker.utils.zip import create_predictable_zip
 
-
 CONVERTIBLE_FORMATS = {p.id: p.convertible_formats for p in format_presets.PRESETLIST}
 
 
@@ -194,11 +195,32 @@ def FILE_TYPE(self) -> str:
     def validate_archive(self, path: str):
         pass
 
+    def _process_external_refs(self, path):
+        """
+        Download and localize external URL references in the archive at ``path``.
+
+        Returns what download_and_rewrite_external_refs returns, or ``path``
+        itself when OSError, zipfile.BadZipFile or ValueError is raised (logged).
+        """
+        try:
+            return download_and_rewrite_external_refs(path)
+        except (OSError, zipfile.BadZipFile, ValueError) as e:
+            config.LOGGER.warning(
+                "Failed to process external references in %s: %s. "
+                "Continuing with original archive.",
+                path,
+                e,
+            )
+            return path
+
     def handle_file(self, path, audio_settings=None, video_settings=None):
         self.validate_archive(path)
 
         ext = extract_path_ext(path)
 
+        # Download external references and get processed directory
+        processed_path = self._process_external_refs(path)
+
         # Create partial for reading & compressing subfiles
         file_converter = partial(
             self._read_and_compress_archive_file,
@@ -208,7 +230,7 @@ def handle_file(self, path, audio_settings=None, video_settings=None):
         )
         # create_predictable_zip will iterate over subfiles, call file_converter
         processed_zip_path = create_predictable_zip(
-            path, file_converter=file_converter if config.COMPRESS else None
+            processed_path, file_converter=file_converter if config.COMPRESS else None
         )
 
         with self.write_file(ext) as fh:
@@ -217,6 +239,8 @@ def handle_file(self, path, audio_settings=None, video_settings=None):
 
         # Clean up
         os.unlink(processed_zip_path)
+        if processed_path != path:
+            shutil.rmtree(processed_path, ignore_errors=True)
 
     @contextmanager
     def open_and_verify_archive(self, path):
diff --git a/ricecooker/utils/url_utils.py b/ricecooker/utils/url_utils.py
new file mode 100644
index 00000000..62055d66
--- /dev/null
+++ b/ricecooker/utils/url_utils.py
@@ -0,0 +1,368 @@
+"""
+Shared URL extraction and rewriting utilities for archive processing.
+
+These functions operate on content strings (HTML, CSS, JSON) — no HTTP,
+no filesystem, no platform-specific paths. They can be used by both
+ricecooker's pipeline and Studio's upload processing.
+
+Supersedes issue #303 by making URL detection/rewriting independently
+unit-testable.
+"""
+
+import json
+import os
+import re
+from dataclasses import dataclass
+from typing import Optional
+from urllib.parse import urlparse
+
+from bs4 import BeautifulSoup
+
+
+@dataclass
+class ExtractedURL:
+    """One external URL reference found while scanning a file's content."""
+
+    url: str  # The URL exactly as it appears in the source
+    source_file: str  # The ``source_file`` value passed to the extractor
+    context: str  # 'html_attr', 'css_url', 'css_import', 'h5p_json', 'html_srcset'
+    tag: Optional[str] = None  # HTML tag name, e.g. 'img', 'link', 'script'
+    attr: Optional[str] = None  # HTML attribute name, e.g. 'src', 'srcset', 'href'
+
+
+# Regex patterns for CSS URL extraction (CSS function names are case-insensitive)
+# Matches url('...'), url("..."), and url(...), allowing padding inside the parens
+_CSS_URL_RE = re.compile(r"url\(\s*['\"]?(.*?)['\"]?\s*\)", re.IGNORECASE)
+# Matches @import '...' and @import "..." (bare string form, not url() form)
+_CSS_IMPORT_RE = re.compile(r"@import\s*['\"]([^'\"]+)['\"]", re.IGNORECASE)
+
+
+def is_external_url(url):
+    """Return True for http(s) URLs with a netloc; malformed URLs count as internal."""
+    try:
+        parsed = urlparse(url)
+    except ValueError:  # e.g. "http://[bad" (unbalanced IPv6 bracket)
+        return False
+    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
+
+
+def derive_local_filename(url):
+    """
+    Derive a deterministic local filename from an external URL.
+
+    A query string is appended with "_" and its characters outside
+    [A-Za-z0-9._=&-] are replaced by "_", e.g. 'https://h/css?a=b%20c' ->
+    '_external/h/css_a=b_20c'; without a query the URL path is kept as is.
+    """
+    parsed = urlparse(url)
+    # Strip path traversal segments to prevent writing outside the archive
+    parts = [p for p in parsed.path.lstrip("/").split("/") if p != ".."]
+    path = "/".join(parts)
+    if parsed.query:
+        # Keep the query in the name, but never as a literal "?"/"%": the result is
+        # also used as a URL reference, where those characters would be re-parsed.
+        path = path + "_" + re.sub(r"[^A-Za-z0-9._=&-]", "_", parsed.query)
+    return os.path.join("_external", parsed.netloc, path)
+
+
+def extract_urls_from_css(css_content, source_file=""):
+    """
+    Collect the external URLs referenced by a piece of CSS.
+
+    Two reference forms are recognised and scanned in this order:
+    - url('...'), url("..."), url(...)  -> context "css_url"
+    - @import '...' and @import "..."   -> context "css_import"
+
+    data: URIs and relative URLs are ignored. Each URL is reported once, under
+    the context of the pass that found it first.
+
+    Returns a list of ExtractedURL instances.
+    """
+    found = []
+    already = set()
+    # (pattern, context) pairs; the url() pass completes before the @import pass.
+    passes = ((_CSS_URL_RE, "css_url"), (_CSS_IMPORT_RE, "css_import"))
+
+    for pattern, context in passes:
+        for match in pattern.finditer(css_content):
+            # group(1) is the captured URL text; surrounding whitespace is dropped.
+            candidate = match.group(1).strip()
+            if not candidate or not is_external_url(candidate):
+                continue
+            if candidate in already:
+                # Already reported, possibly by the earlier pass.
+                continue
+            already.add(candidate)
+            found.append(
+                ExtractedURL(url=candidate, source_file=source_file, context=context)
+            )
+
+    return found
+
+
+def _parse_srcset(srcset_value):
+    """
+    Return the URL part of every candidate in an HTML ``srcset`` value.
+
+    Candidates are comma-separated "url [descriptor]" entries such as
+    "img.jpg 300w"; blank candidates are dropped.
+    """
+    # split() without arguments ignores surrounding whitespace and yields []
+    # for a blank candidate, which the filter below removes.
+    tokenized = (candidate.split() for candidate in srcset_value.split(","))
+    return [tokens[0] for tokens in tokenized if tokens]
+
+
+def _extract_src_urls(soup, source_file, seen, results):
+    """Append external ``src`` URLs of img, script and source tags to ``results``."""
+    for tag_name in ("img", "script", "source"):
+        for element in soup.find_all(tag_name, src=True):
+            src = element["src"]
+            if not is_external_url(src) or src in seen:
+                continue
+            seen.add(src)
+            record = ExtractedURL(
+                url=src,
+                source_file=source_file,
+                context="html_attr",
+                tag=tag_name,
+                attr="src",
+            )
+            results.append(record)
+
+
+def _extract_srcset_urls(soup, source_file, seen, results):
+    """Append external ``srcset`` URLs of img and source tags to ``results``."""
+    for tag_name in ("img", "source"):
+        for element in soup.find_all(tag_name, srcset=True):
+            for candidate in _parse_srcset(element["srcset"]):
+                if not is_external_url(candidate) or candidate in seen:
+                    continue
+                seen.add(candidate)
+                record = ExtractedURL(
+                    url=candidate,
+                    source_file=source_file,
+                    context="html_srcset",
+                    tag=tag_name,
+                    attr="srcset",
+                )
+                results.append(record)
+
+
+def _extract_stylesheet_urls(soup, source_file, seen, results):
+    """Append external ``href`` URLs of link[rel=stylesheet] tags to ``results``."""
+    for link in soup.find_all("link", href=True):
+        href = link["href"]
+        is_stylesheet = "rel" in link.attrs and "stylesheet" in link.get("rel", [])
+        if not is_stylesheet or not is_external_url(href) or href in seen:
+            continue
+        seen.add(href)
+        record = ExtractedURL(
+            url=href,
+            source_file=source_file,
+            context="html_attr",
+            tag="link",
+            attr="href",
+        )
+        results.append(record)
+
+
+def _extract_style_urls(soup, source_file, seen, results):
+    """
+    Append external URLs found in ``style`` attributes and <style> blocks.
+    """
+    # Gather the CSS fragments first: every inline style attribute, followed by
+    # each <style> element whose ``.string`` is truthy.
+    fragments = [element.get("style", "") for element in soup.find_all(style=True)]
+    fragments += [block.string for block in soup.find_all("style") if block.string]
+
+    for css_text in fragments:
+        for ref in extract_urls_from_css(css_text, source_file):
+            if ref.url in seen:
+                continue
+            seen.add(ref.url)
+            results.append(ref)
+
+
+def extract_urls_from_html(html_content, source_file=""):
+    """
+    Collect the external URLs referenced by an HTML document.
+
+    Sources are visited in this order:
+    - img[src], script[src], source[src]
+    - img[srcset], source[srcset]
+    - link[rel=stylesheet][href]
+    - inline style attributes with url(), then <style> blocks
+
+    data: URIs and relative URLs are skipped; empty or blank input gives [].
+    Returns a list of ExtractedURL instances.
+    """
+    if not (html_content and html_content.strip()):
+        return []
+    soup = BeautifulSoup(html_content, "html.parser")
+    results = []
+    seen = set()
+    collectors = (
+        _extract_src_urls,
+        _extract_srcset_urls,
+        _extract_stylesheet_urls,
+        _extract_style_urls,
+    )
+    for collect in collectors:
+        collect(soup, source_file, seen, results)
+    return results
+
+
+def extract_urls_from_h5p_json(json_content, source_file=""):
+    """
+    Collect external URLs stored under "path" keys of an H5P JSON document.
+
+    The parsed tree is walked depth-first. Content that is not valid JSON
+    yields an empty list.
+    """
+    try:
+        document = json.loads(json_content)
+    except (json.JSONDecodeError, TypeError):
+        return []
+
+    found = []
+
+    def _visit(node):
+        if isinstance(node, list):
+            for item in node:
+                _visit(item)
+            return
+        if not isinstance(node, dict):
+            return
+        for key, value in node.items():
+            if key == "path" and isinstance(value, str) and is_external_url(value):
+                found.append(
+                    ExtractedURL(url=value, source_file=source_file, context="h5p_json")
+                )
+            else:
+                _visit(value)
+
+    _visit(document)
+    return found
+
+
+def rewrite_urls_in_css(css_content, url_map):
+    """
+    Replace mapped URLs inside CSS text.
+
+    Both url(...) references and bare-string @import rules are handled, and a
+    rewritten reference is always emitted single-quoted. URLs that are not
+    keys of ``url_map`` are left exactly as written.
+    """
+
+    def _substituter(template):
+        # Build a re.sub callback that formats the mapped URL into ``template``.
+        def _replace(match):
+            key = match.group(1).strip()
+            if key not in url_map:
+                # Unmapped: return the whole match so the text is unchanged.
+                return match.group(0)
+            return template.format(url_map[key])
+
+        return _replace
+
+    # url() references first, then @import strings on the intermediate result.
+    url_pass = _CSS_URL_RE.sub(_substituter("url('{}')"), css_content)
+    import_pass = _CSS_IMPORT_RE.sub(_substituter("@import '{}'"), url_pass)
+
+    return import_pass
+
+
+def _rewrite_src_attrs(soup, url_map):
+    """Replace mapped ``src`` values on img, script and source tags."""
+    for tag_name in ("img", "script", "source"):
+        for element in soup.find_all(tag_name, src=True):
+            current = element["src"]
+            if current in url_map:
+                element["src"] = url_map[current]
+
+
+def _rewrite_srcset_attrs(soup, url_map):
+    """Replace mapped URLs in ``srcset`` attributes of img and source tags."""
+    for tag_name in ("img", "source"):
+        for element in soup.find_all(tag_name, srcset=True):
+            rebuilt = []
+            for candidate in element["srcset"].split(","):
+                tokens = candidate.split()
+                if not tokens:
+                    continue
+                # tokens[0] is the URL; any descriptor ("2x", "300w") is kept as-is.
+                tokens[0] = url_map.get(tokens[0], tokens[0])
+                rebuilt.append(" ".join(tokens))
+            # The attribute is always re-serialised with ", " between candidates.
+            element["srcset"] = ", ".join(rebuilt)
+
+
+def _rewrite_stylesheet_hrefs(soup, url_map):
+    """Replace mapped ``href`` values on link[rel=stylesheet] tags."""
+    for link in soup.find_all("link", href=True):
+        if "rel" not in link.attrs or "stylesheet" not in link.get("rel", []):
+            continue
+        if link["href"] in url_map:
+            link["href"] = url_map[link["href"]]
+
+
+def _rewrite_style_content(soup, url_map):
+    """Replace mapped URLs inside ``style`` attributes and <style> blocks."""
+    for element in soup.find_all(style=True):
+        element["style"] = rewrite_urls_in_css(element.get("style", ""), url_map)
+
+    for block in soup.find_all("style"):
+        css_text = block.string
+        if css_text:
+            block.string = rewrite_urls_in_css(css_text, url_map)
+
+
+def rewrite_urls_in_html(html_content, url_map):
+    """
+    Replace mapped URLs inside an HTML document.
+
+    Covers the same places as extract_urls_from_html: img/script/source src,
+    img/source srcset, link[stylesheet] href, inline styles and <style> blocks.
+    URLs missing from ``url_map`` are left unchanged.
+    """
+    if not (html_content and html_content.strip()):
+        return html_content
+
+    soup = BeautifulSoup(html_content, "html.parser")
+    rewriters = (
+        _rewrite_src_attrs,
+        _rewrite_srcset_attrs,
+        _rewrite_stylesheet_hrefs,
+        _rewrite_style_content,
+    )
+    for rewrite in rewriters:
+        rewrite(soup, url_map)
+    return str(soup)
+
+
+def rewrite_urls_in_h5p_json(json_content, url_map):
+    """
+    Replace mapped URLs stored under "path" keys of an H5P JSON document.
+
+    Invalid JSON is returned unchanged; valid JSON is re-serialised with json.dumps.
+    """
+    try:
+        document = json.loads(json_content)
+    except (json.JSONDecodeError, TypeError):
+        return json_content
+
+    def _visit(node):
+        if isinstance(node, list):
+            for item in node:
+                _visit(item)
+        elif isinstance(node, dict):
+            for key in node:
+                current = node[key]
+                if key == "path" and isinstance(current, str) and current in url_map:
+                    node[key] = url_map[current]
+                else:
+                    _visit(current)
+
+    _visit(document)
+    return json.dumps(document)
diff --git a/tests/pipeline/test_convert.py b/tests/pipeline/test_convert.py
index abbdcf99..04043156 100644
--- a/tests/pipeline/test_convert.py
+++ b/tests/pipeline/test_convert.py
@@ -1,13 +1,27 @@
 """Tests for audio and video compression in archive files."""
+
+import json
 import os
 import tempfile
 import zipfile
 from unittest.mock import patch
 
+from ricecooker import config
 from ricecooker.classes.files import H5PFile
 from ricecooker.classes.files import HTMLZipFile
 
 
+class MockResponse:
+    """Minimal stand-in for the response returned by the patched ``make_request``."""
+
+    def __init__(self, content=b"downloaded content", status_code=200):
+        self.content = content
+        self.status_code = status_code
+
+    def raise_for_status(self):
+        pass
+
+
 def test_html5_archive_with_mp4_compression(video_file, audio_file):
     """Test that MP4 and MP3 files within HTML5 archives are compressed when compression is enabled."""
     # Create temporary HTML5 archive with media files
@@ -120,3 +134,128 @@ def test_archive_no_compression_when_disabled(video_file, audio_file):
 
     finally:
         os.unlink(temp_archive.name)
+
+
+def test_html5_archive_external_refs_downloaded():
+    """An external img src in an HTML5 zip is stored under _external/ and rewritten."""
+    temp_archive = tempfile.NamedTemporaryFile(suffix=".zip", delete=False)
+    temp_archive.close()
+
+    try:
+        with zipfile.ZipFile(temp_archive.name, "w") as zf:
+            zf.writestr(
+                "index.html",
+                '<html><body>Content here <img src="https://cdn.example.com/photo.jpg"></body></html>',
+            )
+
+        with patch("ricecooker.utils.archive_assets.make_request") as mock_request:
+            mock_request.return_value = MockResponse(content=b"fake-image-data")
+
+            with patch("ricecooker.config.COMPRESS", False):
+                html_file = HTMLZipFile(temp_archive.name)
+                result = html_file.process_file()
+
+        assert result is not None, "Processing should succeed"
+
+        # Verify the output ZIP contains the downloaded file and rewritten HTML
+        result_path = config.get_storage_path(result)
+        with zipfile.ZipFile(result_path, "r") as zf:
+            names = zf.namelist()
+            assert any(
+                "_external/" in n for n in names
+            ), f"Expected _external/ directory in output ZIP, got: {names}"
+
+            html = zf.read("index.html").decode("utf-8")
+            assert (
+                "https://cdn.example.com/photo.jpg" not in html
+            ), "External URL should be rewritten"
+            assert "_external/" in html, "Should reference local _external/ path"
+
+    finally:
+        os.unlink(temp_archive.name)
+
+
+def test_h5p_archive_external_video_downloaded():
+    """An external "path" in H5P content.json is stored under _external/ and rewritten."""
+    temp_archive = tempfile.NamedTemporaryFile(suffix=".h5p", delete=False)
+    temp_archive.close()
+
+    content = json.dumps(
+        {
+            "video": {
+                "files": [
+                    {
+                        "path": "https://h5p.org/sites/default/files/h5p/iv.mp4",
+                        "mime": "video/mp4",
+                    }
+                ]
+            }
+        }
+    )
+
+    try:
+        with zipfile.ZipFile(temp_archive.name, "w") as zf:
+            zf.writestr("h5p.json", '{"mainLibrary": "H5P.InteractiveVideo"}')
+            zf.writestr("content/content.json", content)
+
+        with patch("ricecooker.utils.archive_assets.make_request") as mock_request:
+            mock_request.return_value = MockResponse(content=b"fake-video-data")
+
+            with patch("ricecooker.config.COMPRESS", False):
+                h5p_file = H5PFile(temp_archive.name)
+                result = h5p_file.process_file()
+
+        assert result is not None, "Processing should succeed"
+
+        result_path = config.get_storage_path(result)
+        with zipfile.ZipFile(result_path, "r") as zf:
+            names = zf.namelist()
+            assert any(
+                "_external/" in n for n in names
+            ), f"Expected _external/ directory in output ZIP, got: {names}"
+
+            data = json.loads(zf.read("content/content.json"))
+            video_path = data["video"]["files"][0]["path"]
+            assert (
+                "https://h5p.org" not in video_path
+            ), "External URL should be rewritten"
+            assert "_external/" in video_path, "Should reference local path"
+
+    finally:
+        os.unlink(temp_archive.name)
+
+
+def test_archive_external_refs_failure_graceful():
+    """If external-ref processing raises OSError, the archive is packaged unchanged."""
+    temp_archive = tempfile.NamedTemporaryFile(suffix=".zip", delete=False)
+    temp_archive.close()
+
+    try:
+        with zipfile.ZipFile(temp_archive.name, "w") as zf:
+            # Use unique content to avoid cache hit from previous test
+            zf.writestr(
+                "index.html",
+                '<html><body>Unique failure test content <img src="https://cdn.example.com/photo.jpg"></body></html>',
+            )
+
+        with patch(
+            "ricecooker.utils.pipeline.convert.download_and_rewrite_external_refs"
+        ) as mock_process:
+            mock_process.side_effect = OSError("Network error")
+
+            with patch("ricecooker.config.COMPRESS", False):
+                html_file = HTMLZipFile(temp_archive.name)
+                result = html_file.process_file()
+
+        assert (
+            result is not None
+        ), "Processing should succeed even on external ref failure"
+
+        # Original URL should be preserved since processing failed
+        result_path = config.get_storage_path(result)
+        with zipfile.ZipFile(result_path, "r") as zf:
+            html = zf.read("index.html").decode("utf-8")
+            assert "https://cdn.example.com/photo.jpg" in html
+
+    finally:
+        os.unlink(temp_archive.name)
diff --git a/tests/test_archive_assets.py b/tests/test_archive_assets.py
new file mode 100644
index 00000000..46e3ddb7
--- /dev/null
+++ b/tests/test_archive_assets.py
@@ -0,0 +1,480 @@
+"""
+Tests for ricecooker.utils.archive_assets — archive external reference processor.
+
+Tests create temporary ZIP archives on disk, call download_and_rewrite_external_refs,
+and verify the output directory contents. HTTP downloads are mocked.
+"""
+
+import json
+import os
+import shutil
+import tempfile
+import zipfile
+from unittest.mock import patch
+
+import pytest
+
+from ricecooker.utils.archive_assets import download_and_rewrite_external_refs
+
+
+class MockResponse:
+    """Stand-in HTTP response exposing ``content`` and ``status_code``."""
+
+    def __init__(self, content=b"downloaded content", status_code=200):
+        self.status_code = status_code
+        self.content = content
+
+    def raise_for_status(self):
+        """Raise ``HTTPError`` when the status code is 400 or above."""
+        if self.status_code >= 400:
+            from requests.exceptions import HTTPError
+            raise HTTPError(response=self)
+
+
+def _create_zip(files_dict):
+    """Build a temporary ZIP from {archive path: str or bytes}; return its path."""
+    handle, archive_path = tempfile.mkstemp(suffix=".zip")
+    os.close(handle)
+    with zipfile.ZipFile(archive_path, "w") as archive:
+        for member, payload in files_dict.items():
+            # Text payloads are stored as their UTF-8 encoding
+            raw = payload.encode("utf-8") if isinstance(payload, str) else payload
+            archive.writestr(member, raw)
+    return archive_path
+
+
+@pytest.fixture
+def mock_download():
+    """Patch archive_assets.make_request so each call returns one canned response."""
+    with patch("ricecooker.utils.archive_assets.make_request") as fake_request:
+        fake_request.return_value = MockResponse(content=b"downloaded content")
+        yield fake_request
+
+
+@pytest.fixture
+def mock_download_css_then_font():
+    """Mock make_request by URL: a stylesheet, the font it references, else a 404."""
+    stylesheet = b"@font-face { src: url('https://fonts.example.com/roboto.woff2') }"
+    font_bytes = b"font-binary-data"
+
+    canned = {
+        "https://fonts.googleapis.com/css": MockResponse(content=stylesheet),
+        "https://fonts.example.com/roboto.woff2": MockResponse(content=font_bytes),
+    }
+
+    def respond(url, *args, **kwargs):
+        return canned.get(url, MockResponse(content=b"unknown", status_code=404))
+
+    with patch("ricecooker.utils.archive_assets.make_request") as fake_request:
+        fake_request.side_effect = respond
+        yield fake_request
+
+
+# ---------------------------------------------------------------------------
+# Basic functionality tests
+# ---------------------------------------------------------------------------
+
+
+class TestBasicFunctionality:
+    def test_html_with_external_img(self, mock_download):
+        archive = _create_zip(
+            {
+                "index.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>'
+            }
+        )
+        try:
+            out_dir = download_and_rewrite_external_refs(archive)
+            try:
+                # The image is saved beneath _external/<host>/
+                saved_img = os.path.join(
+                    out_dir, "_external", "cdn.example.com", "photo.jpg"
+                )
+                assert os.path.exists(saved_img)
+
+                # The page now points at the local copy instead of the CDN
+                with open(os.path.join(out_dir, "index.html")) as page:
+                    markup = page.read()
+                assert "https://cdn.example.com/photo.jpg" not in markup
+                assert "_external/cdn.example.com/photo.jpg" in markup
+
+                mock_download.assert_called_once()
+            finally:
+                shutil.rmtree(out_dir, ignore_errors=True)
+        finally:
+            os.unlink(archive)
+
+    def test_html_with_external_css(self, mock_download):
+        archive = _create_zip(
+            {
+                "index.html": '<html><head><link rel="stylesheet" href="https://fonts.googleapis.com/css"></head></html>'
+            }
+        )
+        try:
+            out_dir = download_and_rewrite_external_refs(archive)
+            try:
+                saved_css = os.path.join(
+                    out_dir, "_external", "fonts.googleapis.com", "css"
+                )
+                assert os.path.exists(saved_css)
+
+                with open(os.path.join(out_dir, "index.html")) as page:
+                    markup = page.read()
+                assert "https://fonts.googleapis.com/css" not in markup
+                assert "_external/fonts.googleapis.com/css" in markup
+            finally:
+                shutil.rmtree(out_dir, ignore_errors=True)
+        finally:
+            os.unlink(archive)
+
+    def test_html_with_external_script(self, mock_download):
+        archive = _create_zip(
+            {
+                "index.html": '<html><head><script src="https://cdn.example.com/lib.js"></script></head></html>'
+            }
+        )
+        try:
+            out_dir = download_and_rewrite_external_refs(archive)
+            try:
+                saved_js = os.path.join(
+                    out_dir, "_external", "cdn.example.com", "lib.js"
+                )
+                assert os.path.exists(saved_js)
+
+                with open(os.path.join(out_dir, "index.html")) as page:
+                    markup = page.read()
+                assert "_external/cdn.example.com/lib.js" in markup
+            finally:
+                shutil.rmtree(out_dir, ignore_errors=True)
+        finally:
+            os.unlink(archive)
+
+    def test_css_with_external_font(self, mock_download):
+        archive = _create_zip(
+            {
+                "styles/main.css": "@font-face { src: url('https://fonts.example.com/roboto.woff2') format('woff2'); }"
+            }
+        )
+        try:
+            out_dir = download_and_rewrite_external_refs(archive)
+            try:
+                saved_font = os.path.join(
+                    out_dir,
+                    "_external",
+                    "fonts.example.com",
+                    "roboto.woff2",
+                )
+                assert os.path.exists(saved_font)
+
+                with open(os.path.join(out_dir, "styles", "main.css")) as sheet:
+                    rules = sheet.read()
+                assert "https://fonts.example.com/roboto.woff2" not in rules
+                # The stylesheet sits in styles/, so the reference climbs one level
+                assert "../_external/fonts.example.com/roboto.woff2" in rules
+            finally:
+                shutil.rmtree(out_dir, ignore_errors=True)
+        finally:
+            os.unlink(archive)
+
+    def test_css_with_import(self, mock_download):
+        archive = _create_zip(
+            {"style.css": "@import 'https://fonts.googleapis.com/css?family=Roboto';"}
+        )
+        try:
+            out_dir = download_and_rewrite_external_refs(archive)
+            try:
+                saved_css = os.path.join(
+                    out_dir,
+                    "_external",
+                    "fonts.googleapis.com",
+                    "css?family=Roboto",
+                )
+                assert os.path.exists(saved_css)
+
+                with open(os.path.join(out_dir, "style.css")) as sheet:
+                    rules = sheet.read()
+                assert "https://fonts.googleapis.com" not in rules
+            finally:
+                shutil.rmtree(out_dir, ignore_errors=True)
+        finally:
+            os.unlink(archive)
+
+    def test_h5p_with_external_video(self, mock_download):
+        h5p_content = json.dumps(
+            {
+                "video": {
+                    "files": [
+                        {
+                            "path": "https://h5p.org/sites/default/files/h5p/iv.mp4",
+                            "mime": "video/mp4",
+                        }
+                    ]
+                }
+            }
+        )
+        archive = _create_zip(
+            {
+                "content/content.json": h5p_content,
+                "h5p.json": '{"mainLibrary": "H5P.InteractiveVideo"}',
+            }
+        )
+        try:
+            out_dir = download_and_rewrite_external_refs(archive)
+            try:
+                saved_video = os.path.join(
+                    out_dir,
+                    "_external",
+                    "h5p.org",
+                    "sites",
+                    "default",
+                    "files",
+                    "h5p",
+                    "iv.mp4",
+                )
+                assert os.path.exists(saved_video)
+
+                with open(os.path.join(out_dir, "content", "content.json")) as fh:
+                    parsed = json.load(fh)
+                rewritten = parsed["video"]["files"][0]["path"]
+                assert "https://h5p.org" not in rewritten
+                assert "_external/" in rewritten
+            finally:
+                shutil.rmtree(out_dir, ignore_errors=True)
+        finally:
+            os.unlink(archive)
+
+
+# ---------------------------------------------------------------------------
+# Edge case tests
+# ---------------------------------------------------------------------------
+
+
+class TestEdgeCases:
+    """Edge cases: untouched URLs, failed downloads, dedup, recursion, unsafe paths."""
+
+    def test_relative_urls_unchanged(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><body><img src="images/photo.jpg"></body></html>',
+                "images/photo.jpg": b"fake-image-data",
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                with open(os.path.join(result_dir, "index.html")) as f:
+                    html = f.read()
+                assert 'src="images/photo.jpg"' in html
+                mock_download.assert_not_called()
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_data_urls_unchanged(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><body><img src="data:image/png;base64,abc123"></body></html>'
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                with open(os.path.join(result_dir, "index.html")) as f:
+                    html = f.read()
+                assert "data:image/png;base64,abc123" in html
+                mock_download.assert_not_called()
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_download_failure_preserves_original(self):
+        with patch("ricecooker.utils.archive_assets.make_request") as mock:
+            mock.return_value = None  # Simulate failed download
+            zip_path = _create_zip(
+                {
+                    "index.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>'
+                }
+            )
+            try:
+                result_dir = download_and_rewrite_external_refs(zip_path)
+                try:
+                    with open(os.path.join(result_dir, "index.html")) as f:
+                        html = f.read()
+                    # Original URL should be preserved when download fails
+                    assert "https://cdn.example.com/photo.jpg" in html
+                finally:
+                    shutil.rmtree(result_dir, ignore_errors=True)
+            finally:
+                os.unlink(zip_path)
+
+    def test_duplicate_urls_downloaded_once(self, mock_download):
+        zip_path = _create_zip(
+            {
+                "page1.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>',
+                "page2.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>',
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                # Download should only happen once for the same URL
+                mock_download.assert_called_once()
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_css_recursive_download(self, mock_download_css_then_font):
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><head><link rel="stylesheet" href="https://fonts.googleapis.com/css"></head></html>'
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                # Both CSS and font should be downloaded
+                css_path = os.path.join(
+                    result_dir, "_external", "fonts.googleapis.com", "css"
+                )
+                font_path = os.path.join(
+                    result_dir, "_external", "fonts.example.com", "roboto.woff2"
+                )
+                assert os.path.exists(css_path)
+                assert os.path.exists(font_path)
+
+                # The downloaded CSS should have its font URL rewritten too
+                with open(css_path) as f:
+                    css = f.read()
+                assert "https://fonts.example.com/roboto.woff2" not in css
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_empty_archive(self, mock_download):
+        zip_path = _create_zip({"empty.txt": ""})
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                assert os.path.isdir(result_dir)
+                mock_download.assert_not_called()
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_binary_files_untouched(self, mock_download):
+        binary_content = b"\x89PNG\r\n\x1a\n" + b"\x00" * 100
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>',
+                "images/local.png": binary_content,
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                with open(os.path.join(result_dir, "images", "local.png"), "rb") as f:
+                    assert f.read() == binary_content
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_path_traversal_url_stays_in_temp_dir(self, mock_download):
+        """A URL with ``../`` segments must still be stored under ``_external/``."""
+        zip_path = _create_zip(
+            {
+                "index.html": '<html><body><img src="https://evil.com/../../../etc/passwd"></body></html>'
+            }
+        )
+        try:
+            result_dir = download_and_rewrite_external_refs(zip_path)
+            try:
+                # Walking result_dir can never reveal an escaped file, so also
+                # require that the download was actually stored under _external/.
+                root = os.path.realpath(os.path.join(result_dir, "_external"))
+                saved = [os.path.join(d, f) for d, _, fs in os.walk(root) for f in fs]
+                assert saved, "Expected the download under _external/"
+                for path in saved:
+                    assert os.path.commonpath([root, os.path.realpath(path)]) == root
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+    def test_blacklisted_urls_skipped(self, mock_download):
+        zip_path = _create_zip({"index.html": """<html><body>
+                <img src="https://cdn.example.com/photo.jpg">
+                <img src="https://blocked.example.com/img.jpg">
+                </body></html>"""})
+        try:
+            result_dir = download_and_rewrite_external_refs(
+                zip_path, url_blacklist=["blocked.example.com"]
+            )
+            try:
+                with open(os.path.join(result_dir, "index.html")) as f:
+                    html = f.read()
+                # Allowed URL should be downloaded and rewritten
+                assert "_external/cdn.example.com/photo.jpg" in html
+                # Blocked URL should remain unchanged
+                assert "https://blocked.example.com/img.jpg" in html
+            finally:
+                shutil.rmtree(result_dir, ignore_errors=True)
+        finally:
+            os.unlink(zip_path)
+
+
+# ---------------------------------------------------------------------------
+# Integration shape tests
+# ---------------------------------------------------------------------------
+
+
+class TestIntegrationShape:
+    def test_returns_directory_path(self, mock_download):
+        archive = _create_zip({"index.html": "<html><body></body></html>"})
+        try:
+            out_dir = download_and_rewrite_external_refs(archive)
+            try:
+                assert os.path.isdir(out_dir)
+            finally:
+                shutil.rmtree(out_dir, ignore_errors=True)
+        finally:
+            os.unlink(archive)
+
+    def test_original_files_preserved(self, mock_download):
+        archive = _create_zip(
+            {
+                "index.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>',
+                "images/local.png": b"png-data",
+                "scripts/app.js": "console.log('hello');",
+            }
+        )
+        try:
+            out_dir = download_and_rewrite_external_refs(archive)
+            try:
+                assert os.path.exists(os.path.join(out_dir, "index.html"))
+                assert os.path.exists(os.path.join(out_dir, "images", "local.png"))
+                assert os.path.exists(os.path.join(out_dir, "scripts", "app.js"))
+            finally:
+                shutil.rmtree(out_dir, ignore_errors=True)
+        finally:
+            os.unlink(archive)
+
+    def test_external_files_in_subdirectory(self, mock_download):
+        archive = _create_zip(
+            {
+                "index.html": '<html><body><img src="https://cdn.example.com/photo.jpg"></body></html>'
+            }
+        )
+        try:
+            out_dir = download_and_rewrite_external_refs(archive)
+            try:
+                bundled_root = os.path.join(out_dir, "_external")
+                assert os.path.isdir(bundled_root)
+            finally:
+                shutil.rmtree(out_dir, ignore_errors=True)
+        finally:
+            os.unlink(archive)
diff --git a/tests/test_url_utils.py b/tests/test_url_utils.py
new file mode 100644
index 00000000..148f9fb5
--- /dev/null
+++ b/tests/test_url_utils.py
@@ -0,0 +1,529 @@
+"""
+Tests for ricecooker.utils.url_utils — shared URL extraction and rewriting.
+
+All tests operate on plain strings. No HTTP, no filesystem, no archives.
+"""
+
+import json
+
+from ricecooker.utils.url_utils import derive_local_filename
+from ricecooker.utils.url_utils import extract_urls_from_css
+from ricecooker.utils.url_utils import extract_urls_from_h5p_json
+from ricecooker.utils.url_utils import extract_urls_from_html
+from ricecooker.utils.url_utils import is_external_url
+from ricecooker.utils.url_utils import rewrite_urls_in_css
+from ricecooker.utils.url_utils import rewrite_urls_in_h5p_json
+from ricecooker.utils.url_utils import rewrite_urls_in_html
+
+# ---------------------------------------------------------------------------
+# is_external_url tests
+# ---------------------------------------------------------------------------
+
+
+class TestIsExternalURL:
+    def test_http_url(self):
+        assert is_external_url("http://example.com/file.js") is True  # http scheme
+
+    def test_https_url(self):
+        assert is_external_url("https://example.com/file.js") is True  # https scheme
+
+    def test_relative_path(self):
+        assert is_external_url("images/photo.jpg") is False  # no scheme at all
+
+    def test_data_uri(self):
+        assert is_external_url("data:image/png;base64,abc123") is False  # inline data
+
+    def test_fragment_only(self):
+        assert is_external_url("#section") is False  # fragment without a URL
+
+    def test_empty_string(self):
+        assert is_external_url("") is False  # empty input
+
+    def test_protocol_relative(self):
+        # //cdn.example.com/file.js — no scheme, so not classified as external
+        assert is_external_url("//cdn.example.com/file.js") is False
+
+    def test_mailto(self):
+        assert is_external_url("mailto:user@example.com") is False  # mailto scheme
+
+    def test_javascript_uri(self):
+        assert is_external_url("javascript:void(0)") is False  # javascript scheme
+
+
+# ---------------------------------------------------------------------------
+# derive_local_filename tests
+# ---------------------------------------------------------------------------
+
+
+class TestDeriveLocalFilename:
+    def test_simple_url(self):
+        local = derive_local_filename("https://cdn.example.com/image.png")
+        assert local == "_external/cdn.example.com/image.png"
+
+    def test_url_with_subdirs(self):
+        # Every path segment of the URL is kept below the host directory
+        remote = "https://fonts.example.com/v1/fonts/roboto.woff2"
+        local = derive_local_filename(remote)
+        assert local == "_external/fonts.example.com/v1/fonts/roboto.woff2"
+
+    def test_url_with_query(self):
+        local = derive_local_filename("https://fonts.googleapis.com/css?family=Roboto")
+        assert local == "_external/fonts.googleapis.com/css?family=Roboto"
+
+    def test_url_root_path(self):
+        local = derive_local_filename("https://example.com/")
+        assert local == "_external/example.com/"
+
+    def test_starts_with_external_prefix(self):
+        local = derive_local_filename("https://example.com/anything")
+        assert local.startswith("_external/")
+
+    def test_path_traversal_stripped(self):
+        local = derive_local_filename("https://evil.com/../../../etc/passwd")
+        assert local.startswith("_external/")
+        assert ".." not in local
+        assert "etc/passwd" in local
+
+    def test_path_traversal_deep(self):
+        local = derive_local_filename("https://evil.com/a/../../b/../../../etc/passwd")
+        assert local.startswith("_external/")
+        assert ".." not in local
+
+
+# ---------------------------------------------------------------------------
+# extract_urls_from_css tests
+# ---------------------------------------------------------------------------
+
+
+class TestExtractUrlsFromCSS:
+    def test_extract_css_url_single_quotes(self):
+        sheet = "body { background: url('https://example.com/bg.png') }"
+        refs = extract_urls_from_css(sheet, "style.css")
+        assert len(refs) == 1
+        assert refs[0].url == "https://example.com/bg.png"
+        assert refs[0].context == "css_url"
+        assert refs[0].source_file == "style.css"
+
+    def test_extract_css_url_double_quotes(self):
+        sheet = 'body { background: url("https://example.com/bg.png") }'
+        refs = extract_urls_from_css(sheet, "style.css")
+        assert len(refs) == 1
+        assert refs[0].url == "https://example.com/bg.png"
+
+    def test_extract_css_url_no_quotes(self):
+        sheet = "body { background: url(https://example.com/bg.png) }"
+        refs = extract_urls_from_css(sheet, "style.css")
+        assert len(refs) == 1
+        assert refs[0].url == "https://example.com/bg.png"
+
+    def test_extract_css_import_bare_string_single_quotes(self):
+        sheet = "@import 'https://fonts.googleapis.com/css?family=Roboto';"
+        refs = extract_urls_from_css(sheet, "style.css")
+        assert len(refs) == 1
+        assert refs[0].url == "https://fonts.googleapis.com/css?family=Roboto"
+        assert refs[0].context == "css_import"
+
+    def test_extract_css_import_bare_string_double_quotes(self):
+        sheet = '@import "https://fonts.googleapis.com/css?family=Roboto";'
+        refs = extract_urls_from_css(sheet, "style.css")
+        assert len(refs) == 1
+        assert refs[0].url == "https://fonts.googleapis.com/css?family=Roboto"
+
+    def test_extract_css_import_url_form(self):
+        """@import url('...') is reported with the css_url context."""
+        sheet = "@import url('https://fonts.googleapis.com/css');"
+        refs = extract_urls_from_css(sheet, "style.css")
+        assert len(refs) == 1
+        assert refs[0].url == "https://fonts.googleapis.com/css"
+        assert refs[0].context == "css_url"
+
+    def test_extract_css_font_face(self):
+        sheet = """
+        @font-face {
+            font-family: 'Roboto';
+            src: url('https://fonts.example.com/roboto.woff2') format('woff2');
+        }
+        """
+        refs = extract_urls_from_css(sheet, "style.css")
+        assert len(refs) == 1
+        assert refs[0].url == "https://fonts.example.com/roboto.woff2"
+
+    def test_css_data_urls_ignored(self):
+        sheet = "body { background: url(data:image/png;base64,abc123) }"
+        refs = extract_urls_from_css(sheet, "style.css")
+        assert len(refs) == 0
+
+    def test_css_relative_urls_ignored(self):
+        sheet = "body { background: url('../images/bg.png') }"
+        refs = extract_urls_from_css(sheet, "style.css")
+        assert len(refs) == 0
+
+    def test_multiple_urls(self):
+        sheet = """
+        body { background: url('https://example.com/bg1.png') }
+        .header { background: url('https://example.com/bg2.png') }
+        """
+        refs = extract_urls_from_css(sheet, "style.css")
+        assert len(refs) == 2
+        seen = {ref.url for ref in refs}
+        assert "https://example.com/bg1.png" in seen
+        assert "https://example.com/bg2.png" in seen
+
+    def test_empty_css(self):
+        refs = extract_urls_from_css("", "style.css")
+        assert len(refs) == 0
+
+    def test_no_duplicate_for_import_url_form(self):
+        """@import url('...') yields a single reference, not a duplicated one."""
+        sheet = "@import url('https://fonts.googleapis.com/css');"
+        refs = extract_urls_from_css(sheet, "style.css")
+        assert len(refs) == 1
+
+
+# ---------------------------------------------------------------------------
+# extract_urls_from_html tests
+# ---------------------------------------------------------------------------
+
+
+class TestExtractUrlsFromHTML:
+    def test_extract_img_src(self):
+        markup = '<img src="https://cdn.example.com/photo.jpg">'
+        refs = extract_urls_from_html(markup, "index.html")
+        assert len(refs) == 1
+        assert refs[0].url == "https://cdn.example.com/photo.jpg"
+        assert refs[0].context == "html_attr"
+        assert refs[0].tag == "img"
+        assert refs[0].attr == "src"
+        assert refs[0].source_file == "index.html"
+
+    def test_extract_img_srcset(self):
+        markup = '<img srcset="https://cdn.example.com/img-300.jpg 300w, https://cdn.example.com/img-600.jpg 600w">'
+        refs = extract_urls_from_html(markup, "index.html")
+        from_srcset = [ref for ref in refs if ref.context == "html_srcset"]
+        assert len(from_srcset) == 2
+        seen = {ref.url for ref in from_srcset}
+        assert "https://cdn.example.com/img-300.jpg" in seen
+        assert "https://cdn.example.com/img-600.jpg" in seen
+
+    def test_extract_img_srcset_mixed_relative_external(self):
+        markup = (
+            '<img srcset="img-300.jpg 300w, https://cdn.example.com/img-600.jpg 600w">'
+        )
+        refs = extract_urls_from_html(markup, "index.html")
+        from_srcset = [ref for ref in refs if ref.context == "html_srcset"]
+        assert len(from_srcset) == 1
+        assert from_srcset[0].url == "https://cdn.example.com/img-600.jpg"
+
+    def test_extract_link_stylesheet(self):
+        markup = '<link rel="stylesheet" href="https://fonts.googleapis.com/css">'
+        refs = extract_urls_from_html(markup, "index.html")
+        assert len(refs) == 1
+        assert refs[0].url == "https://fonts.googleapis.com/css"
+        assert refs[0].tag == "link"
+        assert refs[0].attr == "href"
+
+    def test_extract_link_non_stylesheet_ignored(self):
+        markup = '<link rel="icon" href="https://example.com/favicon.ico">'
+        refs = extract_urls_from_html(markup, "index.html")
+        assert len(refs) == 0
+
+    def test_extract_script_src(self):
+        markup = '<script src="https://cdn.example.com/lib.js"></script>'
+        refs = extract_urls_from_html(markup, "index.html")
+        assert len(refs) == 1
+        assert refs[0].url == "https://cdn.example.com/lib.js"
+        assert refs[0].tag == "script"
+
+    def test_extract_source_src(self):
+        markup = '<source src="https://example.com/video.mp4" type="video/mp4">'
+        refs = extract_urls_from_html(markup, "index.html")
+        assert len(refs) == 1
+        assert refs[0].url == "https://example.com/video.mp4"
+        assert refs[0].tag == "source"
+
+    def test_extract_source_srcset(self):
+        markup = '<source srcset="https://example.com/img-lg.jpg 1024w">'
+        refs = extract_urls_from_html(markup, "index.html")
+        from_srcset = [ref for ref in refs if ref.context == "html_srcset"]
+        assert len(from_srcset) == 1
+        assert from_srcset[0].url == "https://example.com/img-lg.jpg"
+
+    def test_extract_background_image(self):
+        markup = """<div style="background-image: url('https://example.com/bg.png')">text</div>"""
+        refs = extract_urls_from_html(markup, "index.html")
+        from_css = [ref for ref in refs if ref.context == "css_url"]
+        assert len(from_css) == 1
+        assert from_css[0].url == "https://example.com/bg.png"
+
+    def test_extract_style_block(self):
+        markup = """
+        <html><head>
+        <style>body { background: url('https://example.com/bg.png') }</style>
+        </head><body></body></html>
+        """
+        refs = extract_urls_from_html(markup, "index.html")
+        assert len(refs) == 1
+        assert refs[0].url == "https://example.com/bg.png"
+
+    def test_relative_urls_ignored(self):
+        markup = '<img src="images/photo.jpg"><script src="js/app.js"></script>'
+        refs = extract_urls_from_html(markup, "index.html")
+        assert len(refs) == 0
+
+    def test_data_urls_ignored(self):
+        markup = '<img src="data:image/png;base64,abc123">'
+        refs = extract_urls_from_html(markup, "index.html")
+        assert len(refs) == 0
+
+    def test_empty_html(self):
+        refs = extract_urls_from_html("", "index.html")
+        assert len(refs) == 0
+
+    def test_minimal_html(self):
+        refs = extract_urls_from_html("<html><body></body></html>", "index.html")
+        assert len(refs) == 0
+
+    def test_multiple_elements(self):
+        markup = """
+        <html><body>
+        <img src="https://cdn.example.com/img1.jpg">
+        <img src="https://cdn.example.com/img2.jpg">
+        <script src="https://cdn.example.com/app.js"></script>
+        </body></html>
+        """
+        refs = extract_urls_from_html(markup, "index.html")
+        assert len(refs) == 3
+        seen = {ref.url for ref in refs}
+        assert "https://cdn.example.com/img1.jpg" in seen
+        assert "https://cdn.example.com/img2.jpg" in seen
+        assert "https://cdn.example.com/app.js" in seen
+
+
+# ---------------------------------------------------------------------------
+# extract_urls_from_h5p_json tests
+# ---------------------------------------------------------------------------
+
+
+class TestExtractUrlsFromH5PJSON:
+    def test_extract_external_path(self):
+        payload = json.dumps(
+            {
+                "video": {
+                    "files": [
+                        {"path": "https://h5p.org/sites/default/files/h5p/iv.mp4"}
+                    ]
+                }
+            }
+        )
+        refs = extract_urls_from_h5p_json(payload, "content.json")
+        assert len(refs) == 1
+        assert refs[0].url == "https://h5p.org/sites/default/files/h5p/iv.mp4"
+        assert refs[0].context == "h5p_json"
+        assert refs[0].source_file == "content.json"
+
+    def test_relative_path_ignored(self):
+        payload = json.dumps({"image": {"path": "images/photo.jpg"}})
+        refs = extract_urls_from_h5p_json(payload, "content.json")
+        assert len(refs) == 0
+
+    def test_deeply_nested(self):
+        payload = json.dumps(
+            {
+                "level1": {
+                    "level2": {
+                        "level3": [
+                            {"path": "https://cdn.example.com/deep/resource.mp4"}
+                        ]
+                    }
+                }
+            }
+        )
+        refs = extract_urls_from_h5p_json(payload, "content.json")
+        assert len(refs) == 1
+        assert refs[0].url == "https://cdn.example.com/deep/resource.mp4"
+
+    def test_multiple_paths(self):
+        payload = json.dumps(
+            {
+                "video": {"path": "https://example.com/video.mp4"},
+                "image": {"path": "https://example.com/image.jpg"},
+                "local": {"path": "images/local.jpg"},
+            }
+        )
+        refs = extract_urls_from_h5p_json(payload, "content.json")
+        assert len(refs) == 2
+        seen = {ref.url for ref in refs}
+        assert "https://example.com/video.mp4" in seen
+        assert "https://example.com/image.jpg" in seen
+
+    def test_empty_json(self):
+        refs = extract_urls_from_h5p_json("{}", "content.json")
+        assert len(refs) == 0
+
+    def test_non_string_path_ignored(self):
+        payload = json.dumps({"path": 42})
+        refs = extract_urls_from_h5p_json(payload, "content.json")
+        assert len(refs) == 0
+
+
+# ---------------------------------------------------------------------------
+# rewrite_urls_in_css tests
+# ---------------------------------------------------------------------------
+
+
+class TestRewriteUrlsInCSS:
+    def test_rewrite_url(self):
+        css = "body { background: url('https://example.com/bg.png') }"
+        url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"}
+        result = rewrite_urls_in_css(css, url_map)
+        assert "url('_external/example.com/bg.png')" in result
+        assert "https://example.com/bg.png" not in result
+
+    def test_rewrite_import(self):
+        css = "@import 'https://fonts.googleapis.com/css?family=Roboto';"
+        url_map = {
+            "https://fonts.googleapis.com/css?family=Roboto": "_external/fonts.googleapis.com/css"
+        }
+        result = rewrite_urls_in_css(css, url_map)
+        assert "_external/fonts.googleapis.com/css" in result
+        assert "https://fonts.googleapis.com/css?family=Roboto" not in result
+
+    def test_rewrite_preserves_unmapped(self):
+        css = """
+        body { background: url('https://example.com/bg.png') }
+        .other { background: url('https://other.com/bg.png') }
+        """
+        url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"}
+        result = rewrite_urls_in_css(css, url_map)
+        assert "_external/example.com/bg.png" in result
+        assert "https://other.com/bg.png" in result
+
+    def test_rewrite_url_no_quotes(self):
+        css = "body { background: url(https://example.com/bg.png) }"
+        url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"}
+        result = rewrite_urls_in_css(css, url_map)
+        assert "_external/example.com/bg.png" in result
+
+    def test_empty_map(self):
+        css = "body { background: url('https://example.com/bg.png') }"
+        result = rewrite_urls_in_css(css, {})
+        assert "https://example.com/bg.png" in result
+
+
+# ---------------------------------------------------------------------------
+# rewrite_urls_in_html tests
+# ---------------------------------------------------------------------------
+
+
+class TestRewriteUrlsInHTML:
+    def test_rewrite_img_src(self):
+        html = '<img src="https://cdn.example.com/photo.jpg">'
+        url_map = {
+            "https://cdn.example.com/photo.jpg": "_external/cdn.example.com/photo.jpg"
+        }
+        result = rewrite_urls_in_html(html, url_map)
+        assert "_external/cdn.example.com/photo.jpg" in result
+        assert "https://cdn.example.com/photo.jpg" not in result
+
+    def test_rewrite_srcset(self):
+        html = '<img srcset="https://cdn.example.com/img-300.jpg 300w, https://cdn.example.com/img-600.jpg 600w">'
+        url_map = {
+            "https://cdn.example.com/img-300.jpg": "_external/cdn.example.com/img-300.jpg",
+            "https://cdn.example.com/img-600.jpg": "_external/cdn.example.com/img-600.jpg",
+        }
+        result = rewrite_urls_in_html(html, url_map)
+        assert "_external/cdn.example.com/img-300.jpg" in result
+        assert "_external/cdn.example.com/img-600.jpg" in result
+
+    def test_rewrite_style_block(self):
+        html = "<style>body { background: url('https://example.com/bg.png') }</style>"
+        url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"}
+        result = rewrite_urls_in_html(html, url_map)
+        assert "_external/example.com/bg.png" in result
+        assert "https://example.com/bg.png" not in result
+
+    def test_rewrite_inline_style(self):
+        html = """<div style="background-image: url('https://example.com/bg.png')">text</div>"""
+        url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"}
+        result = rewrite_urls_in_html(html, url_map)
+        assert "_external/example.com/bg.png" in result
+
+    def test_rewrite_preserves_unmapped(self):
+        html = """
+        <img src="https://cdn.example.com/img1.jpg">
+        <img src="https://cdn.example.com/img2.jpg">
+        """
+        url_map = {
+            "https://cdn.example.com/img1.jpg": "_external/cdn.example.com/img1.jpg"
+        }
+        result = rewrite_urls_in_html(html, url_map)
+        assert "_external/cdn.example.com/img1.jpg" in result
+        assert "https://cdn.example.com/img2.jpg" in result
+
+    def test_rewrite_link_href(self):
+        html = '<link rel="stylesheet" href="https://fonts.googleapis.com/css">'
+        url_map = {
+            "https://fonts.googleapis.com/css": "_external/fonts.googleapis.com/css"
+        }
+        result = rewrite_urls_in_html(html, url_map)
+        assert "_external/fonts.googleapis.com/css" in result
+
+    def test_rewrite_script_src(self):
+        html = '<script src="https://cdn.example.com/lib.js"></script>'
+        url_map = {"https://cdn.example.com/lib.js": "_external/cdn.example.com/lib.js"}
+        result = rewrite_urls_in_html(html, url_map)
+        assert "_external/cdn.example.com/lib.js" in result
+
+
+# ---------------------------------------------------------------------------
+# rewrite_urls_in_h5p_json tests
+# ---------------------------------------------------------------------------
+
+
+class TestRewriteUrlsInH5PJSON:
+    def test_rewrite_path(self):
+        data = json.dumps(
+            {"video": {"path": "https://h5p.org/sites/default/files/h5p/iv.mp4"}}
+        )
+        url_map = {
+            "https://h5p.org/sites/default/files/h5p/iv.mp4": "_external/h5p.org/sites/default/files/h5p/iv.mp4"
+        }
+        result = rewrite_urls_in_h5p_json(data, url_map)
+        parsed = json.loads(result)
+        assert (
+            parsed["video"]["path"]
+            == "_external/h5p.org/sites/default/files/h5p/iv.mp4"
+        )
+
+    def test_rewrite_preserves_unmapped(self):
+        data = json.dumps(
+            {
+                "video": {"path": "https://example.com/video.mp4"},
+                "image": {"path": "https://other.com/image.jpg"},
+            }
+        )
+        url_map = {"https://example.com/video.mp4": "_external/example.com/video.mp4"}
+        result = rewrite_urls_in_h5p_json(data, url_map)
+        parsed = json.loads(result)
+        assert parsed["video"]["path"] == "_external/example.com/video.mp4"
+        assert parsed["image"]["path"] == "https://other.com/image.jpg"
+
+    def test_rewrite_deeply_nested(self):
+        data = json.dumps({"a": {"b": [{"path": "https://example.com/deep.mp4"}]}})
+        url_map = {"https://example.com/deep.mp4": "_external/example.com/deep.mp4"}
+        result = rewrite_urls_in_h5p_json(data, url_map)
+        parsed = json.loads(result)
+        assert parsed["a"]["b"][0]["path"] == "_external/example.com/deep.mp4"
+
+    def test_rewrite_relative_path_unchanged(self):
+        data = json.dumps({"image": {"path": "images/photo.jpg"}})
+        url_map = {"https://example.com/video.mp4": "_external/example.com/video.mp4"}
+        result = rewrite_urls_in_h5p_json(data, url_map)
+        parsed = json.loads(result)
+        assert parsed["image"]["path"] == "images/photo.jpg"
+
+    def test_empty_map(self):
+        data = json.dumps({"video": {"path": "https://example.com/video.mp4"}})
+        result = rewrite_urls_in_h5p_json(data, {})
+        parsed = json.loads(result)
+        assert parsed["video"]["path"] == "https://example.com/video.mp4"