From 69f422ff3e8b8306a09643c5effc09064271b75f Mon Sep 17 00:00:00 2001 From: rtibblesbot Date: Fri, 13 Feb 2026 00:41:53 -0800 Subject: [PATCH 1/4] Add shared URL extraction and rewriting utilities for archive processing Extract URL detection and rewriting logic into standalone, testable utility functions in url_utils.py. These operate on file contents (strings) rather than requiring HTTP sessions, addressing the testability concerns in #303. Supported reference types: - HTML/XML: src, href, srcset attributes; inline style; + + """ + urls = extract_urls_from_html(html, "index.html") + assert len(urls) == 1 + assert urls[0].url == "https://example.com/bg.png" + + def test_relative_urls_ignored(self): + html = '' + urls = extract_urls_from_html(html, "index.html") + assert len(urls) == 0 + + def test_data_urls_ignored(self): + html = '' + urls = extract_urls_from_html(html, "index.html") + assert len(urls) == 0 + + def test_empty_html(self): + urls = extract_urls_from_html("", "index.html") + assert len(urls) == 0 + + def test_minimal_html(self): + urls = extract_urls_from_html("", "index.html") + assert len(urls) == 0 + + def test_multiple_elements(self): + html = """ + + + + + + """ + urls = extract_urls_from_html(html, "index.html") + assert len(urls) == 3 + extracted = {u.url for u in urls} + assert "https://cdn.example.com/img1.jpg" in extracted + assert "https://cdn.example.com/img2.jpg" in extracted + assert "https://cdn.example.com/app.js" in extracted + + +# --------------------------------------------------------------------------- +# extract_urls_from_h5p_json tests +# --------------------------------------------------------------------------- + + +class TestExtractUrlsFromH5PJSON: + def test_extract_external_path(self): + data = json.dumps( + { + "video": { + "files": [ + {"path": "https://h5p.org/sites/default/files/h5p/iv.mp4"} + ] + } + } + ) + urls = extract_urls_from_h5p_json(data, "content.json") + assert len(urls) == 1 + assert urls[0].url == "https://h5p.org/sites/default/files/h5p/iv.mp4" + assert urls[0].context == "h5p_json" + assert urls[0].source_file == "content.json" + + def test_relative_path_ignored(self): + data = json.dumps({"image": {"path": "images/photo.jpg"}}) + urls = extract_urls_from_h5p_json(data, "content.json") + assert len(urls) == 0 + + def test_deeply_nested(self): + data = json.dumps( + { + "level1": { + "level2": { + "level3": [ + { + "path": "https://cdn.example.com/deep/resource.mp4" + } + ] + } + } + } + ) + urls = extract_urls_from_h5p_json(data, "content.json") + assert len(urls) == 1 + assert urls[0].url == "https://cdn.example.com/deep/resource.mp4" + + def test_multiple_paths(self): + data = json.dumps( + { + "video": {"path": "https://example.com/video.mp4"}, + "image": {"path": "https://example.com/image.jpg"}, + "local": {"path": "images/local.jpg"}, + } + ) + urls = extract_urls_from_h5p_json(data, "content.json") + assert len(urls) == 2 + extracted = {u.url for u in urls} + assert "https://example.com/video.mp4" in extracted + assert "https://example.com/image.jpg" in extracted + + def test_empty_json(self): + urls = extract_urls_from_h5p_json("{}", "content.json") + assert len(urls) == 0 + + def test_non_string_path_ignored(self): + data = json.dumps({"path": 42}) + urls = extract_urls_from_h5p_json(data, "content.json") + assert len(urls) == 0 + + +# --------------------------------------------------------------------------- +# rewrite_urls_in_css tests +# --------------------------------------------------------------------------- + + +class TestRewriteUrlsInCSS: + def test_rewrite_url(self): + css = "body { background: url('https://example.com/bg.png') }" + url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"} + result = rewrite_urls_in_css(css, url_map) + assert "url('_external/example.com/bg.png')" in result + assert "https://example.com/bg.png" not in result + + def test_rewrite_import(self): + css = "@import 'https://fonts.googleapis.com/css?family=Roboto';" + url_map = { + "https://fonts.googleapis.com/css?family=Roboto": "_external/fonts.googleapis.com/css" + } + result = rewrite_urls_in_css(css, url_map) + assert "_external/fonts.googleapis.com/css" in result + assert "https://fonts.googleapis.com/css?family=Roboto" not in result + + def test_rewrite_preserves_unmapped(self): + css = """ + body { background: url('https://example.com/bg.png') } + .other { background: url('https://other.com/bg.png') } + """ + url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"} + result = rewrite_urls_in_css(css, url_map) + assert "_external/example.com/bg.png" in result + assert "https://other.com/bg.png" in result + + def test_rewrite_url_no_quotes(self): + css = "body { background: url(https://example.com/bg.png) }" + url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"} + result = rewrite_urls_in_css(css, url_map) + assert "_external/example.com/bg.png" in result + + def test_empty_map(self): + css = "body { background: url('https://example.com/bg.png') }" + result = rewrite_urls_in_css(css, {}) + assert "https://example.com/bg.png" in result + + +# --------------------------------------------------------------------------- +# rewrite_urls_in_html tests +# --------------------------------------------------------------------------- + + +class TestRewriteUrlsInHTML: + def test_rewrite_img_src(self): + html = '' + url_map = { + "https://cdn.example.com/photo.jpg": "_external/cdn.example.com/photo.jpg" + } + result = rewrite_urls_in_html(html, url_map) + assert "_external/cdn.example.com/photo.jpg" in result + assert "https://cdn.example.com/photo.jpg" not in result + + def test_rewrite_srcset(self): + html = '' + url_map = { + "https://cdn.example.com/img-300.jpg": "_external/cdn.example.com/img-300.jpg", + "https://cdn.example.com/img-600.jpg": "_external/cdn.example.com/img-600.jpg", + } + result = rewrite_urls_in_html(html, url_map) + assert "_external/cdn.example.com/img-300.jpg" in result + assert "_external/cdn.example.com/img-600.jpg" in result + + def test_rewrite_style_block(self): + html = "" + url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"} + result = rewrite_urls_in_html(html, url_map) + assert "_external/example.com/bg.png" in result + assert "https://example.com/bg.png" not in result + + def test_rewrite_inline_style(self): + html = """
text
""" + url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"} + result = rewrite_urls_in_html(html, url_map) + assert "_external/example.com/bg.png" in result + + def test_rewrite_preserves_unmapped(self): + html = """ + + + """ + url_map = { + "https://cdn.example.com/img1.jpg": "_external/cdn.example.com/img1.jpg" + } + result = rewrite_urls_in_html(html, url_map) + assert "_external/cdn.example.com/img1.jpg" in result + assert "https://cdn.example.com/img2.jpg" in result + + def test_rewrite_link_href(self): + html = '' + url_map = { + "https://fonts.googleapis.com/css": "_external/fonts.googleapis.com/css" + } + result = rewrite_urls_in_html(html, url_map) + assert "_external/fonts.googleapis.com/css" in result + + def test_rewrite_script_src(self): + html = '' + url_map = { + "https://cdn.example.com/lib.js": "_external/cdn.example.com/lib.js" + } + result = rewrite_urls_in_html(html, url_map) + assert "_external/cdn.example.com/lib.js" in result + + +# --------------------------------------------------------------------------- +# rewrite_urls_in_h5p_json tests +# --------------------------------------------------------------------------- + + +class TestRewriteUrlsInH5PJSON: + def test_rewrite_path(self): + data = json.dumps( + {"video": {"path": "https://h5p.org/sites/default/files/h5p/iv.mp4"}} + ) + url_map = { + "https://h5p.org/sites/default/files/h5p/iv.mp4": "_external/h5p.org/sites/default/files/h5p/iv.mp4" + } + result = rewrite_urls_in_h5p_json(data, url_map) + parsed = json.loads(result) + assert ( + parsed["video"]["path"] + == "_external/h5p.org/sites/default/files/h5p/iv.mp4" + ) + + def test_rewrite_preserves_unmapped(self): + data = json.dumps( + { + "video": {"path": "https://example.com/video.mp4"}, + "image": {"path": "https://other.com/image.jpg"}, + } + ) + url_map = { + "https://example.com/video.mp4": "_external/example.com/video.mp4" + } + result = rewrite_urls_in_h5p_json(data, url_map) + parsed = json.loads(result) + assert parsed["video"]["path"] == "_external/example.com/video.mp4" + assert parsed["image"]["path"] == "https://other.com/image.jpg" + + def test_rewrite_deeply_nested(self): + data = json.dumps( + { + "a": { + "b": [{"path": "https://example.com/deep.mp4"}] + } + } + ) + url_map = { + "https://example.com/deep.mp4": "_external/example.com/deep.mp4" + } + result = rewrite_urls_in_h5p_json(data, url_map) + parsed = json.loads(result) + assert parsed["a"]["b"][0]["path"] == "_external/example.com/deep.mp4" + + def test_rewrite_relative_path_unchanged(self): + data = json.dumps({"image": {"path": "images/photo.jpg"}}) + url_map = { + "https://example.com/video.mp4": "_external/example.com/video.mp4" + } + result = rewrite_urls_in_h5p_json(data, url_map) + parsed = json.loads(result) + assert parsed["image"]["path"] == "images/photo.jpg" + + def test_empty_map(self): + data = json.dumps({"video": {"path": "https://example.com/video.mp4"}}) + result = rewrite_urls_in_h5p_json(data, {}) + parsed = json.loads(result) + assert parsed["video"]["path"] == "https://example.com/video.mp4" From 6e533fd4453d33507275a52cb3cf9f9617c5cd2b Mon Sep 17 00:00:00 2001 From: rtibblesbot Date: Fri, 13 Feb 2026 00:42:01 -0800 Subject: [PATCH 2/4] Add archive external reference processor for offline content bundling Build on url_utils to create an archive-level processor that: 1. Extracts archives to a temp directory 2. Scans text-based files (HTML, CSS, JSON) for external URL references 3. Downloads external resources into an _external/ subdirectory 4. Rewrites references to point to local copies 5. Returns the temp directory path for create_predictable_zip() Handles edge cases including download failures (preserves original URL), duplicate URL deduplication, recursive CSS downloads (fonts referenced by stylesheets), configurable URL blacklisting, and loop detection. Co-Authored-By: Claude Opus 4.6 --- ricecooker/utils/archive_assets.py | 256 +++++++++++++++ tests/test_archive_assets.py | 492 +++++++++++++++++++++++++++++ 2 files changed, 748 insertions(+) create mode 100644 ricecooker/utils/archive_assets.py create mode 100644 tests/test_archive_assets.py diff --git a/ricecooker/utils/archive_assets.py b/ricecooker/utils/archive_assets.py new file mode 100644 index 00000000..1acf2d03 --- /dev/null +++ b/ricecooker/utils/archive_assets.py @@ -0,0 +1,256 @@ +""" +Archive external reference processor. + +Opens an archive (ZIP/H5P), scans text-based files for external URL references, +downloads those resources, bundles them into the archive, and rewrites references +to point to local copies. +""" + +import logging +import os +import tempfile +import zipfile + +from ricecooker.utils.downloader import make_request +from ricecooker.utils.url_utils import ( + derive_local_filename, + extract_urls_from_css, + extract_urls_from_h5p_json, + extract_urls_from_html, + rewrite_urls_in_css, + rewrite_urls_in_h5p_json, + rewrite_urls_in_html, +) + +logger = logging.getLogger(__name__) + +# Map file extensions to content type for selecting the right extractor/rewriter +_TEXT_EXTENSIONS = { + ".html": "html", + ".htm": "html", + ".xhtml": "html", + ".xml": "html", + ".css": "css", + ".json": "json", +} + + +def _is_h5p_content_json(filepath): + """Check if a JSON file is an H5P content.json that should be scanned.""" + normalized = filepath.replace("\\", "/") + return normalized == "content/content.json" or normalized.endswith( + "/content/content.json" + ) + + +def _detect_content_type(filepath): + """Detect the content type of a file based on its extension.""" + ext = os.path.splitext(filepath)[1].lower() + if ext == ".json": + if _is_h5p_content_json(filepath): + return "json" + return None # Skip non-H5P JSON files + return _TEXT_EXTENSIONS.get(ext) + + +def _compute_relative_path(from_file, to_file): + """Compute relative path from one file to another within the archive.""" + from_dir = os.path.dirname(from_file) + return os.path.relpath(to_file, from_dir).replace("\\", "/") + + +def _is_blacklisted(url, blacklist): + """Check if a URL matches any blacklist substring.""" + if not blacklist: + return False + return any(pattern in url for pattern in blacklist) + + +def _download_external_url(url, dest_dir, local_path): + """ + Download a single external URL to the destination directory. + + Returns True on success, False on failure. + """ + full_path = os.path.join(dest_dir, local_path) + # Guard against path traversal — resolved path must stay within dest_dir + resolved = os.path.realpath(full_path) + if not resolved.startswith(os.path.realpath(dest_dir) + os.sep): + logger.warning("Path traversal detected for %s, skipping download", url) + return False + os.makedirs(os.path.dirname(full_path), exist_ok=True) + + try: + response = make_request(url) + if response is None or response.status_code != 200: + logger.warning("Failed to download %s (no response or non-200)", url) + return False + with open(full_path, "wb") as f: + f.write(response.content) + return True + except Exception: + logger.warning("Error downloading %s", url, exc_info=True) + return False + + +def download_and_rewrite_external_refs(archive_path, url_blacklist=None): + """ + Process an archive to download external URL references and rewrite them + to local paths. + + Args: + archive_path: Path to the archive file (ZIP or H5P) + url_blacklist: Optional list of URL substrings to skip + + Returns: + Path to a temporary directory containing the processed archive contents. + The caller is responsible for cleaning up this directory. + """ + # Extract archive to temp directory + temp_dir = tempfile.mkdtemp(prefix="ricecooker_archive_") + + with zipfile.ZipFile(archive_path, "r") as zf: + zf.extractall(temp_dir) + + # Phase 1: Scan all text files for external URLs + all_urls = {} # url -> derive_local_filename result + file_urls = {} # filepath -> list of extracted URLs + + for root, _dirs, filenames in os.walk(temp_dir): + for filename in filenames: + full_path = os.path.join(root, filename) + rel_path = os.path.relpath(full_path, temp_dir) + content_type = _detect_content_type(rel_path) + + if content_type is None: + continue + + try: + with open(full_path, "r", encoding="utf-8") as f: + content = f.read() + except (UnicodeDecodeError, OSError): + logger.warning("Could not read %s as text, skipping", rel_path) + continue + + if content_type == "html": + extracted = extract_urls_from_html(content, rel_path) + elif content_type == "css": + extracted = extract_urls_from_css(content, rel_path) + elif content_type == "json": + extracted = extract_urls_from_h5p_json(content, rel_path) + else: + continue + + # Filter out blacklisted URLs + external = [ + e + for e in extracted + if not _is_blacklisted(e.url, url_blacklist) + ] + + if external: + file_urls[rel_path] = external + for e in external: + if e.url not in all_urls: + all_urls[e.url] = derive_local_filename(e.url) + + if not all_urls: + return temp_dir + + # Phase 2: Download all external URLs + successful_downloads = set() + visited_urls = set() + + for url, local_path in list(all_urls.items()): + if url in visited_urls: + continue + visited_urls.add(url) + + if _download_external_url(url, temp_dir, local_path): + successful_downloads.add(url) + + # CSS recursive download: scan downloaded CSS for more external refs + if local_path.endswith(".css") or "css" in local_path.split("?")[0]: + _process_downloaded_css( + temp_dir, + local_path, + all_urls, + successful_downloads, + visited_urls, + url_blacklist, + ) + + # Phase 3: Rewrite references in text files + url_map_by_file = {} + for rel_path, extracted_list in file_urls.items(): + url_map = {} + for e in extracted_list: + if e.url in successful_downloads: + local_path = all_urls[e.url] + url_map[e.url] = _compute_relative_path(rel_path, local_path) + if url_map: + url_map_by_file[rel_path] = url_map + + for rel_path, url_map in url_map_by_file.items(): + full_path = os.path.join(temp_dir, rel_path) + content_type = _detect_content_type(rel_path) + + with open(full_path, "r", encoding="utf-8") as f: + content = f.read() + + if content_type == "html": + content = rewrite_urls_in_html(content, url_map) + elif content_type == "css": + content = rewrite_urls_in_css(content, url_map) + elif content_type == "json": + content = rewrite_urls_in_h5p_json(content, url_map) + + with open(full_path, "w", encoding="utf-8") as f: + f.write(content) + + return temp_dir + + +def _process_downloaded_css( + temp_dir, css_local_path, all_urls, successful_downloads, visited_urls, url_blacklist +): + """Scan a downloaded CSS file for additional external references and download them.""" + full_path = os.path.join(temp_dir, css_local_path) + try: + with open(full_path, "r", encoding="utf-8") as f: + css_content = f.read() + except (UnicodeDecodeError, OSError): + return + + extracted = extract_urls_from_css(css_content, css_local_path) + external = [ + e for e in extracted if not _is_blacklisted(e.url, url_blacklist) + ] + + if not external: + return + + # Download newly found external URLs + css_url_map = {} + for e in external: + if e.url in visited_urls: + continue + visited_urls.add(e.url) + + local_path = derive_local_filename(e.url) + all_urls[e.url] = local_path + + if _download_external_url(e.url, temp_dir, local_path): + successful_downloads.add(e.url) + css_url_map[e.url] = _compute_relative_path(css_local_path, local_path) + + # Also build map for any already-downloaded URLs referenced from this CSS + for e in external: + if e.url in successful_downloads and e.url not in css_url_map: + local_path = all_urls[e.url] + css_url_map[e.url] = _compute_relative_path(css_local_path, local_path) + + if css_url_map: + rewritten = rewrite_urls_in_css(css_content, css_url_map) + with open(full_path, "w", encoding="utf-8") as f: + f.write(rewritten) diff --git a/tests/test_archive_assets.py b/tests/test_archive_assets.py new file mode 100644 index 00000000..0faa30c6 --- /dev/null +++ b/tests/test_archive_assets.py @@ -0,0 +1,492 @@ +""" +Tests for ricecooker.utils.archive_assets — archive external reference processor. + +Tests create in-memory ZIP archives, call download_and_rewrite_external_refs, +and verify the output directory contents. HTTP downloads are mocked. +""" + +import json +import os +import shutil +import tempfile +import zipfile +from unittest.mock import patch + +import pytest + +from ricecooker.utils.archive_assets import download_and_rewrite_external_refs + + +class MockResponse: + """Mock HTTP response for mocked downloads.""" + + def __init__(self, content=b"downloaded content", status_code=200): + self.content = content + self.status_code = status_code + + def raise_for_status(self): + if self.status_code >= 400: + from requests.exceptions import HTTPError + + raise HTTPError(response=self) + + +def _create_zip(files_dict): + """Create a temporary ZIP file from a dict of {path: content}.""" + fd, zip_path = tempfile.mkstemp(suffix=".zip") + os.close(fd) + with zipfile.ZipFile(zip_path, "w") as zf: + for path, content in files_dict.items(): + if isinstance(content, str): + content = content.encode("utf-8") + zf.writestr(path, content) + return zip_path + + +@pytest.fixture +def mock_download(): + """Mock make_request to return predictable content.""" + with patch("ricecooker.utils.archive_assets.make_request") as mock: + mock.return_value = MockResponse(content=b"downloaded content") + yield mock + + +@pytest.fixture +def mock_download_css_then_font(): + """Mock that returns CSS on first call and font bytes on subsequent calls.""" + css_content = b"@font-face { src: url('https://fonts.example.com/roboto.woff2') }" + font_content = b"font-binary-data" + + responses = { + "https://fonts.googleapis.com/css": MockResponse(content=css_content), + "https://fonts.example.com/roboto.woff2": MockResponse(content=font_content), + } + + def side_effect(url, *args, **kwargs): + return responses.get(url, MockResponse(content=b"unknown", status_code=404)) + + with patch("ricecooker.utils.archive_assets.make_request") as mock: + mock.side_effect = side_effect + yield mock + + +# --------------------------------------------------------------------------- +# Basic functionality tests +# --------------------------------------------------------------------------- + + +class TestBasicFunctionality: + def test_html_with_external_img(self, mock_download): + zip_path = _create_zip( + { + "index.html": '' + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + # Image should be downloaded + img_path = os.path.join( + result_dir, "_external", "cdn.example.com", "photo.jpg" + ) + assert os.path.exists(img_path) + + # HTML should be rewritten + with open(os.path.join(result_dir, "index.html")) as f: + html = f.read() + assert "https://cdn.example.com/photo.jpg" not in html + assert "_external/cdn.example.com/photo.jpg" in html + + mock_download.assert_called_once() + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_html_with_external_css(self, mock_download): + zip_path = _create_zip( + { + "index.html": '' + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + css_path = os.path.join( + result_dir, "_external", "fonts.googleapis.com", "css" + ) + assert os.path.exists(css_path) + + with open(os.path.join(result_dir, "index.html")) as f: + html = f.read() + assert "https://fonts.googleapis.com/css" not in html + assert "_external/fonts.googleapis.com/css" in html + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_html_with_external_script(self, mock_download): + zip_path = _create_zip( + { + "index.html": '' + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + js_path = os.path.join( + result_dir, "_external", "cdn.example.com", "lib.js" + ) + assert os.path.exists(js_path) + + with open(os.path.join(result_dir, "index.html")) as f: + html = f.read() + assert "_external/cdn.example.com/lib.js" in html + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_css_with_external_font(self, mock_download): + zip_path = _create_zip( + { + "styles/main.css": "@font-face { src: url('https://fonts.example.com/roboto.woff2') format('woff2'); }" + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + font_path = os.path.join( + result_dir, + "_external", + "fonts.example.com", + "roboto.woff2", + ) + assert os.path.exists(font_path) + + with open(os.path.join(result_dir, "styles", "main.css")) as f: + css = f.read() + assert "https://fonts.example.com/roboto.woff2" not in css + # Path should be relative from styles/ to _external/ + assert "../_external/fonts.example.com/roboto.woff2" in css + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_css_with_import(self, mock_download): + zip_path = _create_zip( + { + "style.css": "@import 'https://fonts.googleapis.com/css?family=Roboto';" + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + css_path = os.path.join( + result_dir, + "_external", + "fonts.googleapis.com", + "css?family=Roboto", + ) + assert os.path.exists(css_path) + + with open(os.path.join(result_dir, "style.css")) as f: + css = f.read() + assert "https://fonts.googleapis.com" not in css + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_h5p_with_external_video(self, mock_download): + content_json = json.dumps( + { + "video": { + "files": [ + { + "path": "https://h5p.org/sites/default/files/h5p/iv.mp4", + "mime": "video/mp4", + } + ] + } + } + ) + zip_path = _create_zip( + { + "content/content.json": content_json, + "h5p.json": '{"mainLibrary": "H5P.InteractiveVideo"}', + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + video_path = os.path.join( + result_dir, + "_external", + "h5p.org", + "sites", + "default", + "files", + "h5p", + "iv.mp4", + ) + assert os.path.exists(video_path) + + with open( + os.path.join(result_dir, "content", "content.json") + ) as f: + data = json.load(f) + path_val = data["video"]["files"][0]["path"] + assert "https://h5p.org" not in path_val + assert "_external/" in path_val + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + +# --------------------------------------------------------------------------- +# Edge case tests +# --------------------------------------------------------------------------- + + +class TestEdgeCases: + def test_relative_urls_unchanged(self, mock_download): + zip_path = _create_zip( + { + "index.html": '', + "images/photo.jpg": b"fake-image-data", + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + with open(os.path.join(result_dir, "index.html")) as f: + html = f.read() + assert 'src="images/photo.jpg"' in html + mock_download.assert_not_called() + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_data_urls_unchanged(self, mock_download): + zip_path = _create_zip( + { + "index.html": '' + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + with open(os.path.join(result_dir, "index.html")) as f: + html = f.read() + assert "data:image/png;base64,abc123" in html + mock_download.assert_not_called() + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_download_failure_preserves_original(self): + with patch("ricecooker.utils.archive_assets.make_request") as mock: + mock.return_value = None # Simulate failed download + zip_path = _create_zip( + { + "index.html": '' + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + with open(os.path.join(result_dir, "index.html")) as f: + html = f.read() + # Original URL should be preserved when download fails + assert "https://cdn.example.com/photo.jpg" in html + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_duplicate_urls_downloaded_once(self, mock_download): + zip_path = _create_zip( + { + "page1.html": '', + "page2.html": '', + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + # Download should only happen once for the same URL + mock_download.assert_called_once() + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_css_recursive_download(self, mock_download_css_then_font): + zip_path = _create_zip( + { + "index.html": '' + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + # Both CSS and font should be downloaded + css_path = os.path.join( + result_dir, "_external", "fonts.googleapis.com", "css" + ) + font_path = os.path.join( + result_dir, + "_external", + "fonts.example.com", + "roboto.woff2", + ) + assert os.path.exists(css_path) + assert os.path.exists(font_path) + + # The downloaded CSS should have its font URL rewritten too + with open(css_path) as f: + css = f.read() + assert "https://fonts.example.com/roboto.woff2" not in css + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_empty_archive(self, mock_download): + zip_path = _create_zip({"empty.txt": ""}) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + assert os.path.isdir(result_dir) + mock_download.assert_not_called() + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_binary_files_untouched(self, mock_download): + binary_content = b"\x89PNG\r\n\x1a\n" + b"\x00" * 100 + zip_path = _create_zip( + { + "index.html": '', + "images/local.png": binary_content, + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + with open(os.path.join(result_dir, "images", "local.png"), "rb") as f: + assert f.read() == binary_content + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_path_traversal_url_stays_in_temp_dir(self, mock_download): + zip_path = _create_zip( + { + "index.html": '' + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + # The downloaded file must be inside the result directory + for root, _dirs, filenames in os.walk(result_dir): + for filename in filenames: + full_path = os.path.join(root, filename) + assert os.path.realpath(full_path).startswith( + os.path.realpath(result_dir) + ), f"File {full_path} escapes result directory" + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_blacklisted_urls_skipped(self, mock_download): + zip_path = _create_zip( + { + "index.html": """ + + + """ + } + ) + try: + result_dir = download_and_rewrite_external_refs( + zip_path, url_blacklist=["blocked.example.com"] + ) + try: + with open(os.path.join(result_dir, "index.html")) as f: + html = f.read() + # Allowed URL should be downloaded and rewritten + assert "_external/cdn.example.com/photo.jpg" in html + # Blocked URL should remain unchanged + assert "https://blocked.example.com/img.jpg" in html + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + +# --------------------------------------------------------------------------- +# Integration shape tests +# --------------------------------------------------------------------------- + + +class TestIntegrationShape: + def test_returns_directory_path(self, mock_download): + zip_path = _create_zip({"index.html": ""}) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + assert os.path.isdir(result_dir) + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_original_files_preserved(self, mock_download): + zip_path = _create_zip( + { + "index.html": '', + "images/local.png": b"png-data", + "scripts/app.js": "console.log('hello');", + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + assert os.path.exists(os.path.join(result_dir, "index.html")) + assert os.path.exists( + os.path.join(result_dir, "images", "local.png") + ) + assert os.path.exists( + os.path.join(result_dir, "scripts", "app.js") + ) + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) + + def test_external_files_in_subdirectory(self, mock_download): + zip_path = _create_zip( + { + "index.html": '' + } + ) + try: + result_dir = download_and_rewrite_external_refs(zip_path) + try: + external_dir = os.path.join(result_dir, "_external") + assert os.path.isdir(external_dir) + finally: + shutil.rmtree(result_dir, ignore_errors=True) + finally: + os.unlink(zip_path) From 1eb128d344b0e7ebff5d5e40a888846103f1da2a Mon Sep 17 00:00:00 2001 From: rtibblesbot Date: Fri, 13 Feb 2026 00:42:08 -0800 Subject: [PATCH 3/4] Integrate external reference processing into archive pipeline handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire process_archive_external_references() into the archive pipeline, running before create_predictable_zip() in handle_file(). This ensures external URLs in H5P, HTML5, and IMSCP archives are downloaded and bundled for offline use in Kolibri. The processing step runs for all ArchiveProcessingBaseHandler subclasses (H5P, HTML5, IMSCP) and gracefully handles failures — if downloading fails, the original archive is used unchanged. Closes #233 Co-Authored-By: Claude Opus 4.6 --- ricecooker/utils/pipeline/convert.py | 26 +++++- tests/pipeline/test_convert.py | 134 +++++++++++++++++++++++++++ 2 files changed, 159 insertions(+), 1 deletion(-) diff --git a/ricecooker/utils/pipeline/convert.py b/ricecooker/utils/pipeline/convert.py index 4c9e209b..fef557e7 100644 --- a/ricecooker/utils/pipeline/convert.py +++ b/ricecooker/utils/pipeline/convert.py @@ -46,6 +46,7 @@ from ricecooker.utils.videos import validate_media_file from ricecooker.utils.videos import VideoCompressionError from ricecooker.utils.youtube import get_language_with_alpha2_fallback +from ricecooker.utils.archive_assets import download_and_rewrite_external_refs from ricecooker.utils.zip import create_predictable_zip @@ -194,11 +195,32 @@ def FILE_TYPE(self) -> str: def validate_archive(self, path: str): pass + def _process_external_refs(self, path): + """ + Process external URL references in the archive. + + Returns the path to process — either a temp directory with downloaded + assets, or the original path if processing fails or finds nothing. + """ + try: + return download_and_rewrite_external_refs(path) + except Exception as e: + config.LOGGER.warning( + "Failed to process external references in %s: %s. " + "Continuing with original archive.", + path, + e, + ) + return path + def handle_file(self, path, audio_settings=None, video_settings=None): self.validate_archive(path) ext = extract_path_ext(path) + # Download external references and get processed directory + processed_path = self._process_external_refs(path) + # Create partial for reading & compressing subfiles file_converter = partial( self._read_and_compress_archive_file, @@ -208,7 +230,7 @@ def handle_file(self, path, audio_settings=None, video_settings=None): ) # create_predictable_zip will iterate over subfiles, call file_converter processed_zip_path = create_predictable_zip( - path, file_converter=file_converter if config.COMPRESS else None + processed_path, file_converter=file_converter if config.COMPRESS else None ) with self.write_file(ext) as fh: @@ -217,6 +239,8 @@ def handle_file(self, path, audio_settings=None, video_settings=None): # Clean up os.unlink(processed_zip_path) + if processed_path != path: + shutil.rmtree(processed_path, ignore_errors=True) @contextmanager def open_and_verify_archive(self, path): diff --git a/tests/pipeline/test_convert.py b/tests/pipeline/test_convert.py index abbdcf99..41f803f9 100644 --- a/tests/pipeline/test_convert.py +++ b/tests/pipeline/test_convert.py @@ -1,13 +1,26 @@ """Tests for audio and video compression in archive files.""" +import json import os import tempfile import zipfile from unittest.mock import patch +from ricecooker import config from ricecooker.classes.files import H5PFile from ricecooker.classes.files import HTMLZipFile +class MockResponse: + """Mock HTTP response for archive asset downloads.""" + + def __init__(self, content=b"downloaded content", status_code=200): + self.content = content + self.status_code = status_code + + def raise_for_status(self): + pass + + def test_html5_archive_with_mp4_compression(video_file, audio_file): """Test that MP4 and MP3 files within HTML5 archives are compressed when compression is enabled.""" # Create temporary HTML5 archive with media files @@ -120,3 +133,124 @@ def test_archive_no_compression_when_disabled(video_file, audio_file): finally: os.unlink(temp_archive.name) + + +def test_html5_archive_external_refs_downloaded(): + """External URLs in HTML5 archives are downloaded and rewritten.""" + temp_archive = tempfile.NamedTemporaryFile(suffix=".zip", delete=False) + temp_archive.close() + + try: + with zipfile.ZipFile(temp_archive.name, "w") as zf: + zf.writestr( + "index.html", + 'Content here ', + ) + + with patch("ricecooker.utils.archive_assets.make_request") as mock_request: + mock_request.return_value = MockResponse(content=b"fake-image-data") + + with patch("ricecooker.config.COMPRESS", False): + html_file = HTMLZipFile(temp_archive.name) + result = html_file.process_file() + + assert result is not None, "Processing should succeed" + + # Verify the output ZIP contains the downloaded file and rewritten HTML + result_path = config.get_storage_path(result) + with zipfile.ZipFile(result_path, "r") as zf: + names = zf.namelist() + assert any( + "_external/" in n for n in names + ), f"Expected _external/ directory in output ZIP, got: {names}" + + html = zf.read("index.html").decode("utf-8") + assert ( + "https://cdn.example.com/photo.jpg" not in html + ), "External URL should be rewritten" + assert "_external/" in html, "Should reference local _external/ path" + + finally: + os.unlink(temp_archive.name) + + +def test_h5p_archive_external_video_downloaded(): + """External video URLs in H5P content.json are downloaded and rewritten.""" + temp_archive = tempfile.NamedTemporaryFile(suffix=".h5p", delete=False) + temp_archive.close() + + content = json.dumps( + { + "video": { + "files": [ + { + "path": "https://h5p.org/sites/default/files/h5p/iv.mp4", + "mime": "video/mp4", + } + ] + } + } + ) + + try: + with zipfile.ZipFile(temp_archive.name, "w") as zf: + zf.writestr("h5p.json", '{"mainLibrary": "H5P.InteractiveVideo"}') + zf.writestr("content/content.json", content) + + with patch("ricecooker.utils.archive_assets.make_request") as mock_request: + mock_request.return_value = MockResponse(content=b"fake-video-data") + + with patch("ricecooker.config.COMPRESS", False): + h5p_file = H5PFile(temp_archive.name) + result = h5p_file.process_file() + + assert result is not None, "Processing should succeed" + + result_path = config.get_storage_path(result) + with zipfile.ZipFile(result_path, "r") as zf: + names = zf.namelist() + assert any( + "_external/" in n for n in names + ), f"Expected _external/ directory in output ZIP, got: {names}" + + data = json.loads(zf.read("content/content.json")) + video_path = data["video"]["files"][0]["path"] + assert "https://h5p.org" not in video_path, "External URL should be rewritten" + assert "_external/" in video_path, "Should reference local path" + + finally: + os.unlink(temp_archive.name) + + +def test_archive_external_refs_failure_graceful(): + """If external ref downloading fails, archive still processes successfully.""" + temp_archive = tempfile.NamedTemporaryFile(suffix=".zip", delete=False) + temp_archive.close() + + try: + with zipfile.ZipFile(temp_archive.name, "w") as zf: + # Use unique content to avoid cache hit from previous test + zf.writestr( + "index.html", + 'Unique failure test content ', + ) + + with patch( + "ricecooker.utils.pipeline.convert.download_and_rewrite_external_refs" + ) as mock_process: + mock_process.side_effect = Exception("Network error") + + with patch("ricecooker.config.COMPRESS", False): + html_file = HTMLZipFile(temp_archive.name) + result = html_file.process_file() + + assert result is not None, "Processing should succeed even on external ref failure" + + # Original URL should be preserved since processing failed + result_path = config.get_storage_path(result) + with zipfile.ZipFile(result_path, "r") as zf: + html = zf.read("index.html").decode("utf-8") + assert "https://cdn.example.com/photo.jpg" in html + + finally: + os.unlink(temp_archive.name) From 8a0641158d136742cfc94818ccb6b2d087cf351a Mon Sep 17 00:00:00 2001 From: rtibblesbot Date: Fri, 13 Feb 2026 07:14:01 -0800 Subject: [PATCH 4/4] Address review feedback: fix bare exceptions, linting, and formatting - Replace bare `except Exception` with specific exception types (OSError, zipfile.BadZipFile, ValueError, IOError) - Reduce McCabe complexity by extracting helper functions: - archive_assets.py: split scan/download/rewrite phases - url_utils.py: split HTML extract/rewrite into per-selector helpers - Fix import ordering (reorder-python-imports) and formatting (black) - Update test to use OSError instead of bare Exception Co-Authored-By: Claude Opus 4.6 --- ricecooker/utils/archive_assets.py | 173 +++++++++++++++------------ ricecooker/utils/pipeline/convert.py | 6 +- ricecooker/utils/url_utils.py | 121 +++++++++++-------- tests/pipeline/test_convert.py | 11 +- tests/test_archive_assets.py | 24 +--- tests/test_url_utils.py | 59 +++------ 6 files changed, 202 insertions(+), 192 deletions(-) diff --git a/ricecooker/utils/archive_assets.py b/ricecooker/utils/archive_assets.py index 1acf2d03..f464fde4 100644 --- a/ricecooker/utils/archive_assets.py +++ b/ricecooker/utils/archive_assets.py @@ -12,15 +12,13 @@ import zipfile from ricecooker.utils.downloader import make_request -from ricecooker.utils.url_utils import ( - derive_local_filename, - extract_urls_from_css, - extract_urls_from_h5p_json, - extract_urls_from_html, - rewrite_urls_in_css, - rewrite_urls_in_h5p_json, - rewrite_urls_in_html, -) +from ricecooker.utils.url_utils import derive_local_filename +from ricecooker.utils.url_utils import extract_urls_from_css +from ricecooker.utils.url_utils import extract_urls_from_h5p_json +from ricecooker.utils.url_utils import extract_urls_from_html +from ricecooker.utils.url_utils import rewrite_urls_in_css +from ricecooker.utils.url_utils import rewrite_urls_in_h5p_json +from ricecooker.utils.url_utils import rewrite_urls_in_html logger = logging.getLogger(__name__) @@ -88,31 +86,33 @@ def _download_external_url(url, dest_dir, local_path): with open(full_path, "wb") as f: f.write(response.content) return True - except Exception: + except (OSError, IOError, ValueError): logger.warning("Error downloading %s", url, exc_info=True) return False -def download_and_rewrite_external_refs(archive_path, url_blacklist=None): - """ - Process an archive to download external URL references and rewrite them - to local paths. - - Args: - archive_path: Path to the archive file (ZIP or H5P) - url_blacklist: Optional list of URL substrings to skip - - Returns: - Path to a temporary directory containing the processed archive contents. - The caller is responsible for cleaning up this directory. - """ - # Extract archive to temp directory - temp_dir = tempfile.mkdtemp(prefix="ricecooker_archive_") - - with zipfile.ZipFile(archive_path, "r") as zf: - zf.extractall(temp_dir) - - # Phase 1: Scan all text files for external URLs +def _extract_urls_from_file(full_path, rel_path, content_type): + """Extract external URLs from a single file. Returns list or None on error.""" + try: + with open(full_path, "r", encoding="utf-8") as f: + content = f.read() + except (UnicodeDecodeError, OSError): + logger.warning("Could not read %s as text, skipping", rel_path) + return None + + extractors = { + "html": extract_urls_from_html, + "css": extract_urls_from_css, + "json": extract_urls_from_h5p_json, + } + extractor = extractors.get(content_type) + if extractor is None: + return None + return extractor(content, rel_path) + + +def _scan_archive_for_urls(temp_dir, url_blacklist): + """Scan all text files in an extracted archive for external URLs.""" all_urls = {} # url -> derive_local_filename result file_urls = {} # filepath -> list of extracted URLs @@ -121,43 +121,27 @@ def download_and_rewrite_external_refs(archive_path, url_blacklist=None): full_path = os.path.join(root, filename) rel_path = os.path.relpath(full_path, temp_dir) content_type = _detect_content_type(rel_path) - if content_type is None: continue - try: - with open(full_path, "r", encoding="utf-8") as f: - content = f.read() - except (UnicodeDecodeError, OSError): - logger.warning("Could not read %s as text, skipping", rel_path) - continue - - if content_type == "html": - extracted = extract_urls_from_html(content, rel_path) - elif content_type == "css": - extracted = extract_urls_from_css(content, rel_path) - elif content_type == "json": - extracted = extract_urls_from_h5p_json(content, rel_path) - else: + extracted = _extract_urls_from_file(full_path, rel_path, content_type) + if extracted is None: continue - # Filter out blacklisted URLs external = [ - e - for e in extracted - if not _is_blacklisted(e.url, url_blacklist) + e for e in extracted if not _is_blacklisted(e.url, url_blacklist) ] - if external: file_urls[rel_path] = external for e in external: if e.url not in all_urls: all_urls[e.url] = derive_local_filename(e.url) - if not all_urls: - return temp_dir + return all_urls, file_urls - # Phase 2: Download all external URLs + +def _download_all_urls(temp_dir, all_urls, url_blacklist): + """Download all external URLs, including recursive CSS references.""" successful_downloads = set() visited_urls = set() @@ -168,8 +152,6 @@ def download_and_rewrite_external_refs(archive_path, url_blacklist=None): if _download_external_url(url, temp_dir, local_path): successful_downloads.add(url) - - # CSS recursive download: scan downloaded CSS for more external refs if local_path.endswith(".css") or "css" in local_path.split("?")[0]: _process_downloaded_css( temp_dir, @@ -180,8 +162,55 @@ def download_and_rewrite_external_refs(archive_path, url_blacklist=None): url_blacklist, ) - # Phase 3: Rewrite references in text files - url_map_by_file = {} + return successful_downloads + + +def _rewrite_file(temp_dir, rel_path, url_map): + """Rewrite URL references in a single file.""" + full_path = os.path.join(temp_dir, rel_path) + content_type = _detect_content_type(rel_path) + + with open(full_path, "r", encoding="utf-8") as f: + content = f.read() + + rewriters = { + "html": rewrite_urls_in_html, + "css": rewrite_urls_in_css, + "json": rewrite_urls_in_h5p_json, + } + rewriter = rewriters.get(content_type) + if rewriter: + content = rewriter(content, url_map) + + with open(full_path, "w", encoding="utf-8") as f: + f.write(content) + + +def download_and_rewrite_external_refs(archive_path, url_blacklist=None): + """ + Process an archive to download external URL references and rewrite them + to local paths. + + Args: + archive_path: Path to the archive file (ZIP or H5P) + url_blacklist: Optional list of URL substrings to skip + + Returns: + Path to a temporary directory containing the processed archive contents. + The caller is responsible for cleaning up this directory. + """ + temp_dir = tempfile.mkdtemp(prefix="ricecooker_archive_") + + with zipfile.ZipFile(archive_path, "r") as zf: + zf.extractall(temp_dir) + + all_urls, file_urls = _scan_archive_for_urls(temp_dir, url_blacklist) + + if not all_urls: + return temp_dir + + successful_downloads = _download_all_urls(temp_dir, all_urls, url_blacklist) + for rel_path, extracted_list in file_urls.items(): url_map = {} for e in extracted_list: @@ -189,30 +218,18 @@ def download_and_rewrite_external_refs(archive_path, url_blacklist=None): local_path = all_urls[e.url] url_map[e.url] = _compute_relative_path(rel_path, local_path) if url_map: - url_map_by_file[rel_path] = url_map - - for rel_path, url_map in url_map_by_file.items(): - full_path = os.path.join(temp_dir, rel_path) - content_type = _detect_content_type(rel_path) - - with open(full_path, "r", encoding="utf-8") as f: - content = f.read() - - if content_type == "html": - content = rewrite_urls_in_html(content, url_map) - elif content_type == "css": - content = rewrite_urls_in_css(content, url_map) - elif content_type == "json": - content = rewrite_urls_in_h5p_json(content, url_map) - - with open(full_path, "w", encoding="utf-8") as f: - f.write(content) + _rewrite_file(temp_dir, rel_path, url_map) return temp_dir def _process_downloaded_css( - temp_dir, css_local_path, all_urls, successful_downloads, visited_urls, url_blacklist + temp_dir, + css_local_path, + all_urls, + successful_downloads, + visited_urls, + url_blacklist, ): """Scan a downloaded CSS file for additional external references and download them.""" full_path = os.path.join(temp_dir, css_local_path) @@ -223,9 +240,7 @@ def _process_downloaded_css( return extracted = extract_urls_from_css(css_content, css_local_path) - external = [ - e for e in extracted if not _is_blacklisted(e.url, url_blacklist) - ] + external = [e for e in extracted if not _is_blacklisted(e.url, url_blacklist)] if not external: return diff --git a/ricecooker/utils/pipeline/convert.py b/ricecooker/utils/pipeline/convert.py index fef557e7..faf533cc 100644 --- a/ricecooker/utils/pipeline/convert.py +++ b/ricecooker/utils/pipeline/convert.py @@ -2,6 +2,7 @@ To avoid making the pipeline overly convoluted, these handlers both validate and convert files. """ + import json import os import shutil @@ -31,6 +32,7 @@ from .file_handler import StageHandler from ricecooker import config from ricecooker.exceptions import UnknownFileTypeError +from ricecooker.utils.archive_assets import download_and_rewrite_external_refs from ricecooker.utils.audio import AudioCompressionError from ricecooker.utils.audio import compress_audio from ricecooker.utils.caching import generate_key @@ -46,10 +48,8 @@ from ricecooker.utils.videos import validate_media_file from ricecooker.utils.videos import VideoCompressionError from ricecooker.utils.youtube import get_language_with_alpha2_fallback -from ricecooker.utils.archive_assets import download_and_rewrite_external_refs from ricecooker.utils.zip import create_predictable_zip - CONVERTIBLE_FORMATS = {p.id: p.convertible_formats for p in format_presets.PRESETLIST} @@ -204,7 +204,7 @@ def _process_external_refs(self, path): """ try: return download_and_rewrite_external_refs(path) - except Exception as e: + except (OSError, zipfile.BadZipFile, ValueError) as e: config.LOGGER.warning( "Failed to process external references in %s: %s. " "Continuing with original archive.", diff --git a/ricecooker/utils/url_utils.py b/ricecooker/utils/url_utils.py index b2aa913b..62055d66 100644 --- a/ricecooker/utils/url_utils.py +++ b/ricecooker/utils/url_utils.py @@ -113,28 +113,8 @@ def _parse_srcset(srcset_value): return urls -def extract_urls_from_html(html_content, source_file=""): - """ - Extract all external URL references from HTML content. - - Finds URLs in: - - img[src], script[src], source[src] - - img[srcset], source[srcset] - - link[rel=stylesheet][href] - - inline style attributes with url() - -