diff --git a/ricecooker/utils/archive_assets.py b/ricecooker/utils/archive_assets.py new file mode 100644 index 00000000..f464fde4 --- /dev/null +++ b/ricecooker/utils/archive_assets.py @@ -0,0 +1,271 @@ +""" +Archive external reference processor. + +Opens an archive (ZIP/H5P), scans text-based files for external URL references, +downloads those resources, bundles them into the archive, and rewrites references +to point to local copies. +""" + +import logging +import os +import tempfile +import zipfile + +from ricecooker.utils.downloader import make_request +from ricecooker.utils.url_utils import derive_local_filename +from ricecooker.utils.url_utils import extract_urls_from_css +from ricecooker.utils.url_utils import extract_urls_from_h5p_json +from ricecooker.utils.url_utils import extract_urls_from_html +from ricecooker.utils.url_utils import rewrite_urls_in_css +from ricecooker.utils.url_utils import rewrite_urls_in_h5p_json +from ricecooker.utils.url_utils import rewrite_urls_in_html + +logger = logging.getLogger(__name__) + +# Map file extensions to content type for selecting the right extractor/rewriter +_TEXT_EXTENSIONS = { + ".html": "html", + ".htm": "html", + ".xhtml": "html", + ".xml": "html", + ".css": "css", + ".json": "json", +} + + +def _is_h5p_content_json(filepath): + """Check if a JSON file is an H5P content.json that should be scanned.""" + normalized = filepath.replace("\\", "/") + return normalized == "content/content.json" or normalized.endswith( + "/content/content.json" + ) + + +def _detect_content_type(filepath): + """Detect the content type of a file based on its extension.""" + ext = os.path.splitext(filepath)[1].lower() + if ext == ".json": + if _is_h5p_content_json(filepath): + return "json" + return None # Skip non-H5P JSON files + return _TEXT_EXTENSIONS.get(ext) + + +def _compute_relative_path(from_file, to_file): + """Compute relative path from one file to another within the archive.""" + from_dir = os.path.dirname(from_file) + 
def _download_external_url(url, dest_dir, local_path):
    """
    Download a single external URL to ``local_path`` inside ``dest_dir``.

    Args:
        url: The external URL to fetch (passed to ``make_request``).
        dest_dir: Root directory the download must stay inside.
        local_path: Destination path relative to ``dest_dir``.

    Returns:
        True on success, False on failure. A path that resolves outside
        ``dest_dir``, a missing or non-200 response, and any OSError or
        ValueError raised while creating directories, requesting or writing
        are all logged as warnings and reported as False.
    """
    full_path = os.path.join(dest_dir, local_path)
    # Guard against path traversal — resolved path must stay within dest_dir
    resolved = os.path.realpath(full_path)
    if not resolved.startswith(os.path.realpath(dest_dir) + os.sep):
        logger.warning("Path traversal detected for %s, skipping download", url)
        return False

    try:
        # Directory creation can fail as well (e.g. part of the path already
        # exists as a regular file), so it lives inside the try block: one bad
        # URL must yield False instead of aborting the whole archive.
        os.makedirs(os.path.dirname(full_path), exist_ok=True)
        response = make_request(url)
        if response is None or response.status_code != 200:
            logger.warning("Failed to download %s (no response or non-200)", url)
            return False
        with open(full_path, "wb") as f:
            f.write(response.content)
        return True
    except (OSError, ValueError):
        # IOError is an alias of OSError on Python 3, so it is not listed.
        logger.warning("Error downloading %s", url, exc_info=True)
        return False
def _download_all_urls(temp_dir, all_urls, url_blacklist):
    """
    Fetch every URL in ``all_urls`` into ``temp_dir``.

    Downloads whose local path looks like CSS are handed to
    ``_process_downloaded_css`` so the references inside them are fetched too.

    Args:
        temp_dir: Directory the archive was extracted into.
        all_urls: Mapping of URL -> local path; ``_process_downloaded_css``
            adds entries to it while this function runs.
        url_blacklist: URL substrings to skip, forwarded to the CSS pass.

    Returns:
        The set of URLs that were downloaded successfully.
    """
    downloaded = set()
    seen = set()

    # Walk a snapshot, because the CSS pass inserts new keys into all_urls.
    for url, local_path in tuple(all_urls.items()):
        if url in seen:
            # Already handled while processing an earlier stylesheet.
            continue
        seen.add(url)

        if not _download_external_url(url, temp_dir, local_path):
            continue
        downloaded.add(url)

        # NOTE(review): any path containing "css" (before the query string)
        # counts as a stylesheet, which looks broader than intended — confirm.
        path_without_query = local_path.split("?")[0]
        looks_like_css = local_path.endswith(".css") or "css" in path_without_query
        if looks_like_css:
            _process_downloaded_css(
                temp_dir,
                local_path,
                all_urls,
                downloaded,
                seen,
                url_blacklist,
            )

    return downloaded
def download_and_rewrite_external_refs(archive_path, url_blacklist=None):
    """
    Process an archive to download external URL references and rewrite them
    to local paths.

    Args:
        archive_path: Path to the archive file (ZIP or H5P)
        url_blacklist: Optional list of URL substrings to skip

    Returns:
        Path to a temporary directory containing the processed archive contents.
        The caller is responsible for cleaning up this directory.

    Raises:
        Whatever extraction, scanning or rewriting raises (for example
        ``zipfile.BadZipFile`` or ``OSError``). The temporary directory is
        removed before the exception propagates, because the caller never
        receives its path and could not clean it up.
    """
    import shutil  # local import: only needed for cleanup on failure

    temp_dir = tempfile.mkdtemp(prefix="ricecooker_archive_")
    try:
        with zipfile.ZipFile(archive_path, "r") as zf:
            zf.extractall(temp_dir)

        all_urls, file_urls = _scan_archive_for_urls(temp_dir, url_blacklist)
        if not all_urls:
            return temp_dir

        successful_downloads = _download_all_urls(temp_dir, all_urls, url_blacklist)

        for rel_path, extracted_list in file_urls.items():
            # Only rewrite references whose target was actually fetched.
            url_map = {
                e.url: _compute_relative_path(rel_path, all_urls[e.url])
                for e in extracted_list
                if e.url in successful_downloads
            }
            if url_map:
                _rewrite_file(temp_dir, rel_path, url_map)
    except BaseException:
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise

    return temp_dir
f.read() + except (UnicodeDecodeError, OSError): + return + + extracted = extract_urls_from_css(css_content, css_local_path) + external = [e for e in extracted if not _is_blacklisted(e.url, url_blacklist)] + + if not external: + return + + # Download newly found external URLs + css_url_map = {} + for e in external: + if e.url in visited_urls: + continue + visited_urls.add(e.url) + + local_path = derive_local_filename(e.url) + all_urls[e.url] = local_path + + if _download_external_url(e.url, temp_dir, local_path): + successful_downloads.add(e.url) + css_url_map[e.url] = _compute_relative_path(css_local_path, local_path) + + # Also build map for any already-downloaded URLs referenced from this CSS + for e in external: + if e.url in successful_downloads and e.url not in css_url_map: + local_path = all_urls[e.url] + css_url_map[e.url] = _compute_relative_path(css_local_path, local_path) + + if css_url_map: + rewritten = rewrite_urls_in_css(css_content, css_url_map) + with open(full_path, "w", encoding="utf-8") as f: + f.write(rewritten) diff --git a/ricecooker/utils/pipeline/convert.py b/ricecooker/utils/pipeline/convert.py index 4c9e209b..faf533cc 100644 --- a/ricecooker/utils/pipeline/convert.py +++ b/ricecooker/utils/pipeline/convert.py @@ -2,6 +2,7 @@ To avoid making the pipeline overly convoluted, these handlers both validate and convert files. 
def _process_external_refs(self, path):
    """
    Process external URL references in the archive at ``path``.

    Delegates to ``download_and_rewrite_external_refs``.

    Returns:
        The path to process. On success this is the temporary directory
        returned by ``download_and_rewrite_external_refs``, holding the
        extracted archive contents. If that call raises OSError,
        zipfile.BadZipFile or ValueError, a warning is logged and the
        original ``path`` is returned instead.

    NOTE(review): when the archive contains no external references the
    helper still returns a temporary directory, not the original path, so
    callers should compare the result with ``path`` to decide on cleanup.
    """
    try:
        return download_and_rewrite_external_refs(path)
    except (OSError, zipfile.BadZipFile, ValueError) as e:
        # Best effort: a failure here must not block conversion of the archive.
        config.LOGGER.warning(
            "Failed to process external references in %s: %s. "
            "Continuing with original archive.",
            path,
            e,
        )
        return path
def derive_local_filename(url):
    """
    Derive a deterministic local filename from an external URL.

    The result is ``_external/<netloc>/<path>`` joined with forward slashes on
    every platform, so it can be used both as an archive member name and as
    the basis for a relative URL.

    Empty, ``.`` and ``..`` path segments are dropped. A URL whose path is
    empty or ends in ``/`` gets ``index.html`` as its filename, because a
    directory-like name could never be written as a file.

    Example:
        'https://fonts.example.com/font.woff2'
        -> '_external/fonts.example.com/font.woff2'
        'https://example.com/'
        -> '_external/example.com/index.html'

    NOTE(review): the query string is appended verbatim after a literal
    ``?``, so the result can contain characters that are awkward in filenames
    and in relative URLs — confirm how the rewriters reference such files.
    """
    import posixpath  # local import: keeps the join independent of the host OS

    parsed = urlparse(url)
    # Strip traversal and no-op segments to prevent writing outside the archive
    segments = [s for s in parsed.path.split("/") if s not in ("", ".", "..")]
    if not segments or parsed.path.endswith("/"):
        segments.append("index.html")
    path = "/".join(segments)
    if parsed.query:
        path = path + "?" + parsed.query
    return posixpath.join("_external", parsed.netloc, path)
+ """ + results = [] + seen = set() + + # First pass: url() references + for match in _CSS_URL_RE.finditer(css_content): + url = match.group(1).strip() + if url and is_external_url(url) and url not in seen: + seen.add(url) + results.append( + ExtractedURL(url=url, source_file=source_file, context="css_url") + ) + + # Second pass: bare @import strings (not url() form) + for match in _CSS_IMPORT_RE.finditer(css_content): + url = match.group(1).strip() + if url and is_external_url(url) and url not in seen: + seen.add(url) + results.append( + ExtractedURL(url=url, source_file=source_file, context="css_import") + ) + + return results + + +def _parse_srcset(srcset_value): + """Parse an HTML srcset attribute value into a list of URLs.""" + urls = [] + for entry in srcset_value.split(","): + entry = entry.strip() + if entry: + # srcset entries are "url descriptor" e.g. "img.jpg 300w" + parts = entry.split() + if parts: + urls.append(parts[0]) + return urls + + +def _extract_src_urls(soup, source_file, seen, results): + """Extract external URLs from src attributes on img, script, source tags.""" + for tag_name in ("img", "script", "source"): + for node in soup.find_all(tag_name, src=True): + url = node["src"] + if is_external_url(url) and url not in seen: + seen.add(url) + results.append( + ExtractedURL( + url=url, + source_file=source_file, + context="html_attr", + tag=tag_name, + attr="src", + ) + ) + + +def _extract_srcset_urls(soup, source_file, seen, results): + """Extract external URLs from srcset attributes on img, source tags.""" + for tag_name in ("img", "source"): + for node in soup.find_all(tag_name, srcset=True): + for url in _parse_srcset(node["srcset"]): + if is_external_url(url) and url not in seen: + seen.add(url) + results.append( + ExtractedURL( + url=url, + source_file=source_file, + context="html_srcset", + tag=tag_name, + attr="srcset", + ) + ) + + +def _extract_stylesheet_urls(soup, source_file, seen, results): + """Extract external URLs from 
def _extract_style_urls(soup, source_file, seen, results):
    """
    Collect external URLs referenced from CSS embedded in the document.

    Scans every ``style`` attribute first, then the text of every ``<style>``
    element that exposes a ``.string``. Each CSS snippet goes through
    ``extract_urls_from_css``; URLs not yet in ``seen`` are added to ``seen``
    and appended to ``results`` (both are mutated in place).
    """
    css_snippets = [node.get("style", "") for node in soup.find_all(style=True)]
    css_snippets.extend(
        style_node.string for style_node in soup.find_all("style") if style_node.string
    )

    for css_text in css_snippets:
        for found in extract_urls_from_css(css_text, source_file):
            if found.url in seen:
                continue
            seen.add(found.url)
            results.append(found)
class TestExtractUrlsFromH5PJSON:
    """Checks for extract_urls_from_h5p_json on serialized H5P content."""

    def test_extract_external_path(self):
        video_url = "https://h5p.org/sites/default/files/h5p/iv.mp4"
        payload = json.dumps({"video": {"files": [{"path": video_url}]}})
        found = extract_urls_from_h5p_json(payload, "content.json")
        assert len(found) == 1
        first = found[0]
        assert first.url == video_url
        assert first.context == "h5p_json"
        assert first.source_file == "content.json"

    def test_relative_path_ignored(self):
        payload = json.dumps({"image": {"path": "images/photo.jpg"}})
        found = extract_urls_from_h5p_json(payload, "content.json")
        assert len(found) == 0

    def test_deeply_nested(self):
        deep_url = "https://cdn.example.com/deep/resource.mp4"
        tree = {"level1": {"level2": {"level3": [{"path": deep_url}]}}}
        found = extract_urls_from_h5p_json(json.dumps(tree), "content.json")
        assert len(found) == 1
        assert found[0].url == deep_url

    def test_multiple_paths(self):
        tree = {
            "video": {"path": "https://example.com/video.mp4"},
            "image": {"path": "https://example.com/image.jpg"},
            "local": {"path": "images/local.jpg"},
        }
        found = extract_urls_from_h5p_json(json.dumps(tree), "content.json")
        assert len(found) == 2
        found_urls = {item.url for item in found}
        assert "https://example.com/video.mp4" in found_urls
        assert "https://example.com/image.jpg" in found_urls

    def test_empty_json(self):
        found = extract_urls_from_h5p_json("{}", "content.json")
        assert len(found) == 0

    def test_non_string_path_ignored(self):
        payload = json.dumps({"path": 42})
        found = extract_urls_from_h5p_json(payload, "content.json")
        assert len(found) == 0
background: url('https://example.com/bg.png') } + .other { background: url('https://other.com/bg.png') } + """ + url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"} + result = rewrite_urls_in_css(css, url_map) + assert "_external/example.com/bg.png" in result + assert "https://other.com/bg.png" in result + + def test_rewrite_url_no_quotes(self): + css = "body { background: url(https://example.com/bg.png) }" + url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"} + result = rewrite_urls_in_css(css, url_map) + assert "_external/example.com/bg.png" in result + + def test_empty_map(self): + css = "body { background: url('https://example.com/bg.png') }" + result = rewrite_urls_in_css(css, {}) + assert "https://example.com/bg.png" in result + + +# --------------------------------------------------------------------------- +# rewrite_urls_in_html tests +# --------------------------------------------------------------------------- + + +class TestRewriteUrlsInHTML: + def test_rewrite_img_src(self): + html = '' + url_map = { + "https://cdn.example.com/photo.jpg": "_external/cdn.example.com/photo.jpg" + } + result = rewrite_urls_in_html(html, url_map) + assert "_external/cdn.example.com/photo.jpg" in result + assert "https://cdn.example.com/photo.jpg" not in result + + def test_rewrite_srcset(self): + html = '' + url_map = { + "https://cdn.example.com/img-300.jpg": "_external/cdn.example.com/img-300.jpg", + "https://cdn.example.com/img-600.jpg": "_external/cdn.example.com/img-600.jpg", + } + result = rewrite_urls_in_html(html, url_map) + assert "_external/cdn.example.com/img-300.jpg" in result + assert "_external/cdn.example.com/img-600.jpg" in result + + def test_rewrite_style_block(self): + html = "" + url_map = {"https://example.com/bg.png": "_external/example.com/bg.png"} + result = rewrite_urls_in_html(html, url_map) + assert "_external/example.com/bg.png" in result + assert "https://example.com/bg.png" not in result + + def 
test_rewrite_inline_style(self): + html = """
text
class TestRewriteUrlsInH5PJSON:
    """Checks for rewrite_urls_in_h5p_json: mapped paths change, others stay."""

    def test_rewrite_path(self):
        remote = "https://h5p.org/sites/default/files/h5p/iv.mp4"
        local = "_external/h5p.org/sites/default/files/h5p/iv.mp4"
        payload = json.dumps({"video": {"path": remote}})
        rewritten = json.loads(rewrite_urls_in_h5p_json(payload, {remote: local}))
        assert rewritten["video"]["path"] == local

    def test_rewrite_preserves_unmapped(self):
        payload = json.dumps(
            {
                "video": {"path": "https://example.com/video.mp4"},
                "image": {"path": "https://other.com/image.jpg"},
            }
        )
        mapping = {"https://example.com/video.mp4": "_external/example.com/video.mp4"}
        rewritten = json.loads(rewrite_urls_in_h5p_json(payload, mapping))
        assert rewritten["video"]["path"] == "_external/example.com/video.mp4"
        assert rewritten["image"]["path"] == "https://other.com/image.jpg"

    def test_rewrite_deeply_nested(self):
        payload = json.dumps({"a": {"b": [{"path": "https://example.com/deep.mp4"}]}})
        mapping = {"https://example.com/deep.mp4": "_external/example.com/deep.mp4"}
        rewritten = json.loads(rewrite_urls_in_h5p_json(payload, mapping))
        assert rewritten["a"]["b"][0]["path"] == "_external/example.com/deep.mp4"

    def test_rewrite_relative_path_unchanged(self):
        payload = json.dumps({"image": {"path": "images/photo.jpg"}})
        mapping = {"https://example.com/video.mp4": "_external/example.com/video.mp4"}
        rewritten = json.loads(rewrite_urls_in_h5p_json(payload, mapping))
        assert rewritten["image"]["path"] == "images/photo.jpg"

    def test_empty_map(self):
        payload = json.dumps({"video": {"path": "https://example.com/video.mp4"}})
        rewritten = json.loads(rewrite_urls_in_h5p_json(payload, {}))
        assert rewritten["video"]["path"] == "https://example.com/video.mp4"