diff --git a/ricecooker/utils/pipeline/convert.py b/ricecooker/utils/pipeline/convert.py index f1dab505..816871e4 100644 --- a/ricecooker/utils/pipeline/convert.py +++ b/ricecooker/utils/pipeline/convert.py @@ -35,6 +35,7 @@ from ricecooker.utils.audio import AudioCompressionError from ricecooker.utils.audio import compress_audio from ricecooker.utils.caching import generate_key +from ricecooker.utils.pipeline.context import ContentNodeMetadata from ricecooker.utils.pipeline.context import ContextMetadata from ricecooker.utils.pipeline.context import FileMetadata from ricecooker.utils.pipeline.exceptions import InvalidFileException @@ -235,15 +236,15 @@ def read_file_from_archive(self, zf, filepath): f"File {zf.filename} is not a valid {self.FILE_TYPE} file, {filepath} is missing." ) - def _validate_index_html_body(self, zf, path): - """Validate that index.html exists and has a non-empty body.""" - index_html = self.read_file_from_archive(zf, "index.html") + def _validate_index_html_body(self, zf, path, index_path="index.html"): + """Validate that the entry HTML exists and has a non-empty body.""" + index_html = self.read_file_from_archive(zf, index_path) try: dom = html5lib.parse(index_html, namespaceHTMLElements=False) body = dom.find("body") if body is None: raise InvalidFileException( - f"File {path} is not a valid {self.FILE_TYPE} file, index.html is missing a body element." + f"File {path} is not a valid {self.FILE_TYPE} file, {index_path} is missing a body element." ) # Check that the body has at least one child element # for some reason it seems like comments don't get a string tag attribute @@ -252,12 +253,12 @@ def _validate_index_html_body(self, zf, path): ] if not (body.text and body.text.strip()) and not body_children: raise InvalidFileException( - f"File {path} is not a valid {self.FILE_TYPE} file, index.html is empty." + f"File {path} is not a valid {self.FILE_TYPE} file, {index_path} is empty." ) return dom except ParseError: raise InvalidFileException( - f"File {path} is not a valid {self.FILE_TYPE} file, index.html is not well-formed." + f"File {path} is not a valid {self.FILE_TYPE} file, {index_path} is not well-formed." ) def _read_and_compress_archive_file( @@ -308,13 +309,118 @@ def _read_and_compress_archive_file( return reader(filepath) +def _find_common_root(names): + """Return the common parent directory shared by all file paths. + + Ported from Studio's ``findCommonRoot`` (frontend/shared/utils/zipFile.js). + ``names`` are POSIX-style, non-directory archive member paths. + """ + paths = [n.split("/")[:-1] for n in names] + if not paths: + return "" + if len(paths) == 1: + return "/".join(paths[0]) + first = paths[0] + common = [] + for i, part in enumerate(first): + for other in paths[1:]: + if i >= len(other) or other[i] != part: + return "/".join(common) + common.append(part) + return "/".join(common) + + +def _find_entry_html(names): + """Return the archive member that is the HTML entry point, or None. + + Ported from Studio's ``findFirstHtml`` (frontend/shared/utils/zipFile.js): + prefer ``index.html`` at the common-root-stripped root, then any + ``index.html``, then the shallowest / shortest-named ``.html`` file. + """ + html_files = [n for n in names if n.lower().endswith(".html")] + if not html_files: + return None + common_root = _find_common_root(names) + prefix = common_root + "/" if common_root else "" + normalized = [ + (n, n[len(prefix) :] if prefix and n.startswith(prefix) else n) + for n in html_files + ] + for original, norm in normalized: + if norm == "index.html": + return original + for original, norm in normalized: + if norm.split("/")[-1] == "index.html": + return original + normalized.sort(key=lambda t: (t[1].count("/"), len(t[1]))) + return normalized[0][0] + + class HTML5ConversionHandler(ArchiveProcessingBaseHandler): EXTENSIONS = {file_formats.HTML5} FILE_TYPE = "HTML5" + def handle_file(self, path, audio_settings=None, video_settings=None): + prepared_path, entry = self._prepare_archive(path) + try: + super().handle_file( + prepared_path, + audio_settings=audio_settings, + video_settings=video_settings, + ) + finally: + if prepared_path != path and os.path.exists(prepared_path): + os.unlink(prepared_path) + # Mirror Studio: when the entry point is not index.html at the root, + # record it in extra_fields.options.entry so Kolibri loads it. + if entry and entry != "index.html": + return FileMetadata( + content_node_metadata=ContentNodeMetadata( + extra_fields={"options": {"entry": entry}} + ) + ) + return None + def validate_archive(self, path: str): with self.open_and_verify_archive(path) as zf: - self._validate_index_html_body(zf, path) + names = [n for n in zf.namelist() if not n.endswith("/")] + entry = _find_entry_html(names) + if entry is None: + raise InvalidFileException( + f"File {path} is not a valid {self.FILE_TYPE} file, " + "no HTML file was found in the archive." + ) + self._validate_index_html_body(zf, path, entry) + + def _prepare_archive(self, path): + """Denest a zip whose files all share a common parent directory + (mirroring Studio's ``cleanHTML5Zip``), and return the path to use + along with the detected HTML entry point. + + Returns ``(path, entry)`` unchanged when there is nothing to strip; + otherwise returns the path to a denested temporary zip. + """ + try: + with zipfile.ZipFile(path) as zf: + names = [n for n in zf.namelist() if not n.endswith("/")] + except zipfile.BadZipFile: + return path, None # let validate_archive raise the standard error + + common_root = _find_common_root(names) + if not common_root: + return path, _find_entry_html(names) + + prefix = common_root + "/" + with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp: + tmp_path = tmp.name + with ( + zipfile.ZipFile(path) as zin, + zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED) as zout, + ): + for name in names: + zout.writestr(name[len(prefix) :], zin.read(name)) + denested_names = [n[len(prefix) :] for n in names] + return tmp_path, _find_entry_html(denested_names) class H5PConversionHandler(ArchiveProcessingBaseHandler): diff --git a/tests/conftest.py b/tests/conftest.py index 7790612b..a7fabd1a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -731,15 +731,21 @@ def html_invalid_files(html_data, document_file): @pytest.fixture def html_invalid_file(): + """ + Invalid because the only HTML file in the zip has an empty body. + """ local_path = os.path.abspath( os.path.join( - os.path.dirname(__file__), "testcontent", "generated", "testinvalidhtml.zip" + os.path.dirname(__file__), + "testcontent", + "generated", + "testinvalidemptyhtml.zip", ) ) if not os.path.exists(local_path): with zipfile.ZipFile(local_path, "w", zipfile.ZIP_DEFLATED) as archive: - archive.writestr("notindex.html", "
") + archive.writestr("index.html", "") return HTMLZipFile(local_path) diff --git a/tests/pipeline/test_convert.py b/tests/pipeline/test_convert.py index 8237dbc9..40efc615 100644 --- a/tests/pipeline/test_convert.py +++ b/tests/pipeline/test_convert.py @@ -9,6 +9,8 @@ from ricecooker.classes.files import H5PFile from ricecooker.classes.files import HTMLZipFile +from ricecooker.utils.pipeline.convert import _find_common_root +from ricecooker.utils.pipeline.convert import _find_entry_html from ricecooker.utils.pipeline.convert import HTML5ConversionHandler from ricecooker.utils.pipeline.convert import KPUBConversionHandler from ricecooker.utils.pipeline.exceptions import InvalidFileException @@ -170,6 +172,86 @@ def test_body_with_text_only_accepted(self): self._validate({"index.html": "Hello world"}) +class TestHTML5EntryPoint: + """Tests for HTML entry point detection and zip denesting, + mirroring Studio's findFirstHtml/cleanHTML5Zip behavior.""" + + VALID_HTML = "Hello
" + + def _execute(self, files): + """Create an HTML5 archive with given files and run the handler.""" + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "test.zip") + _create_archive(path, files) + return HTML5ConversionHandler().execute(path, skip_cache=True) + + def test_find_common_root(self): + assert _find_common_root([]) == "" + assert _find_common_root(["index.html"]) == "" + assert _find_common_root(["dist/index.html"]) == "dist" + assert _find_common_root(["a/b/x.html", "a/b/y.css"]) == "a/b" + assert _find_common_root(["a/b/x.html", "a/c/y.css"]) == "a" + assert _find_common_root(["a/x.html", "y.css"]) == "" + + def test_find_entry_html(self): + # index.html at the root is preferred + assert _find_entry_html(["other.html", "index.html"]) == "index.html" + # then index.html relative to the common root + assert ( + _find_entry_html(["dist/other.html", "dist/index.html"]) + == "dist/index.html" + ) + # then any index.html + assert _find_entry_html(["main.html", "sub/index.html"]) == "sub/index.html" + # then the shallowest html file + assert _find_entry_html(["b/page.html", "a.html"]) == "a.html" + # no html files at all + assert _find_entry_html(["style.css", "script.js"]) is None + + def test_no_html_file_rejected(self): + with pytest.raises(InvalidFileException, match="(?i)no HTML file"): + self._execute({"script.js": "console.log('hello');"}) + + def test_non_index_entry_accepted_and_recorded(self): + results = self._execute( + {"app.html": self.VALID_HTML, "script.js": "console.log('hello');"} + ) + assert results[0].content_node_metadata.extra_fields == { + "options": {"entry": "app.html"} + } + + def test_non_index_entry_body_validated(self): + with pytest.raises(InvalidFileException, match="(?i)empty"): + self._execute({"app.html": ""}) + + def test_root_index_entry_not_recorded(self): + results = self._execute({"index.html": self.VALID_HTML}) + assert results[0].content_node_metadata is None + + def test_nested_archive_denested(self): + results = self._execute( + { + "dist/index.html": self.VALID_HTML, + "dist/css/style.css": "body { color: red; }", + } + ) + # The common root is stripped, so index.html ends up at the root + # and no entry point needs to be recorded. + assert results[0].content_node_metadata is None + with zipfile.ZipFile(results[0].path) as zf: + names = set(zf.namelist()) + assert "index.html" in names + assert "css/style.css" in names + + def test_nested_non_index_entry_denested_and_recorded(self): + results = self._execute({"dist/app.html": self.VALID_HTML}) + assert results[0].content_node_metadata.extra_fields == { + "options": {"entry": "app.html"} + } + with zipfile.ZipFile(results[0].path) as zf: + assert "app.html" in zf.namelist() + + class TestKPUBValidation: """Tests for KPUBConversionHandler validation.""" diff --git a/tests/test_files.py b/tests/test_files.py index 99e470d4..7ceb42d3 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -593,7 +593,20 @@ def valid_zip(): @pytest.fixture def invalid_zip(): - # Create a temporary zip file without index.html + # Create a temporary zip file without any HTML file + fd, path = tempfile.mkstemp(suffix=".zip") + os.close(fd) + + with zipfile.ZipFile(path, "w") as zf: + zf.writestr("script.js", "console.log('test');") + + yield path + os.unlink(path) + + +@pytest.fixture +def non_index_entry_zip(): + # Create a temporary zip file whose only HTML file is not index.html fd, path = tempfile.mkstemp(suffix=".zip") os.close(fd) @@ -631,16 +644,27 @@ def test_invalid_htmlzip_validation(invalid_zip): assert html_file.filename is None assert html_file.error is not None - assert "index.html" in html_file.error + assert "no HTML file" in html_file.error + + +def test_non_index_entry_htmlzip_validation(non_index_entry_zip): + # A zip whose only HTML file is not index.html is valid; the entry + # point is detected and recorded in extra_fields.options.entry. + html_file = HTMLZipFile(non_index_entry_zip) + html_file.process_file() + + assert html_file.filename is not None + assert html_file.error is None def test_nested_index_htmlzip_validation(nested_index_zip): + # A zip whose files all live under a common parent directory is + # denested so that index.html ends up at the root. html_file = HTMLZipFile(nested_index_zip) html_file.process_file() - assert html_file.filename is None - assert html_file.error is not None - assert "index.html" in html_file.error + assert html_file.filename is not None + assert html_file.error is None @pytest.mark.skip(