learningequality · rtibbles · Jun 4, 2026
diff --git a/ricecooker/utils/pipeline/convert.py b/ricecooker/utils/pipeline/convert.py
@@ -35,6 +35,7 @@
 from ricecooker.utils.audio import AudioCompressionError
 from ricecooker.utils.audio import compress_audio
 from ricecooker.utils.caching import generate_key
+from ricecooker.utils.pipeline.context import ContentNodeMetadata
 from ricecooker.utils.pipeline.context import ContextMetadata
 from ricecooker.utils.pipeline.context import FileMetadata
 from ricecooker.utils.pipeline.exceptions import InvalidFileException
@@ -235,15 +236,15 @@ def read_file_from_archive(self, zf, filepath):
                 f"File {zf.filename} is not a valid {self.FILE_TYPE} file, {filepath} is missing."
             )
 
-    def _validate_index_html_body(self, zf, path):
-        """Validate that index.html exists and has a non-empty body."""
-        index_html = self.read_file_from_archive(zf, "index.html")
+    def _validate_index_html_body(self, zf, path, index_path="index.html"):
+        """Validate that the entry HTML exists and has a non-empty body."""
+        index_html = self.read_file_from_archive(zf, index_path)
         try:
             dom = html5lib.parse(index_html, namespaceHTMLElements=False)
             body = dom.find("body")
             if body is None:
                 raise InvalidFileException(
-                    f"File {path} is not a valid {self.FILE_TYPE} file, index.html is missing a body element."
+                    f"File {path} is not a valid {self.FILE_TYPE} file, {index_path} is missing a body element."
                 )
             # Check that the body has at least one child element
             # for some reason it seems like comments don't get a string tag attribute
@@ -252,12 +253,12 @@ def _validate_index_html_body(self, zf, path):
             ]
             if not (body.text and body.text.strip()) and not body_children:
                 raise InvalidFileException(
-                    f"File {path} is not a valid {self.FILE_TYPE} file, index.html is empty."
+                    f"File {path} is not a valid {self.FILE_TYPE} file, {index_path} is empty."
                 )
             return dom
         except ParseError:
             raise InvalidFileException(
-                f"File {path} is not a valid {self.FILE_TYPE} file, index.html is not well-formed."
+                f"File {path} is not a valid {self.FILE_TYPE} file, {index_path} is not well-formed."
             )
 
     def _read_and_compress_archive_file(
@@ -308,13 +309,118 @@ def _read_and_compress_archive_file(
         return reader(filepath)
 
 
+def _find_common_root(names):
+    """Return the common parent directory shared by all file paths.
+
+    Ported from Studio's ``findCommonRoot`` (frontend/shared/utils/zipFile.js).
+    ``names`` are POSIX-style, non-directory archive member paths.
+    """
+    paths = [n.split("/")[:-1] for n in names]
+    if not paths:
+        return ""
+    if len(paths) == 1:
+        return "/".join(paths[0])
+    first = paths[0]
+    common = []
+    for i, part in enumerate(first):
+        for other in paths[1:]:
+            if i >= len(other) or other[i] != part:
+                return "/".join(common)
+        common.append(part)
+    return "/".join(common)
+
+
+def _find_entry_html(names):
+    """Return the archive member that is the HTML entry point, or None.
+
+    Ported from Studio's ``findFirstHtml`` (frontend/shared/utils/zipFile.js):
+    prefer ``index.html`` at the common-root-stripped root, then any
+    ``index.html``, then the shallowest / shortest-named ``.html`` file.
+    """
+    html_files = [n for n in names if n.lower().endswith(".html")]
+    if not html_files:
+        return None
+    common_root = _find_common_root(names)
+    prefix = common_root + "/" if common_root else ""
+    normalized = [
+        (n, n[len(prefix) :] if prefix and n.startswith(prefix) else n)
+        for n in html_files
+    ]
+    for original, norm in normalized:
+        if norm == "index.html":
+            return original
+    for original, norm in normalized:
+        if norm.split("/")[-1] == "index.html":
+            return original
+    normalized.sort(key=lambda t: (t[1].count("/"), len(t[1])))
+    return normalized[0][0]
+
+
 class HTML5ConversionHandler(ArchiveProcessingBaseHandler):
     EXTENSIONS = {file_formats.HTML5}
     FILE_TYPE = "HTML5"
 
+    def handle_file(self, path, audio_settings=None, video_settings=None):
+        prepared_path, entry = self._prepare_archive(path)
+        try:
+            super().handle_file(
+                prepared_path,
+                audio_settings=audio_settings,
+                video_settings=video_settings,
+            )
+        finally:
+            if prepared_path != path and os.path.exists(prepared_path):
+                os.unlink(prepared_path)
+        # Mirror Studio: when the entry point is not index.html at the root,
+        # record it in extra_fields.options.entry so Kolibri loads it.
+        if entry and entry != "index.html":
+            return FileMetadata(
+                content_node_metadata=ContentNodeMetadata(
+                    extra_fields={"options": {"entry": entry}}
+                )
+            )
+        return None
+
     def validate_archive(self, path: str):
         with self.open_and_verify_archive(path) as zf:
-            self._validate_index_html_body(zf, path)
+            names = [n for n in zf.namelist() if not n.endswith("/")]
+            entry = _find_entry_html(names)
+            if entry is None:
+                raise InvalidFileException(
+                    f"File {path} is not a valid {self.FILE_TYPE} file, "
+                    "no HTML file was found in the archive."
+                )
+            self._validate_index_html_body(zf, path, entry)
+
+    def _prepare_archive(self, path):
+        """Denest a zip whose files all share a common parent directory
+        (mirroring Studio's ``cleanHTML5Zip``), and return the path to use
+        along with the detected HTML entry point.
+
+        Returns ``(path, entry)`` unchanged when there is nothing to strip;
+        otherwise returns the path to a denested temporary zip.
+        """
+        try:
+            with zipfile.ZipFile(path) as zf:
+                names = [n for n in zf.namelist() if not n.endswith("/")]
+        except zipfile.BadZipFile:
+            return path, None  # let validate_archive raise the standard error
+
+        common_root = _find_common_root(names)
+        if not common_root:
+            return path, _find_entry_html(names)
+
+        prefix = common_root + "/"
+        with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp:
+            tmp_path = tmp.name
+        with (
+            zipfile.ZipFile(path) as zin,
+            zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED) as zout,
+        ):
+            for name in names:
+                zout.writestr(name[len(prefix) :], zin.read(name))
+        denested_names = [n[len(prefix) :] for n in names]
+        return tmp_path, _find_entry_html(denested_names)
 
 
 class H5PConversionHandler(ArchiveProcessingBaseHandler):

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -731,15 +731,21 @@ def html_invalid_files(html_data, document_file):
 
 @pytest.fixture
 def html_invalid_file():
+    """
+    Invalid because the only HTML file in the zip has an empty body.
+    """
     local_path = os.path.abspath(
         os.path.join(
-            os.path.dirname(__file__), "testcontent", "generated", "testinvalidhtml.zip"
+            os.path.dirname(__file__),
+            "testcontent",
+            "generated",
+            "testinvalidemptyhtml.zip",
         )
     )
 
     if not os.path.exists(local_path):
         with zipfile.ZipFile(local_path, "w", zipfile.ZIP_DEFLATED) as archive:
-            archive.writestr("notindex.html", "<div></div>")
+            archive.writestr("index.html", "<html><head></head><body></body></html>")
     return HTMLZipFile(local_path)
 
 

diff --git a/tests/pipeline/test_convert.py b/tests/pipeline/test_convert.py
@@ -9,6 +9,8 @@
 
 from ricecooker.classes.files import H5PFile
 from ricecooker.classes.files import HTMLZipFile
+from ricecooker.utils.pipeline.convert import _find_common_root
+from ricecooker.utils.pipeline.convert import _find_entry_html
 from ricecooker.utils.pipeline.convert import HTML5ConversionHandler
 from ricecooker.utils.pipeline.convert import KPUBConversionHandler
 from ricecooker.utils.pipeline.exceptions import InvalidFileException
@@ -170,6 +172,86 @@ def test_body_with_text_only_accepted(self):
         self._validate({"index.html": "<html><body>Hello world</body></html>"})
 
 
+class TestHTML5EntryPoint:
+    """Tests for HTML entry point detection and zip denesting,
+    mirroring Studio's findFirstHtml/cleanHTML5Zip behavior."""
+
+    VALID_HTML = "<html><body><p>Hello</p></body></html>"
+
+    def _execute(self, files):
+        """Create an HTML5 archive with given files and run the handler."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = os.path.join(tmpdir, "test.zip")
+            _create_archive(path, files)
+            return HTML5ConversionHandler().execute(path, skip_cache=True)
+
+    def test_find_common_root(self):
+        assert _find_common_root([]) == ""
+        assert _find_common_root(["index.html"]) == ""
+        assert _find_common_root(["dist/index.html"]) == "dist"
+        assert _find_common_root(["a/b/x.html", "a/b/y.css"]) == "a/b"
+        assert _find_common_root(["a/b/x.html", "a/c/y.css"]) == "a"
+        assert _find_common_root(["a/x.html", "y.css"]) == ""
+
+    def test_find_entry_html(self):
+        # index.html at the root is preferred
+        assert _find_entry_html(["other.html", "index.html"]) == "index.html"
+        # then index.html relative to the common root
+        assert (
+            _find_entry_html(["dist/other.html", "dist/index.html"])
+            == "dist/index.html"
+        )
+        # then any index.html
+        assert _find_entry_html(["main.html", "sub/index.html"]) == "sub/index.html"
+        # then the shallowest html file
+        assert _find_entry_html(["b/page.html", "a.html"]) == "a.html"
+        # no html files at all
+        assert _find_entry_html(["style.css", "script.js"]) is None
+
+    def test_no_html_file_rejected(self):
+        with pytest.raises(InvalidFileException, match="(?i)no HTML file"):
+            self._execute({"script.js": "console.log('hello');"})
+
+    def test_non_index_entry_accepted_and_recorded(self):
+        results = self._execute(
+            {"app.html": self.VALID_HTML, "script.js": "console.log('hello');"}
+        )
+        assert results[0].content_node_metadata.extra_fields == {
+            "options": {"entry": "app.html"}
+        }
+
+    def test_non_index_entry_body_validated(self):
+        with pytest.raises(InvalidFileException, match="(?i)empty"):
+            self._execute({"app.html": "<html><body></body></html>"})
+
+    def test_root_index_entry_not_recorded(self):
+        results = self._execute({"index.html": self.VALID_HTML})
+        assert results[0].content_node_metadata is None
+
+    def test_nested_archive_denested(self):
+        results = self._execute(
+            {
+                "dist/index.html": self.VALID_HTML,
+                "dist/css/style.css": "body { color: red; }",
+            }
+        )
+        # The common root is stripped, so index.html ends up at the root
+        # and no entry point needs to be recorded.
+        assert results[0].content_node_metadata is None
+        with zipfile.ZipFile(results[0].path) as zf:
+            names = set(zf.namelist())
+        assert "index.html" in names
+        assert "css/style.css" in names
+
+    def test_nested_non_index_entry_denested_and_recorded(self):
+        results = self._execute({"dist/app.html": self.VALID_HTML})
+        assert results[0].content_node_metadata.extra_fields == {
+            "options": {"entry": "app.html"}
+        }
+        with zipfile.ZipFile(results[0].path) as zf:
+            assert "app.html" in zf.namelist()
+
+
 class TestKPUBValidation:
     """Tests for KPUBConversionHandler validation."""
 

diff --git a/tests/test_files.py b/tests/test_files.py
@@ -593,7 +593,20 @@ def valid_zip():
 
 @pytest.fixture
 def invalid_zip():
-    # Create a temporary zip file without index.html
+    # Create a temporary zip file without any HTML file
+    fd, path = tempfile.mkstemp(suffix=".zip")
+    os.close(fd)
+
+    with zipfile.ZipFile(path, "w") as zf:
+        zf.writestr("script.js", "console.log('test');")
+
+    yield path
+    os.unlink(path)
+
+
+@pytest.fixture
+def non_index_entry_zip():
+    # Create a temporary zip file whose only HTML file is not index.html
     fd, path = tempfile.mkstemp(suffix=".zip")
     os.close(fd)
 
@@ -631,16 +644,27 @@ def test_invalid_htmlzip_validation(invalid_zip):
 
     assert html_file.filename is None
     assert html_file.error is not None
-    assert "index.html" in html_file.error
+    assert "no HTML file" in html_file.error
+
+
+def test_non_index_entry_htmlzip_validation(non_index_entry_zip):
+    # A zip whose only HTML file is not index.html is valid; the entry
+    # point is detected and recorded in extra_fields.options.entry.
+    html_file = HTMLZipFile(non_index_entry_zip)
+    html_file.process_file()
+
+    assert html_file.filename is not None
+    assert html_file.error is None
 
 
 def test_nested_index_htmlzip_validation(nested_index_zip):
+    # A zip whose files all live under a common parent directory is
+    # denested so that index.html ends up at the root.
     html_file = HTMLZipFile(nested_index_zip)
     html_file.process_file()
 
-    assert html_file.filename is None
-    assert html_file.error is not None
-    assert "index.html" in html_file.error
+    assert html_file.filename is not None
+    assert html_file.error is None
 
 
 @pytest.mark.skip(