Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 113 additions & 7 deletions ricecooker/utils/pipeline/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from ricecooker.utils.audio import AudioCompressionError
from ricecooker.utils.audio import compress_audio
from ricecooker.utils.caching import generate_key
from ricecooker.utils.pipeline.context import ContentNodeMetadata
from ricecooker.utils.pipeline.context import ContextMetadata
from ricecooker.utils.pipeline.context import FileMetadata
from ricecooker.utils.pipeline.exceptions import InvalidFileException
Expand Down Expand Up @@ -235,15 +236,15 @@ def read_file_from_archive(self, zf, filepath):
f"File {zf.filename} is not a valid {self.FILE_TYPE} file, {filepath} is missing."
)

def _validate_index_html_body(self, zf, path):
"""Validate that index.html exists and has a non-empty body."""
index_html = self.read_file_from_archive(zf, "index.html")
def _validate_index_html_body(self, zf, path, index_path="index.html"):
"""Validate that the entry HTML exists and has a non-empty body."""
index_html = self.read_file_from_archive(zf, index_path)
try:
dom = html5lib.parse(index_html, namespaceHTMLElements=False)
body = dom.find("body")
if body is None:
raise InvalidFileException(
f"File {path} is not a valid {self.FILE_TYPE} file, index.html is missing a body element."
f"File {path} is not a valid {self.FILE_TYPE} file, {index_path} is missing a body element."
)
# Check that the body has at least one child element
# for some reason it seems like comments don't get a string tag attribute
Expand All @@ -252,12 +253,12 @@ def _validate_index_html_body(self, zf, path):
]
if not (body.text and body.text.strip()) and not body_children:
raise InvalidFileException(
f"File {path} is not a valid {self.FILE_TYPE} file, index.html is empty."
f"File {path} is not a valid {self.FILE_TYPE} file, {index_path} is empty."
)
return dom
except ParseError:
raise InvalidFileException(
f"File {path} is not a valid {self.FILE_TYPE} file, index.html is not well-formed."
f"File {path} is not a valid {self.FILE_TYPE} file, {index_path} is not well-formed."
)

def _read_and_compress_archive_file(
Expand Down Expand Up @@ -308,13 +309,118 @@ def _read_and_compress_archive_file(
return reader(filepath)


def _find_common_root(names):
"""Return the common parent directory shared by all file paths.

Ported from Studio's ``findCommonRoot`` (frontend/shared/utils/zipFile.js).
``names`` are POSIX-style, non-directory archive member paths.
"""
paths = [n.split("/")[:-1] for n in names]
if not paths:
return ""
if len(paths) == 1:
return "/".join(paths[0])
first = paths[0]
common = []
for i, part in enumerate(first):
for other in paths[1:]:
if i >= len(other) or other[i] != part:
return "/".join(common)
common.append(part)
return "/".join(common)


def _find_entry_html(names):
"""Return the archive member that is the HTML entry point, or None.

Ported from Studio's ``findFirstHtml`` (frontend/shared/utils/zipFile.js):
prefer ``index.html`` at the common-root-stripped root, then any
``index.html``, then the shallowest / shortest-named ``.html`` file.
"""
html_files = [n for n in names if n.lower().endswith(".html")]
if not html_files:
return None
common_root = _find_common_root(names)
prefix = common_root + "/" if common_root else ""
normalized = [
(n, n[len(prefix) :] if prefix and n.startswith(prefix) else n)
for n in html_files
]
for original, norm in normalized:
if norm == "index.html":
return original
for original, norm in normalized:
if norm.split("/")[-1] == "index.html":
return original
normalized.sort(key=lambda t: (t[1].count("/"), len(t[1])))
return normalized[0][0]


class HTML5ConversionHandler(ArchiveProcessingBaseHandler):
EXTENSIONS = {file_formats.HTML5}
FILE_TYPE = "HTML5"

def handle_file(self, path, audio_settings=None, video_settings=None):
prepared_path, entry = self._prepare_archive(path)
try:
super().handle_file(
prepared_path,
audio_settings=audio_settings,
video_settings=video_settings,
)
finally:
if prepared_path != path and os.path.exists(prepared_path):
os.unlink(prepared_path)
# Mirror Studio: when the entry point is not index.html at the root,
# record it in extra_fields.options.entry so Kolibri loads it.
if entry and entry != "index.html":
return FileMetadata(
content_node_metadata=ContentNodeMetadata(
extra_fields={"options": {"entry": entry}}
)
)
return None

def validate_archive(self, path: str):
with self.open_and_verify_archive(path) as zf:
self._validate_index_html_body(zf, path)
names = [n for n in zf.namelist() if not n.endswith("/")]
entry = _find_entry_html(names)
if entry is None:
raise InvalidFileException(
f"File {path} is not a valid {self.FILE_TYPE} file, "
"no HTML file was found in the archive."
)
self._validate_index_html_body(zf, path, entry)

def _prepare_archive(self, path):
"""Denest a zip whose files all share a common parent directory
(mirroring Studio's ``cleanHTML5Zip``), and return the path to use
along with the detected HTML entry point.

Returns ``(path, entry)`` unchanged when there is nothing to strip;
otherwise returns the path to a denested temporary zip.
"""
try:
with zipfile.ZipFile(path) as zf:
names = [n for n in zf.namelist() if not n.endswith("/")]
except zipfile.BadZipFile:
return path, None # let validate_archive raise the standard error

common_root = _find_common_root(names)
if not common_root:
return path, _find_entry_html(names)

prefix = common_root + "/"
with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp:
tmp_path = tmp.name
with (
zipfile.ZipFile(path) as zin,
zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED) as zout,
):
for name in names:
zout.writestr(name[len(prefix) :], zin.read(name))
denested_names = [n[len(prefix) :] for n in names]
return tmp_path, _find_entry_html(denested_names)


class H5PConversionHandler(ArchiveProcessingBaseHandler):
Expand Down
10 changes: 8 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,15 +731,21 @@ def html_invalid_files(html_data, document_file):

@pytest.fixture
def html_invalid_file():
"""
Invalid because the only HTML file in the zip has an empty body.
"""
local_path = os.path.abspath(
os.path.join(
os.path.dirname(__file__), "testcontent", "generated", "testinvalidhtml.zip"
os.path.dirname(__file__),
"testcontent",
"generated",
"testinvalidemptyhtml.zip",
)
)

if not os.path.exists(local_path):
with zipfile.ZipFile(local_path, "w", zipfile.ZIP_DEFLATED) as archive:
archive.writestr("notindex.html", "<div></div>")
archive.writestr("index.html", "<html><head></head><body></body></html>")
return HTMLZipFile(local_path)


Expand Down
82 changes: 82 additions & 0 deletions tests/pipeline/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

from ricecooker.classes.files import H5PFile
from ricecooker.classes.files import HTMLZipFile
from ricecooker.utils.pipeline.convert import _find_common_root
from ricecooker.utils.pipeline.convert import _find_entry_html
from ricecooker.utils.pipeline.convert import HTML5ConversionHandler
from ricecooker.utils.pipeline.convert import KPUBConversionHandler
from ricecooker.utils.pipeline.exceptions import InvalidFileException
Expand Down Expand Up @@ -170,6 +172,86 @@ def test_body_with_text_only_accepted(self):
self._validate({"index.html": "<html><body>Hello world</body></html>"})


class TestHTML5EntryPoint:
"""Tests for HTML entry point detection and zip denesting,
mirroring Studio's findFirstHtml/cleanHTML5Zip behavior."""

VALID_HTML = "<html><body><p>Hello</p></body></html>"

def _execute(self, files):
"""Create an HTML5 archive with given files and run the handler."""
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, "test.zip")
_create_archive(path, files)
return HTML5ConversionHandler().execute(path, skip_cache=True)

def test_find_common_root(self):
assert _find_common_root([]) == ""
assert _find_common_root(["index.html"]) == ""
assert _find_common_root(["dist/index.html"]) == "dist"
assert _find_common_root(["a/b/x.html", "a/b/y.css"]) == "a/b"
assert _find_common_root(["a/b/x.html", "a/c/y.css"]) == "a"
assert _find_common_root(["a/x.html", "y.css"]) == ""

def test_find_entry_html(self):
# index.html at the root is preferred
assert _find_entry_html(["other.html", "index.html"]) == "index.html"
# then index.html relative to the common root
assert (
_find_entry_html(["dist/other.html", "dist/index.html"])
== "dist/index.html"
)
# then any index.html
assert _find_entry_html(["main.html", "sub/index.html"]) == "sub/index.html"
# then the shallowest html file
assert _find_entry_html(["b/page.html", "a.html"]) == "a.html"
# no html files at all
assert _find_entry_html(["style.css", "script.js"]) is None

def test_no_html_file_rejected(self):
with pytest.raises(InvalidFileException, match="(?i)no HTML file"):
self._execute({"script.js": "console.log('hello');"})

def test_non_index_entry_accepted_and_recorded(self):
results = self._execute(
{"app.html": self.VALID_HTML, "script.js": "console.log('hello');"}
)
assert results[0].content_node_metadata.extra_fields == {
"options": {"entry": "app.html"}
}

def test_non_index_entry_body_validated(self):
with pytest.raises(InvalidFileException, match="(?i)empty"):
self._execute({"app.html": "<html><body></body></html>"})

def test_root_index_entry_not_recorded(self):
results = self._execute({"index.html": self.VALID_HTML})
assert results[0].content_node_metadata is None

def test_nested_archive_denested(self):
results = self._execute(
{
"dist/index.html": self.VALID_HTML,
"dist/css/style.css": "body { color: red; }",
}
)
# The common root is stripped, so index.html ends up at the root
# and no entry point needs to be recorded.
assert results[0].content_node_metadata is None
with zipfile.ZipFile(results[0].path) as zf:
names = set(zf.namelist())
assert "index.html" in names
assert "css/style.css" in names

def test_nested_non_index_entry_denested_and_recorded(self):
results = self._execute({"dist/app.html": self.VALID_HTML})
assert results[0].content_node_metadata.extra_fields == {
"options": {"entry": "app.html"}
}
with zipfile.ZipFile(results[0].path) as zf:
assert "app.html" in zf.namelist()


class TestKPUBValidation:
"""Tests for KPUBConversionHandler validation."""

Expand Down
34 changes: 29 additions & 5 deletions tests/test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,20 @@ def valid_zip():

@pytest.fixture
def invalid_zip():
# Create a temporary zip file without index.html
# Create a temporary zip file without any HTML file
fd, path = tempfile.mkstemp(suffix=".zip")
os.close(fd)

with zipfile.ZipFile(path, "w") as zf:
zf.writestr("script.js", "console.log('test');")

yield path
os.unlink(path)


@pytest.fixture
def non_index_entry_zip():
# Create a temporary zip file whose only HTML file is not index.html
fd, path = tempfile.mkstemp(suffix=".zip")
os.close(fd)

Expand Down Expand Up @@ -631,16 +644,27 @@ def test_invalid_htmlzip_validation(invalid_zip):

assert html_file.filename is None
assert html_file.error is not None
assert "index.html" in html_file.error
assert "no HTML file" in html_file.error


def test_non_index_entry_htmlzip_validation(non_index_entry_zip):
# A zip whose only HTML file is not index.html is valid; the entry
# point is detected and recorded in extra_fields.options.entry.
html_file = HTMLZipFile(non_index_entry_zip)
html_file.process_file()

assert html_file.filename is not None
assert html_file.error is None


def test_nested_index_htmlzip_validation(nested_index_zip):
# A zip whose files all live under a common parent directory is
# denested so that index.html ends up at the root.
html_file = HTMLZipFile(nested_index_zip)
html_file.process_file()

assert html_file.filename is None
assert html_file.error is not None
assert "index.html" in html_file.error
assert html_file.filename is not None
assert html_file.error is None


@pytest.mark.skip(
Expand Down
Loading