Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
277 changes: 277 additions & 0 deletions test/test_default_box_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
"""
Tests for DefaultBoxGenerator — specifically the CUDA device-mismatch fix.

Covers:
- CPU output shape, device, and value sanity
- clip=True clamps boxes to [0, 1] normalized coords (via _wh_pairs)
- clip=False allows boxes outside [0, 1]
- Batch size > 1
- CUDA device consistency (skipped when CUDA unavailable)
- TorchScript compatibility on CPU
- TorchScript compatibility on CUDA (skipped when CUDA unavailable)

Run with:
pytest test_default_box_generator.py -v
or for CUDA tests specifically:
pytest test_default_box_generator.py -v -k cuda
"""

import pytest
import torch
import torch.nn as nn
from torchvision.models.detection.anchor_utils import DefaultBoxGenerator
from torchvision.models.detection.image_list import ImageList

# ---------------------------------------------------------------------------
# Constants — SSD-300 configuration (well-known: produces exactly 8 732 anchors)
# ---------------------------------------------------------------------------

# Extra aspect ratios per feature map; one inner list for each entry of
# FEATURE_MAP_SIZES (same order).
ASPECT_RATIOS = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
# (height, width) of each dummy feature map fed to the generator.
FEATURE_MAP_SIZES = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
# (height, width) of the dummy input image.
IMAGE_SIZE = (300, 300)
# Expected anchors per image:
#   38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 8732
# NOTE(review): the per-cell counts (4 or 6) assume 2 + 2 * len(ratios) anchors
# per location — confirm against DefaultBoxGenerator.num_anchors_per_location.
EXPECTED_BOXES = 8732 # sum of (h*w * num_anchors_per_cell) across all feature maps


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def make_inputs(device: torch.device, batch_size: int = 1):
    """Build zero-filled dummy inputs for ``DefaultBoxGenerator`` on ``device``.

    Args:
        device: Device on which every tensor is allocated.
        batch_size: Number of images in the batch.

    Returns:
        A ``(image_list, feature_maps)`` tuple: an ``ImageList`` wrapping a
        ``(batch_size, 3, *IMAGE_SIZE)`` tensor, and one
        ``(batch_size, 1, h, w)`` tensor per entry of ``FEATURE_MAP_SIZES``.
    """
    feature_maps = []
    for fm_height, fm_width in FEATURE_MAP_SIZES:
        feature_maps.append(
            torch.zeros(batch_size, 1, fm_height, fm_width, device=device)
        )

    img_height, img_width = IMAGE_SIZE
    images = torch.zeros(batch_size, 3, img_height, img_width, device=device)
    per_image_sizes = [IMAGE_SIZE for _ in range(batch_size)]
    return ImageList(images, per_image_sizes), feature_maps


def make_generator(clip: bool = True) -> DefaultBoxGenerator:
    """Create a ``DefaultBoxGenerator`` configured with ``ASPECT_RATIOS``.

    Args:
        clip: Forwarded unchanged to the generator's ``clip`` argument.
    """
    generator_kwargs = {"aspect_ratios": ASPECT_RATIOS, "clip": clip}
    generator = DefaultBoxGenerator(**generator_kwargs)
    return generator


# ---------------------------------------------------------------------------
# CPU tests (always run)
# ---------------------------------------------------------------------------

class TestDefaultBoxGeneratorCPU:
    """CPU-only tests — no CUDA required."""

    def test_output_length_matches_batch(self):
        """One anchor tensor is returned per image in the batch."""
        gen = make_generator()
        image_list, feature_maps = make_inputs(torch.device("cpu"), batch_size=2)
        out = gen(image_list, feature_maps)
        assert len(out) == 2

    def test_output_shape_single_image(self):
        """Each anchor tensor has shape (EXPECTED_BOXES, 4)."""
        gen = make_generator()
        image_list, feature_maps = make_inputs(torch.device("cpu"))
        out = gen(image_list, feature_maps)
        assert out[0].shape == (EXPECTED_BOXES, 4), (
            f"Expected ({EXPECTED_BOXES}, 4), got {out[0].shape}"
        )

    def test_output_shape_batch(self):
        """The shape holds for every image in a batch."""
        gen = make_generator()
        image_list, feature_maps = make_inputs(torch.device("cpu"), batch_size=3)
        out = gen(image_list, feature_maps)
        for i, anchors in enumerate(out):
            assert anchors.shape == (EXPECTED_BOXES, 4), (
                f"Image {i}: expected ({EXPECTED_BOXES}, 4), got {anchors.shape}"
            )

    def test_output_device_is_cpu(self):
        """Anchors must be on CPU when the inputs are on CPU."""
        gen = make_generator()
        image_list, feature_maps = make_inputs(torch.device("cpu"))
        out = gen(image_list, feature_maps)
        assert out[0].device.type == "cpu"

    def test_no_nans(self):
        """Anchor coordinates must be finite (neither NaN nor +/-inf)."""
        gen = make_generator()
        image_list, feature_maps = make_inputs(torch.device("cpu"))
        out = gen(image_list, feature_maps)
        # isfinite rejects NaN *and* infinities; a bare isnan check would let
        # +/-inf through even though the contract is "finite".
        assert torch.isfinite(out[0]).all(), "Non-finite values found in anchors"

    def test_clip_true_clamps_wh_pairs(self):
        """
        With clip=True the internal _wh_pairs are clamped to [0, 1], so no
        anchor may be wider or taller than the image.

        Only the box extents are checked: the output is in (x1, y1, x2, y2)
        pixel coordinates, and the corners of border anchors can legitimately
        fall outside the image.
        """
        gen = make_generator(clip=True)
        image_list, feature_maps = make_inputs(torch.device("cpu"))
        out = gen(image_list, feature_maps)
        boxes = out[0]  # (N, 4): x1, y1, x2, y2 in pixels
        widths = boxes[:, 2] - boxes[:, 0]
        heights = boxes[:, 3] - boxes[:, 1]
        # IMAGE_SIZE is (height, width); the small tolerance absorbs float32
        # rounding from the normalized -> pixel conversion.
        assert (widths <= IMAGE_SIZE[1] + 1e-4).all(), "Width exceeds image width"
        assert (heights <= IMAGE_SIZE[0] + 1e-4).all(), "Height exceeds image height"

    def test_clip_false_allows_large_anchors(self):
        """With clip=False, some anchors can be wider than the image."""
        gen_clipped = make_generator(clip=True)
        gen_free = make_generator(clip=False)
        image_list, feature_maps = make_inputs(torch.device("cpu"))
        out_clipped = gen_clipped(image_list, feature_maps)
        out_free = gen_free(image_list, feature_maps)
        # Clipping changes values only, never the number of anchors.
        assert out_clipped[0].shape == (EXPECTED_BOXES, 4)
        assert out_free[0].shape == (EXPECTED_BOXES, 4)
        # Without clipping, at least one anchor is wider than the image.
        boxes_free = out_free[0]
        widths_free = boxes_free[:, 2] - boxes_free[:, 0]
        assert (widths_free > IMAGE_SIZE[1]).any(), (
            "Expected some anchors wider than image when clip=False"
        )

    def test_output_dtype_float32(self):
        """Anchors are float32 for the float32 inputs built by make_inputs."""
        gen = make_generator()
        image_list, feature_maps = make_inputs(torch.device("cpu"))
        out = gen(image_list, feature_maps)
        assert out[0].dtype == torch.float32

    def test_torchscript_cpu(self):
        """DefaultBoxGenerator must be scriptable (torch.jit.script) on CPU
        and the scripted module must reproduce the eager output."""
        gen = make_generator()
        gen.eval()
        scripted = torch.jit.script(gen)

        image_list, feature_maps = make_inputs(torch.device("cpu"))
        out_eager = gen(image_list, feature_maps)
        out_scripted = scripted(image_list, feature_maps)

        assert len(out_scripted) == len(out_eager)
        torch.testing.assert_close(out_scripted[0], out_eager[0])


# ---------------------------------------------------------------------------
# CUDA tests (skipped when CUDA unavailable)
# ---------------------------------------------------------------------------

@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
class TestDefaultBoxGeneratorCUDA:
    """
    CUDA tests — verify fix for issue #9414:
    DefaultBoxGenerator._grid_default_boxes previously built shifts on CPU
    and left self._wh_pairs on CPU, causing torch.cat to raise a device
    mismatch error when the model was moved to GPU.
    """

    def test_output_device_is_cuda(self):
        """
        Core regression test for #9414.
        Anchors must be on the same CUDA device as the input feature maps.
        Before the fix this raised: RuntimeError: Expected all tensors to be
        on the same device, but found at least two devices, cpu and cuda:0!
        """
        gen = make_generator()
        device = torch.device("cuda:0")
        image_list, feature_maps = make_inputs(device)
        out = gen(image_list, feature_maps)  # must not raise
        assert out[0].device.type == "cuda", (
            f"Anchors on wrong device: {out[0].device} (expected cuda)"
        )

    def test_cuda_output_shape(self):
        """Shape is correct on CUDA."""
        gen = make_generator()
        device = torch.device("cuda:0")
        image_list, feature_maps = make_inputs(device)
        out = gen(image_list, feature_maps)
        assert out[0].shape == (EXPECTED_BOXES, 4)

    def test_cuda_no_nans(self):
        """Anchor coordinates are finite (no NaN, no +/-inf) on CUDA."""
        gen = make_generator()
        device = torch.device("cuda:0")
        image_list, feature_maps = make_inputs(device)
        out = gen(image_list, feature_maps)
        # isfinite also rejects infinities, which a bare isnan check misses.
        assert torch.isfinite(out[0]).all(), "Non-finite values found in anchors"

    def test_cuda_cpu_values_match(self):
        """
        Anchor coordinates produced on CUDA must match those on CPU
        (up to floating-point tolerance), confirming no device-dependent
        numeric divergence was introduced by the fix.
        """
        gen = make_generator()
        device = torch.device("cuda:0")

        image_list_cpu, fmaps_cpu = make_inputs(torch.device("cpu"))
        image_list_gpu, fmaps_gpu = make_inputs(device)

        out_cpu = gen(image_list_cpu, fmaps_cpu)
        out_gpu = gen(image_list_gpu, fmaps_gpu)

        torch.testing.assert_close(
            out_gpu[0].cpu(), out_cpu[0],
            atol=1e-5, rtol=1e-5,
            msg="CPU and CUDA anchors differ beyond tolerance",
        )

    def test_cuda_batch(self):
        """Batch of 2 works correctly on CUDA."""
        gen = make_generator()
        device = torch.device("cuda:0")
        image_list, feature_maps = make_inputs(device, batch_size=2)
        out = gen(image_list, feature_maps)
        assert len(out) == 2
        for anchors in out:
            assert anchors.shape == (EXPECTED_BOXES, 4)
            assert anchors.device.type == "cuda"

    def test_cuda_clip_false(self):
        """clip=False works on CUDA without raising."""
        gen = make_generator(clip=False)
        device = torch.device("cuda:0")
        image_list, feature_maps = make_inputs(device)
        out = gen(image_list, feature_maps)  # must not raise
        assert out[0].shape == (EXPECTED_BOXES, 4)
        assert out[0].device.type == "cuda"

    def test_torchscript_cuda(self):
        """Scripting (torch.jit.script) works on CUDA and matches eager mode."""
        gen = make_generator()
        gen.eval()
        scripted = torch.jit.script(gen)

        device = torch.device("cuda:0")
        image_list, feature_maps = make_inputs(device)

        out_eager = gen(image_list, feature_maps)
        out_scripted = scripted(image_list, feature_maps)

        assert len(out_scripted) == len(out_eager)
        torch.testing.assert_close(out_scripted[0], out_eager[0])

    def test_wh_pairs_moved_to_cuda(self):
        """
        Internal _wh_pairs (created on CPU in __init__) must be transparently
        moved to CUDA during forward — confirmed by the anchors being on CUDA.
        This is the exact mechanism broken by #9414.

        _wh_pairs is indexed per feature map (``self._wh_pairs[k]``), i.e. it
        is a collection of tensors rather than one tensor, so the device is
        checked element by element instead of via ``_wh_pairs.device``.
        """
        gen = make_generator()
        # Every per-level tensor starts on CPU (by design in __init__).
        assert all(wh.device.type == "cpu" for wh in gen._wh_pairs), (
            "_wh_pairs should be on CPU after __init__"
        )
        device = torch.device("cuda:0")
        image_list, feature_maps = make_inputs(device)
        out = gen(image_list, feature_maps)
        # After forward on CUDA, output must be on CUDA.
        assert out[0].device.type == "cuda"
        # forward() calls .to() on a per-call copy, so the stored tensors
        # must still live on CPU afterwards.
        assert all(wh.device.type == "cpu" for wh in gen._wh_pairs), (
            "_wh_pairs should remain on CPU after forward (we .to() inside the call, not mutate)"
        )
18 changes: 12 additions & 6 deletions torchvision/models/detection/anchor_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,11 @@ def num_anchors_per_location(self) -> list[int]:

# Default Boxes calculation based on page 6 of SSD paper
def _grid_default_boxes(
self, grid_sizes: list[list[int]], image_size: list[int], dtype: torch.dtype = torch.float32
self,
grid_sizes: list[list[int]],
image_size: list[int],
dtype: torch.dtype = torch.float32,
device: torch.device = torch.device("cpu"),
) -> Tensor:
default_boxes = []
for k, f_k in enumerate(grid_sizes):
Expand All @@ -218,15 +222,16 @@ def _grid_default_boxes(
else:
y_f_k, x_f_k = f_k

shifts_x = ((torch.arange(0, f_k[1]) + 0.5) / x_f_k).to(dtype=dtype)
shifts_y = ((torch.arange(0, f_k[0]) + 0.5) / y_f_k).to(dtype=dtype)
shifts_x = ((torch.arange(0, f_k[1], device=device) + 0.5) / x_f_k).to(dtype=dtype)
shifts_y = ((torch.arange(0, f_k[0], device=device) + 0.5) / y_f_k).to(dtype=dtype)
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing="ij")
shift_x = shift_x.reshape(-1)
shift_y = shift_y.reshape(-1)

shifts = torch.stack((shift_x, shift_y) * len(self._wh_pairs[k]), dim=-1).reshape(-1, 2)
# Clipping the default boxes while the boxes are encoded in format (cx, cy, w, h)
_wh_pair = self._wh_pairs[k].clamp(min=0, max=1) if self.clip else self._wh_pairs[k]
_wh_pair = self._wh_pairs[k].to(dtype=dtype, device=device)
_wh_pair = _wh_pair.clamp(min=0, max=1) if self.clip else _wh_pair
wh_pairs = _wh_pair.repeat((f_k[0] * f_k[1]), 1)

default_box = torch.cat((shifts, wh_pairs), dim=1)
Expand All @@ -250,8 +255,8 @@ def forward(self, image_list: ImageList, feature_maps: list[Tensor]) -> list[Ten
grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps]
image_size = image_list.tensors.shape[-2:]
dtype, device = feature_maps[0].dtype, feature_maps[0].device
default_boxes = self._grid_default_boxes(grid_sizes, image_size, dtype=dtype)
default_boxes = default_boxes.to(device)
default_boxes = self._grid_default_boxes(grid_sizes, image_size, dtype=dtype, device=device)
# device already set inside _grid_default_boxes

dboxes = []
x_y_size = torch.tensor([image_size[1], image_size[0]], device=default_boxes.device)
Expand All @@ -266,3 +271,4 @@ def forward(self, image_list: ImageList, feature_maps: list[Tensor]) -> list[Ten
)
dboxes.append(dboxes_in_image)
return dboxes