diff --git a/test/test_default_box_generator.py b/test/test_default_box_generator.py new file mode 100644 index 00000000000..3c0dcd76170 --- /dev/null +++ b/test/test_default_box_generator.py @@ -0,0 +1,277 @@ +""" +Tests for DefaultBoxGenerator — specifically the CUDA device-mismatch fix. + +Covers: + - CPU output shape, device, and value sanity + - clip=True clamps boxes to [0, 1] normalized coords (via _wh_pairs) + - clip=False allows boxes outside [0, 1] + - Batch size > 1 + - CUDA device consistency (skipped when CUDA unavailable) + - TorchScript compatibility on CPU + - TorchScript compatibility on CUDA (skipped when CUDA unavailable) + +Run with: + pytest test_default_box_generator.py -v +or for CUDA tests specifically: + pytest test_default_box_generator.py -v -k cuda +""" + +import pytest +import torch +import torch.nn as nn +from torchvision.models.detection.anchor_utils import DefaultBoxGenerator +from torchvision.models.detection.image_list import ImageList + +# --------------------------------------------------------------------------- +# Constants — SSD-300 configuration (well-known: produces exactly 8 732 anchors) +# --------------------------------------------------------------------------- + +ASPECT_RATIOS = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] +FEATURE_MAP_SIZES = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)] +IMAGE_SIZE = (300, 300) +EXPECTED_BOXES = 8732 # sum of (h*w * num_anchors_per_cell) across all feature maps + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def make_inputs(device: torch.device, batch_size: int = 1): + """Return (ImageList, feature_maps) on the requested device.""" + image_tensors = torch.zeros(batch_size, 3, *IMAGE_SIZE, device=device) + image_sizes = [IMAGE_SIZE] * batch_size + image_list = ImageList(image_tensors, image_sizes) + feature_maps = [ + torch.zeros(batch_size, 1, h, w, device=device) + for h, w in FEATURE_MAP_SIZES + ] + return image_list, feature_maps + + +def make_generator(clip: bool = True) -> DefaultBoxGenerator: + return DefaultBoxGenerator(aspect_ratios=ASPECT_RATIOS, clip=clip) + + +# --------------------------------------------------------------------------- +# CPU tests (always run) +# --------------------------------------------------------------------------- + +class TestDefaultBoxGeneratorCPU: + """CPU-only tests — no CUDA required.""" + + def test_output_length_matches_batch(self): + """One anchor list per image in the batch.""" + gen = make_generator() + image_list, feature_maps = make_inputs(torch.device("cpu"), batch_size=2) + out = gen(image_list, feature_maps) + assert len(out) == 2 + + def test_output_shape_single_image(self): + """Each anchor list has shape (EXPECTED_BOXES, 4).""" + gen = make_generator() + image_list, feature_maps = make_inputs(torch.device("cpu")) + out = gen(image_list, feature_maps) + assert out[0].shape == (EXPECTED_BOXES, 4), ( + f"Expected ({EXPECTED_BOXES}, 4), got {out[0].shape}" + ) + + def test_output_shape_batch(self): + """Shape holds for every image in a batch.""" + gen = make_generator() + image_list, feature_maps = make_inputs(torch.device("cpu"), batch_size=3) + out = gen(image_list, feature_maps) + for i, anchors in enumerate(out): + assert anchors.shape == (EXPECTED_BOXES, 4), ( + f"Image {i}: expected ({EXPECTED_BOXES}, 4), got {anchors.shape}" + ) + + def test_output_device_is_cpu(self): + """Anchors must be on CPU when inputs are on CPU.""" + gen = make_generator() + image_list, feature_maps = make_inputs(torch.device("cpu")) + out = gen(image_list, feature_maps) + assert out[0].device.type == "cpu" + + def test_no_nans(self): + """Anchor coordinates must be finite (no NaN).""" + gen = make_generator() + image_list, feature_maps = make_inputs(torch.device("cpu")) + out = gen(image_list, feature_maps) + assert not torch.any(torch.isnan(out[0])), "NaN values found in anchors" + + def test_clip_true_clamps_wh_pairs(self): + """ + With clip=True the internal _wh_pairs are clamped to [0, 1], so + the (w, h) components of every anchor should be in [0, 1]. + The (cx, cy) components can legitimately be outside that range + for border anchors, so we only check wh. + """ + gen = make_generator(clip=True) + image_list, feature_maps = make_inputs(torch.device("cpu")) + out = gen(image_list, feature_maps) + # out is in (x1, y1, x2, y2) pixel space — convert width/height + boxes = out[0] # (N, 4): x1, y1, x2, y2 + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + # pixel widths/heights must be ≤ image dimension (clip keeps wh ≤ 1 in normalized) + assert (widths <= IMAGE_SIZE[1] + 1e-4).all(), "Width exceeds image width" + assert (heights <= IMAGE_SIZE[0] + 1e-4).all(), "Height exceeds image height" + + def test_clip_false_allows_large_anchors(self): + """With clip=False, some anchors can be larger than the image.""" + gen_clipped = make_generator(clip=True) + gen_free = make_generator(clip=False) + image_list, feature_maps = make_inputs(torch.device("cpu")) + out_clipped = gen_clipped(image_list, feature_maps) + out_free = gen_free(image_list, feature_maps) + # Both should still have the right shape + assert out_clipped[0].shape == (EXPECTED_BOXES, 4) + assert out_free[0].shape == (EXPECTED_BOXES, 4) + # clip=False boxes may be larger — widths can exceed image size + boxes_free = out_free[0] + widths_free = boxes_free[:, 2] - boxes_free[:, 0] + assert (widths_free > IMAGE_SIZE[1]).any(), ( + "Expected some anchors wider than image when clip=False" + ) + + def test_output_dtype_float32(self): + """Default dtype should be float32.""" + gen = make_generator() + image_list, feature_maps = make_inputs(torch.device("cpu")) + out = gen(image_list, feature_maps) + assert out[0].dtype == torch.float32 + + def test_torchscript_cpu(self): + """DefaultBoxGenerator must be TorchScript-traceable on CPU.""" + gen = make_generator() + gen.eval() + scripted = torch.jit.script(gen) + + image_list, feature_maps = make_inputs(torch.device("cpu")) + out_eager = gen(image_list, feature_maps) + out_scripted = scripted(image_list, feature_maps) + + assert len(out_scripted) == len(out_eager) + torch.testing.assert_close(out_scripted[0], out_eager[0]) + + +# --------------------------------------------------------------------------- +# CUDA tests (skipped when CUDA unavailable) +# --------------------------------------------------------------------------- + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +class TestDefaultBoxGeneratorCUDA: + """ + CUDA tests — verify fix for issue #9414: + DefaultBoxGenerator._grid_default_boxes previously built shifts on CPU + and left self._wh_pairs on CPU, causing torch.cat to raise a device + mismatch error when the model was moved to GPU. + """ + + def test_output_device_is_cuda(self): + """ + Core regression test for #9414. + Anchors must be on the same CUDA device as the input feature maps. + Before the fix this raised: RuntimeError: Expected all tensors to be + on the same device, but found at least two devices, cpu and cuda:0! + """ + gen = make_generator() + device = torch.device("cuda:0") + image_list, feature_maps = make_inputs(device) + out = gen(image_list, feature_maps) # must not raise + assert out[0].device.type == "cuda", ( + f"Anchors on wrong device: {out[0].device} (expected cuda)" + ) + + def test_cuda_output_shape(self): + """Shape is correct on CUDA.""" + gen = make_generator() + device = torch.device("cuda:0") + image_list, feature_maps = make_inputs(device) + out = gen(image_list, feature_maps) + assert out[0].shape == (EXPECTED_BOXES, 4) + + def test_cuda_no_nans(self): + """No NaN values on CUDA.""" + gen = make_generator() + device = torch.device("cuda:0") + image_list, feature_maps = make_inputs(device) + out = gen(image_list, feature_maps) + assert not torch.any(torch.isnan(out[0])) + + def test_cuda_cpu_values_match(self): + """ + Anchor coordinates produced on CUDA must match those on CPU + (up to floating-point tolerance), confirming no device-dependent + numeric divergence was introduced by the fix. + """ + gen = make_generator() + device = torch.device("cuda:0") + + image_list_cpu, fmaps_cpu = make_inputs(torch.device("cpu")) + image_list_gpu, fmaps_gpu = make_inputs(device) + + out_cpu = gen(image_list_cpu, fmaps_cpu) + out_gpu = gen(image_list_gpu, fmaps_gpu) + + torch.testing.assert_close( + out_gpu[0].cpu(), out_cpu[0], + atol=1e-5, rtol=1e-5, + msg="CPU and CUDA anchors differ beyond tolerance", + ) + + def test_cuda_batch(self): + """Batch of 2 works correctly on CUDA.""" + gen = make_generator() + device = torch.device("cuda:0") + image_list, feature_maps = make_inputs(device, batch_size=2) + out = gen(image_list, feature_maps) + assert len(out) == 2 + for anchors in out: + assert anchors.shape == (EXPECTED_BOXES, 4) + assert anchors.device.type == "cuda" + + def test_cuda_clip_false(self): + """clip=False works on CUDA without raising.""" + gen = make_generator(clip=False) + device = torch.device("cuda:0") + image_list, feature_maps = make_inputs(device) + out = gen(image_list, feature_maps) # must not raise + assert out[0].shape == (EXPECTED_BOXES, 4) + assert out[0].device.type == "cuda" + + def test_torchscript_cuda(self): + """TorchScript tracing works on CUDA and results match eager mode.""" + gen = make_generator() + gen.eval() + scripted = torch.jit.script(gen) + + device = torch.device("cuda:0") + image_list, feature_maps = make_inputs(device) + + out_eager = gen(image_list, feature_maps) + out_scripted = scripted(image_list, feature_maps) + + assert len(out_scripted) == len(out_eager) + torch.testing.assert_close(out_scripted[0], out_eager[0]) + + def test_wh_pairs_moved_to_cuda(self): + """ + Internal _wh_pairs (created on CPU in __init__) must be transparently + moved to CUDA during forward — confirmed by the anchors being on CUDA. + This is the exact mechanism broken by #9414. + """ + gen = make_generator() + # _wh_pairs starts on CPU (by design in __init__) + assert gen._wh_pairs.device.type == "cpu", ( + "_wh_pairs should be on CPU after __init__" + ) + device = torch.device("cuda:0") + image_list, feature_maps = make_inputs(device) + out = gen(image_list, feature_maps) + # After forward on CUDA, output must be on CUDA + assert out[0].device.type == "cuda" + # _wh_pairs itself should still be on CPU (we .to() inside forward, not move the buffer) + assert gen._wh_pairs.device.type == "cpu", ( + "_wh_pairs should remain on CPU after forward (we .to() inside the call, not mutate)" + ) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index 4722e1550c2..6680dca7dcb 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -207,7 +207,11 @@ def num_anchors_per_location(self) -> list[int]: # Default Boxes calculation based on page 6 of SSD paper def _grid_default_boxes( - self, grid_sizes: list[list[int]], image_size: list[int], dtype: torch.dtype = torch.float32 + self, + grid_sizes: list[list[int]], + image_size: list[int], + dtype: torch.dtype = torch.float32, + device: torch.device = torch.device("cpu"), ) -> Tensor: default_boxes = [] for k, f_k in enumerate(grid_sizes): @@ -218,15 +222,16 @@ def _grid_default_boxes( else: y_f_k, x_f_k = f_k - shifts_x = ((torch.arange(0, f_k[1]) + 0.5) / x_f_k).to(dtype=dtype) - shifts_y = ((torch.arange(0, f_k[0]) + 0.5) / y_f_k).to(dtype=dtype) + shifts_x = ((torch.arange(0, f_k[1], device=device) + 0.5) / x_f_k).to(dtype=dtype) + shifts_y = ((torch.arange(0, f_k[0], device=device) + 0.5) / y_f_k).to(dtype=dtype) shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing="ij") shift_x = shift_x.reshape(-1) shift_y = shift_y.reshape(-1) shifts = torch.stack((shift_x, shift_y) * len(self._wh_pairs[k]), dim=-1).reshape(-1, 2) # Clipping the default boxes while the boxes are encoded in format (cx, cy, w, h) - _wh_pair = self._wh_pairs[k].clamp(min=0, max=1) if self.clip else self._wh_pairs[k] + _wh_pair = self._wh_pairs[k].to(dtype=dtype, device=device) + _wh_pair = _wh_pair.clamp(min=0, max=1) if self.clip else _wh_pair wh_pairs = _wh_pair.repeat((f_k[0] * f_k[1]), 1) default_box = torch.cat((shifts, wh_pairs), dim=1) @@ -250,8 +255,8 @@ def forward(self, image_list: ImageList, feature_maps: list[Tensor]) -> list[Ten grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps] image_size = image_list.tensors.shape[-2:] dtype, device = feature_maps[0].dtype, feature_maps[0].device - default_boxes = self._grid_default_boxes(grid_sizes, image_size, dtype=dtype) - default_boxes = default_boxes.to(device) + default_boxes = self._grid_default_boxes(grid_sizes, image_size, dtype=dtype, device=device) + # device already set inside _grid_default_boxes dboxes = [] x_y_size = torch.tensor([image_size[1], image_size[0]], device=default_boxes.device) @@ -266,3 +271,4 @@ def forward(self, image_list: ImageList, feature_maps: list[Tensor]) -> list[Ten ) dboxes.append(dboxes_in_image) return dboxes +