Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 229 additions & 0 deletions tests/image/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
# Copyright (c) 2025-2026 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
# mypy: disable-error-code="misc,no-any-return"
from __future__ import annotations

import tempfile
from typing import Callable

import numpy as np
import pytest

from trtutils.image.preprocessors import (
CPUPreprocessor,
CUDAPreprocessor,
TRTPreprocessor,
)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Shared preprocessor configuration used by every preprocessor fixture below:
# target (width, height), output value range, and output dtype.
PREPROC_SIZE = (640, 640)
PREPROC_RANGE = (0.0, 1.0)
PREPROC_DTYPE = np.dtype(np.float32)
# Standard ImageNet per-channel normalization statistics
# (presumably RGB order -- confirm against the preprocessors' convention).
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Tolerance for CPU/GPU parity
CUDA_MAG_BOUNDS = 0.01


# ---------------------------------------------------------------------------
# Build support detection
# ---------------------------------------------------------------------------
@pytest.fixture(scope="session")
def _trt_build_supported() -> bool:
    """Check if TRT can build engines on this hardware (session-cached).

    Attempts to build a tiny engine from ``data/simple.onnx`` into a
    throwaway temp file. Returns ``False`` when the model file is missing
    or the build fails for any reason, so dependent tests can be skipped
    instead of erroring.
    """
    try:
        from pathlib import Path

        from trtutils.builder._build import build_engine

        onnx_path = Path(__file__).parent.parent.parent / "data" / "simple.onnx"
        if not onnx_path.exists():
            return False
        with tempfile.NamedTemporaryFile(suffix=".engine", delete=True) as f:
            build_engine(onnx_path, f.name, optimization_level=1)
        return True
    except Exception:
        # Any failure (RuntimeError from TRT, import errors, driver issues)
        # means engines cannot be built here -- treat as unsupported, not
        # fatal.  A separate `except RuntimeError` branch with identical
        # handling was redundant and has been merged into this one.
        return False


# ---------------------------------------------------------------------------
# Parametrized fixtures
# ---------------------------------------------------------------------------
@pytest.fixture(params=["cpu", "cuda", "trt"])
def preprocessor_type(request: pytest.FixtureRequest) -> str:
    """Yield each preprocessor backend identifier in turn."""
    ptype: str = request.param
    return ptype


@pytest.fixture(params=["linear", "letterbox"])
def resize_method(request: pytest.FixtureRequest) -> str:
    """Yield each resize method identifier in turn."""
    method: str = request.param
    return method


# ---------------------------------------------------------------------------
# Preprocessor factory
# ---------------------------------------------------------------------------
@pytest.fixture
def make_preprocessor(
    _trt_build_supported: bool,
) -> Callable[..., CPUPreprocessor | CUDAPreprocessor | TRTPreprocessor]:
    """Return a factory that builds preprocessors by type."""

    def _make(
        ptype: str,
        *,
        mean: tuple[float, float, float] | None = None,
        std: tuple[float, float, float] | None = None,
        batch_size: int = 4,
    ) -> CPUPreprocessor | CUDAPreprocessor | TRTPreprocessor:
        # All backends share the same size/range/dtype configuration.
        common = (PREPROC_SIZE, PREPROC_RANGE, PREPROC_DTYPE)
        if ptype == "trt":
            # TRT additionally needs a working engine builder and a batch size.
            if not _trt_build_supported:
                pytest.skip("TRT cannot build engines for this GPU")
            return TRTPreprocessor(*common, mean=mean, std=std, batch_size=batch_size)
        simple_backends = {"cpu": CPUPreprocessor, "cuda": CUDAPreprocessor}
        if ptype in simple_backends:
            return simple_backends[ptype](*common, mean=mean, std=std)
        err_msg = f"Unknown preprocessor type: {ptype}"
        raise ValueError(err_msg)

    return _make


# ---------------------------------------------------------------------------
# Output mock generators
# ---------------------------------------------------------------------------
@pytest.fixture
def make_yolov10_output() -> Callable[[int, int], list[np.ndarray]]:
    """Return a factory for YOLOv10-like outputs.

    The factory produces a single (batch, 300, 6) float32 array where each
    populated row is [x1, y1, x2, y2, score, class_id]; only the first
    ``num_dets`` rows per batch item are non-zero.
    """

    def _make(batch_size: int, num_dets: int = 10) -> list[np.ndarray]:
        output = np.zeros((batch_size, 300, 6), dtype=np.float32)
        for b in range(batch_size):
            # Per-batch offset is invariant w.r.t. the detection index;
            # hoisted out of the inner loop (matches the sibling factories).
            offset = b * 50
            for i in range(num_dets):
                output[b, i] = [
                    100 + i * 10 + offset,
                    100 + i * 10 + offset,
                    200 + i * 10 + offset,
                    200 + i * 10 + offset,
                    0.9 - i * 0.05,
                    i % 10,
                ]
        return [output]

    return _make


@pytest.fixture
def make_efficient_nms_output() -> Callable[[int, int], list[np.ndarray]]:
    """Return a factory for EfficientNMS-like outputs."""

    def _make(batch_size: int, num_dets: int = 10) -> list[np.ndarray]:
        max_dets = 100
        # Four-tensor EfficientNMS layout: counts, boxes, scores, classes.
        counts = np.full((batch_size,), num_dets, dtype=np.int32)
        boxes = np.zeros((batch_size, max_dets, 4), dtype=np.float32)
        confs = np.zeros((batch_size, max_dets), dtype=np.float32)
        classes = np.zeros((batch_size, max_dets), dtype=np.float32)
        for b in range(batch_size):
            shift = b * 50
            for i in range(num_dets):
                x1 = 100 + i * 10 + shift
                # Boxes are 100px squares marching diagonally per detection.
                boxes[b, i] = [x1, x1, x1 + 100, x1 + 100]
                confs[b, i] = 0.9 - i * 0.05
                classes[b, i] = i % 10
        return [counts, boxes, confs, classes]

    return _make


@pytest.fixture
def make_rfdetr_output() -> Callable[[int, int, int, int], list[np.ndarray]]:
    """Return a factory for RF-DETR-like outputs."""

    def _make(
        batch_size: int, num_queries: int = 300, num_classes: int = 80, num_dets: int = 10
    ) -> list[np.ndarray]:
        boxes = np.zeros((batch_size, num_queries, 4), dtype=np.float32)
        # Background logit everywhere; populated queries get one high class.
        logits = np.full((batch_size, num_queries, num_classes), -10.0, dtype=np.float32)
        side = 100 / 640.0
        for b in range(batch_size):
            for i in range(num_dets):
                # Normalized cx/cy share the same expression; fixed w/h.
                center = (150 + i * 10 + b * 30) / 640.0
                boxes[b, i] = [center, center, side, side]
                logits[b, i, i % num_classes] = 5.0 - i * 0.3
        return [boxes, logits]

    return _make


@pytest.fixture
def make_detr_output() -> Callable[[int, int, int], list[np.ndarray]]:
    """Return a factory for DETR-like outputs."""

    def _make(batch_size: int, num_queries: int = 300, num_dets: int = 10) -> list[np.ndarray]:
        scores = np.zeros((batch_size, num_queries), dtype=np.float32)
        labels = np.zeros((batch_size, num_queries), dtype=np.float32)
        boxes = np.zeros((batch_size, num_queries, 4), dtype=np.float32)
        for b in range(batch_size):
            shift = b * 50
            for i in range(num_dets):
                lo = 100 + i * 10 + shift
                hi = 200 + i * 10 + shift
                scores[b, i] = 0.9 - i * 0.05
                labels[b, i] = i % 10
                boxes[b, i] = [lo, lo, hi, hi]
        return [scores, labels, boxes]

    return _make


@pytest.fixture
def make_classification_output() -> Callable[[int, int], list[np.ndarray]]:
    """Return a factory for classification outputs."""
    rng = np.random.default_rng()

    def _make(batch_size: int, num_classes: int = 1000) -> list[np.ndarray]:
        # Random background logits with two deterministic peaks per sample,
        # so top-1/top-2 predictions are known regardless of the noise.
        logits = rng.standard_normal((batch_size, num_classes)).astype(np.float32)
        for b in range(batch_size):
            top = b % num_classes
            runner_up = (b + 1) % num_classes
            logits[b, top] = 10.0
            logits[b, runner_up] = 8.0
        return [logits]

    return _make


# ---------------------------------------------------------------------------
# Ratios/padding factory
# ---------------------------------------------------------------------------
@pytest.fixture
def make_ratios_padding() -> Callable[
    [int], tuple[list[tuple[float, float]], list[tuple[float, float]]]
]:
    """Return a factory for ratios and padding lists."""

    def _make(batch_size: int) -> tuple[list[tuple[float, float]], list[tuple[float, float]]]:
        # Identity transform: no scaling, no padding.  Tuples are immutable,
        # so list repetition is safe here.
        identity_ratios = [(1.0, 1.0)] * batch_size
        zero_padding = [(0.0, 0.0)] * batch_size
        return identity_ratios, zero_padding

    return _make
16 changes: 16 additions & 0 deletions tests/image/kernels/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright (c) 2025-2026 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
from __future__ import annotations

import pytest


@pytest.fixture
def cuda_stream():
    """Yield a fresh CUDA stream and destroy it after the test.

    The teardown is wrapped in ``try``/``finally`` so the stream is released
    even if the fixture generator is closed early (e.g. an error thrown into
    the generator during interrupted teardown), avoiding a leaked stream.
    """
    from trtutils.core import create_stream, destroy_stream

    stream = create_stream()
    try:
        yield stream
    finally:
        destroy_stream(stream)
143 changes: 143 additions & 0 deletions tests/image/kernels/test_letterbox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# Copyright (c) 2026 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
# mypy: disable-error-code="misc"
"""Tests for the letterbox resize CUDA kernel."""

from __future__ import annotations

import math
from pathlib import Path

import cv2
import numpy as np
import pytest

from trtutils.core import (
Kernel,
create_binding,
create_stream,
destroy_stream,
memcpy_device_to_host_async,
memcpy_host_to_device_async,
stream_synchronize,
)
from trtutils.image import kernels

try:
from cv2ext.image import letterbox as cv2ext_letterbox # type: ignore[import-untyped]

_CV2EXT_AVAILABLE = True
except ImportError:
cv2ext_letterbox = None # type: ignore[assignment]
_CV2EXT_AVAILABLE = False

# Repository-level ``data`` directory (four levels up from this test file)
# holding the shared test fixture image.
_DATA_DIR = Path(__file__).parent.parent.parent.parent / "data"
_HORSE_IMAGE_PATH = _DATA_DIR / "horse.jpg"


def _run_letterbox_kernel(
    img: np.ndarray,
    output_shape: tuple[int, int],
) -> np.ndarray:
    """Run the letterbox kernel on ``img`` and return the uint8 HWC result.

    Args:
        img: Input image, assumed HWC uint8 layout.
        output_shape: Target size as (width, height).

    Returns:
        The letterboxed image with shape (height, width, 3), dtype uint8.
    """
    o_width, o_height = output_shape
    height, width = img.shape[:2]

    stream = create_stream()

    num_threads: tuple[int, int, int] = (32, 32, 1)
    # Grid x covers output width, grid y covers output height.  num_threads
    # indices 0/1 are the block's x/y extents respectively -- the original
    # used them swapped, which was only harmless because both were 32.
    num_blocks: tuple[int, int, int] = (
        math.ceil(o_width / num_threads[0]),
        math.ceil(o_height / num_threads[1]),
        1,
    )

    input_binding = create_binding(img, is_input=True)
    dummy_output = np.zeros((o_height, o_width, 3), dtype=np.uint8)
    output_binding = create_binding(dummy_output, pagelocked_mem=True)

    # CPU-side letterbox math mirrored for the kernel: uniform scale to fit,
    # then center the resized content with symmetric padding.
    scale = min(o_width / width, o_height / height)
    new_width = int(width * scale)
    new_height = int(height * scale)
    pad_x = int((o_width - new_width) / 2)
    pad_y = int((o_height - new_height) / 2)

    kernel = Kernel(kernels.LETTERBOX_RESIZE[0], kernels.LETTERBOX_RESIZE[1])
    try:
        args = kernel.create_args(
            input_binding.allocation,
            output_binding.allocation,
            width,
            height,
            o_width,
            o_height,
            pad_x,
            pad_y,
            new_width,
            new_height,
        )

        memcpy_host_to_device_async(input_binding.allocation, img, stream)
        kernel.call(num_blocks, num_threads, stream, args)
        memcpy_device_to_host_async(
            output_binding.host_allocation, output_binding.allocation, stream
        )
        stream_synchronize(stream)

        result = output_binding.host_allocation.copy()
    finally:
        # Release GPU resources even if the kernel launch or a copy fails,
        # so one failing test does not leak stream/bindings for the rest.
        destroy_stream(stream)
        input_binding.free()
        output_binding.free()
        kernel.free()

    return result


class TestLetterboxKernel:
    """Tests for the letterbox CUDA kernel."""

    def test_compiles(self) -> None:
        """Letterbox kernel compiles without error."""
        stream = create_stream()
        compiled = Kernel(kernels.LETTERBOX_RESIZE[0], kernels.LETTERBOX_RESIZE[1])
        assert compiled is not None
        # Release the compiled module as well, not just the stream.
        compiled.free()
        destroy_stream(stream)

    @pytest.mark.skipif(not _CV2EXT_AVAILABLE, reason="cv2ext not installed")
    def test_correctness_against_cv2ext(self) -> None:
        """GPU letterbox result matches cv2ext.letterbox()."""
        if not _HORSE_IMAGE_PATH.exists():
            pytest.skip("Horse test image not found")
        img = cv2.imread(str(_HORSE_IMAGE_PATH))
        if img is None:
            pytest.skip("Failed to read test image")

        output_shape = (640, 480)
        assert cv2ext_letterbox is not None
        resized_img, _, _ = cv2ext_letterbox(img, output_shape)  # type: ignore[misc]
        cuda_result = _run_letterbox_kernel(img, output_shape)

        assert cuda_result.shape == resized_img.shape
        cpu_mean = np.mean(resized_img)
        assert cpu_mean - 0.5 <= np.mean(cuda_result) <= cpu_mean + 0.5
        diff_mask = np.any(resized_img != cuda_result, axis=-1)
        # Guard the all-equal case: mean of an empty selection is NaN, and
        # `nan < 1.0` is False -- the original failed exactly when the GPU
        # result matched the CPU result perfectly.
        if diff_mask.any():
            # Cast to a signed dtype before subtracting: uint8 arithmetic
            # wraps around (1 - 2 == 255), which inflated the average diff.
            cpu_px = resized_img[diff_mask].astype(np.int16)
            gpu_px = cuda_result[diff_mask].astype(np.int16)
            avg_diff = np.mean(np.abs(cpu_px - gpu_px))
            assert avg_diff < 1.0

    @pytest.mark.parametrize(
        "output_shape",
        [(640, 640), (416, 416), (320, 320)],
        ids=["640x640", "416x416", "320x320"],
    )
    def test_various_target_sizes(self, output_shape: tuple[int, int]) -> None:
        """Letterbox kernel works with various target sizes."""
        if not _HORSE_IMAGE_PATH.exists():
            pytest.skip("Horse test image not found")
        img = cv2.imread(str(_HORSE_IMAGE_PATH))
        if img is None:
            pytest.skip("Failed to read test image")

        o_width, o_height = output_shape
        result = _run_letterbox_kernel(img, output_shape)
        assert result.shape == (o_height, o_width, 3)
Loading
Loading