lightly-ai · JonasWurst · Jan 15, 2026 · Jan 13, 2026 · Jan 14, 2026 · Jan 14, 2026
diff --git a/src/labelformat/formats/__init__.py b/src/labelformat/formats/__init__.py
@@ -65,6 +65,7 @@
     YOLOv26ObjectDetectionInput,
     YOLOv26ObjectDetectionOutput,
 )
+from labelformat.formats.youtubevis import YouTubeVISObjectDetectionTrackInput
 
 __all__ = [
     "COCOInstanceSegmentationInput",
@@ -105,4 +106,5 @@
     "YOLOv26ObjectDetectionInput",
     "YOLOv26ObjectDetectionOutput",
     "MaskPairInstanceSegmentationInput",
+    "YouTubeVISObjectDetectionTrackInput",
 ]
diff --git a/src/labelformat/formats/youtubevis.py b/src/labelformat/formats/youtubevis.py
@@ -0,0 +1,93 @@
+from __future__ import annotations
+
+import json
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import Dict, Iterable, List
+
+from labelformat.model.bounding_box import BoundingBox, BoundingBoxFormat
+from labelformat.model.category import Category
+from labelformat.model.object_detection_track import (
+    ObjectDetectionTrackInput,
+    SingleObjectDetectionTrack,
+    VideoObjectDetectionTrack,
+)
+from labelformat.model.video import Video
+from labelformat.types import JsonDict
+
+
+class YouTubeVISObjectDetectionTrackInput(ObjectDetectionTrackInput):
+    """Object detection track input backed by a YouTube-VIS JSON file.
+
+    The JSON document is loaded once in the constructor. The accessors read
+    its "categories", "videos" and "annotations" lists. Each annotation is
+    treated as one object track whose "bboxes" list is converted entry by
+    entry into bounding boxes.
+    """
+
+    @staticmethod
+    def add_cli_arguments(parser: ArgumentParser) -> None:
+        """Register the required ``--input-file`` option on ``parser``."""
+        parser.add_argument(
+            "--input-file",
+            type=Path,
+            required=True,
+            help="Path to input YouTube-VIS JSON file",
+        )
+
+    def __init__(self, input_file: Path) -> None:
+        """Load the annotation JSON found at ``input_file``."""
+        # JSON interchange text is UTF-8; do not depend on the locale default
+        # encoding of the platform the conversion runs on.
+        with input_file.open(encoding="utf-8") as file:
+            self._data = json.load(file)
+
+    def get_categories(self) -> Iterable[Category]:
+        """Yield one ``Category`` per entry of the "categories" list."""
+        for category in self._data["categories"]:
+            yield Category(
+                id=category["id"],
+                name=category["name"],
+            )
+
+    def get_videos(self) -> Iterable[Video]:
+        """Yield one ``Video`` per entry of the "videos" list.
+
+        The video filename is the name of the parent directory of the first
+        entry in "file_names". Width, height and length are converted to int.
+        """
+        for video in self._data["videos"]:
+            yield Video(
+                id=video["id"],
+                # TODO (Jonas, 1/2026): The file_names do not hold the video file extension. Solution required.
+                filename=Path(video["file_names"][0]).parent.name,
+                width=int(video["width"]),
+                height=int(video["height"]),
+                number_of_frames=int(video["length"]),
+            )
+
+    def get_labels(self) -> Iterable[VideoObjectDetectionTrack]:
+        """Yield one ``VideoObjectDetectionTrack`` per video.
+
+        Annotations are grouped by their "video_id". A video without any
+        annotation is still yielded, with an empty ``objects`` list.
+
+        Raises:
+            KeyError: If an annotation references a "video_id" or a
+                "category_id" that is not declared in the file.
+        """
+        video_id_to_video = {video.id: video for video in self.get_videos()}
+        category_id_to_category = {
+            category.id: category for category in self.get_categories()
+        }
+        video_id_to_tracks: Dict[int, List[JsonDict]] = {
+            video_id: [] for video_id in video_id_to_video
+        }
+        for ann in self._data["annotations"]:
+            video_id_to_tracks[ann["video_id"]].append(ann)
+
+        for video_id, tracks in video_id_to_tracks.items():
+            # The category is read from the track being converted. Reading it
+            # from the grouping loop's leftover variable would assign the
+            # category of the last annotation in the file to every track.
+            objects = [
+                SingleObjectDetectionTrack(
+                    category=category_id_to_category[track["category_id"]],
+                    boxes=_get_object_track_boxes(ann=track),
+                )
+                for track in tracks
+            ]
+            yield VideoObjectDetectionTrack(
+                video=video_id_to_video[video_id],
+                objects=objects,
+            )
+
+
+def _get_object_track_boxes(
+    ann: JsonDict,
+) -> list[BoundingBox | None]:
+    """Convert the "bboxes" list of one annotation into bounding boxes.
+
+    An entry that is ``None`` or has length zero becomes ``None``. Any other
+    entry is converted to floats and parsed as an XYWH box. The result has
+    one element per entry, in the same order.
+    """
+    return [
+        None
+        if raw_box is None or len(raw_box) == 0
+        else BoundingBox.from_format(
+            bbox=[float(value) for value in raw_box],
+            format=BoundingBoxFormat.XYWH,
+        )
+        for raw_box in ann["bboxes"]
+    ]
diff --git a/tests/unit/formats/test_youtubevis.py b/tests/unit/formats/test_youtubevis.py
@@ -0,0 +1,92 @@
+import json
+from pathlib import Path
+
+from labelformat.formats.youtubevis import YouTubeVISObjectDetectionTrackInput
+from labelformat.model.bounding_box import BoundingBox
+from labelformat.model.category import Category
+from labelformat.model.object_detection_track import (
+    SingleObjectDetectionTrack,
+    VideoObjectDetectionTrack,
+)
+from labelformat.model.video import Video
+
+
+class TestYouTubeVISObjectDetectionTrackInput:
+    """Unit tests for ``YouTubeVISObjectDetectionTrackInput``.
+
+    Every test parses the single-video fixture file written by
+    ``_write_youtube_vis_json``.
+    """
+
+    def test_get_categories(self, tmp_path: Path) -> None:
+        """The single fixture category is returned."""
+        json_path = _write_youtube_vis_json(tmp_path / "instances.json")
+        parsed = YouTubeVISObjectDetectionTrackInput(input_file=json_path)
+
+        categories = list(parsed.get_categories())
+
+        assert categories == [Category(id=1, name="cat")]
+
+    def test_get_videos(self, tmp_path: Path) -> None:
+        """The single fixture video is returned with its metadata."""
+        json_path = _write_youtube_vis_json(tmp_path / "instances.json")
+        parsed = YouTubeVISObjectDetectionTrackInput(input_file=json_path)
+        expected_video = Video(
+            id=5,
+            filename="video1",
+            width=640,
+            height=480,
+            number_of_frames=2,
+        )
+
+        videos = list(parsed.get_videos())
+
+        assert videos == [expected_video]
+
+    def test_get_labels(self, tmp_path: Path) -> None:
+        """The fixture track yields one box followed by a ``None`` entry."""
+        json_path = _write_youtube_vis_json(tmp_path / "instances.json")
+        parsed = YouTubeVISObjectDetectionTrackInput(input_file=json_path)
+        expected_video = Video(
+            id=5,
+            filename="video1",
+            width=640,
+            height=480,
+            number_of_frames=2,
+        )
+        expected_track = SingleObjectDetectionTrack(
+            category=Category(id=1, name="cat"),
+            boxes=[
+                BoundingBox(
+                    xmin=10.0,
+                    ymin=20.0,
+                    xmax=40.0,
+                    ymax=60.0,
+                ),
+                None,
+            ],
+        )
+
+        labels = list(parsed.get_labels())
+
+        assert labels == [
+            VideoObjectDetectionTrack(
+                video=expected_video,
+                objects=[expected_track],
+            )
+        ]
+
+
+def _write_youtube_vis_json(input_file: Path) -> Path:
+    """Write a minimal YouTube-VIS fixture to ``input_file`` and return the path.
+
+    The fixture holds one category, one two-frame video and one annotation
+    whose second "bboxes" entry is ``None``.
+    """
+    categories = [
+        {"id": 1, "name": "cat"},
+    ]
+    video = {
+        "id": 5,
+        "file_names": ["video1/00000.jpg", "video1/00001.jpg"],
+        "width": 640,
+        "height": 480,
+        "length": 2,
+    }
+    annotation = {
+        "video_id": 5,
+        "category_id": 1,
+        "bboxes": [
+            [10.0, 20.0, 30.0, 40.0],
+            None,
+        ],
+    }
+    # Key order is kept as-is so the serialized text stays the same.
+    payload = {
+        "categories": categories,
+        "videos": [video],
+        "annotations": [annotation],
+    }
+    serialized = json.dumps(payload)
+    input_file.write_text(serialized)
+    return input_file