From 473bb9c7e596ba37605a90d3591a4c70211401b4 Mon Sep 17 00:00:00 2001 From: JonasWurst Date: Tue, 13 Jan 2026 16:23:01 +0100 Subject: [PATCH 1/6] Adding models: video and object detection track --- .../model/object_detection_track.py | 69 +++++++++++++++++++ src/labelformat/model/video.py | 11 +++ .../unit/model/test_object_detection_track.py | 55 +++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 src/labelformat/model/object_detection_track.py create mode 100644 src/labelformat/model/video.py create mode 100644 tests/unit/model/test_object_detection_track.py diff --git a/src/labelformat/model/object_detection_track.py b/src/labelformat/model/object_detection_track.py new file mode 100644 index 0000000..ed1fff5 --- /dev/null +++ b/src/labelformat/model/object_detection_track.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from argparse import ArgumentParser +from dataclasses import dataclass +from typing import Iterable, List + +from labelformat.model.bounding_box import BoundingBox +from labelformat.model.category import Category +from labelformat.model.video import Video + + +@dataclass(frozen=True) +class SingleObjectDetectionTrack: + category: Category + boxes: list[BoundingBox | None] + # TODO (Jonas, 01/2026): Add confidence + + +@dataclass(frozen=True) +class VideoObjectDetectionTrack: + """ + The base class for a video together with its object detection track annotations. + A video consists of N frames and M objects. Each object contains N boxes. + The number of frames and the number of annotations for each object must match + --> one annotation per frame. + If an object is not present in a frame, the corresponding entry has to be None. 
+ """ + + video: Video + objects: List[SingleObjectDetectionTrack] + + def __post_init__(self) -> None: + number_of_frames = self.video.number_of_frames + + for track in self.objects: + if len(track.boxes) != number_of_frames: + raise ValueError( + "Length of object detection track does not match the number of frames in the video." + ) + + +class ObjectDetectionTrackInput(ABC): + @staticmethod + @abstractmethod + def add_cli_arguments(parser: ArgumentParser) -> None: + raise NotImplementedError() + + @abstractmethod + def get_categories(self) -> Iterable[Category]: + raise NotImplementedError() + + @abstractmethod + def get_videos(self) -> Iterable[Video]: + raise NotImplementedError() + + @abstractmethod + def get_labels(self) -> Iterable[VideoObjectDetectionTrack]: + raise NotImplementedError() + + +class ObjectDetectionTrackOutput(ABC): + @staticmethod + @abstractmethod + def add_cli_arguments(parser: ArgumentParser) -> None: + raise NotImplementedError() + + def save(self, label_input: ObjectDetectionTrackInput) -> None: + raise NotImplementedError() diff --git a/src/labelformat/model/video.py b/src/labelformat/model/video.py new file mode 100644 index 0000000..d6f272c --- /dev/null +++ b/src/labelformat/model/video.py @@ -0,0 +1,11 @@ +from dataclasses import dataclass + + +@dataclass(frozen=True) +class Video: + id: int + filename: str + width: int + height: int + number_of_frames: int + # TODO (Jonas, 01/2026): Add list of frames diff --git a/tests/unit/model/test_object_detection_track.py b/tests/unit/model/test_object_detection_track.py new file mode 100644 index 0000000..ea7c52e --- /dev/null +++ b/tests/unit/model/test_object_detection_track.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +import pytest + +from labelformat.model.bounding_box import BoundingBox +from labelformat.model.category import Category +from labelformat.model.object_detection_track import ( + SingleObjectDetectionTrack, + VideoObjectDetectionTrack, +) +from 
labelformat.model.video import Video + + +class TestVideoObjectDetectionTrack: + def test_frames_equal_boxes_length__valid(self) -> None: + track_a = SingleObjectDetectionTrack( + category=Category(id=0, name="cat"), + boxes=[BoundingBox(xmin=0, ymin=0, xmax=1, ymax=1) for _ in range(2)], + ) + + track_b = SingleObjectDetectionTrack( + category=Category(id=1, name="dog"), + boxes=[BoundingBox(xmin=0, ymin=0, xmax=1, ymax=1) for _ in range(2)], + ) + + video = Video(id=0, filename="test.mov", width=1, height=1, number_of_frames=2) + + detections = VideoObjectDetectionTrack( + video=video, + objects=[track_a, track_b], + ) + assert len(detections.objects) == 2 + assert len(detections.objects[0].boxes) == 2 + + def test_frames_equal_boxes_length___invalid(self) -> None: + track_a = SingleObjectDetectionTrack( + category=Category(id=0, name="cat"), + boxes=[BoundingBox(xmin=0, ymin=0, xmax=1, ymax=1) for _ in range(2)], + ) + + track_b = SingleObjectDetectionTrack( + category=Category(id=1, name="dog"), + boxes=[BoundingBox(xmin=0, ymin=0, xmax=1, ymax=1) for _ in range(3)], + ) + + video = Video(id=0, filename="test.mov", width=1, height=1, number_of_frames=2) + + with pytest.raises( + ValueError, + match="Length of object detection track does not match the number of frames in the video.", + ): + VideoObjectDetectionTrack( + video=video, + objects=[track_a, track_b], + ) From 6bd5ed3dcd6d48e386dd7286cf309571cb11c5c0 Mon Sep 17 00:00:00 2001 From: JonasWurst Date: Wed, 14 Jan 2026 10:28:25 +0100 Subject: [PATCH 2/6] Add youtubevis input format --- src/labelformat/formats/__init__.py | 2 + src/labelformat/formats/youtubevis.py | 93 +++++++++++++++++++++++++++ tests/unit/formats/test_youtubevis.py | 92 ++++++++++++++++++++++++++ 3 files changed, 187 insertions(+) create mode 100644 src/labelformat/formats/youtubevis.py create mode 100644 tests/unit/formats/test_youtubevis.py diff --git a/src/labelformat/formats/__init__.py b/src/labelformat/formats/__init__.py index 
47bbd18..76a513c 100644 --- a/src/labelformat/formats/__init__.py +++ b/src/labelformat/formats/__init__.py @@ -65,6 +65,7 @@ YOLOv26ObjectDetectionInput, YOLOv26ObjectDetectionOutput, ) +from labelformat.formats.youtubevis import YouTubeVISObjectDetectionTrackInput __all__ = [ "COCOInstanceSegmentationInput", @@ -105,4 +106,5 @@ "YOLOv26ObjectDetectionInput", "YOLOv26ObjectDetectionOutput", "MaskPairInstanceSegmentationInput", + "YouTubeVISObjectDetectionTrackInput", ] diff --git a/src/labelformat/formats/youtubevis.py b/src/labelformat/formats/youtubevis.py new file mode 100644 index 0000000..60108c6 --- /dev/null +++ b/src/labelformat/formats/youtubevis.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +import json +from argparse import ArgumentParser +from pathlib import Path +from typing import Dict, Iterable, List + +from labelformat.model.bounding_box import BoundingBox, BoundingBoxFormat +from labelformat.model.category import Category +from labelformat.model.object_detection_track import ( + ObjectDetectionTrackInput, + SingleObjectDetectionTrack, + VideoObjectDetectionTrack, +) +from labelformat.model.video import Video +from labelformat.types import JsonDict + + +class YouTubeVISObjectDetectionTrackInput(ObjectDetectionTrackInput): + @staticmethod + def add_cli_arguments(parser: ArgumentParser) -> None: + parser.add_argument( + "--input-file", + type=Path, + required=True, + help="Path to input YouTube-VIS JSON file", + ) + + def __init__(self, input_file: Path) -> None: + with input_file.open() as file: + self._data = json.load(file) + + def get_categories(self) -> Iterable[Category]: + for category in self._data["categories"]: + yield Category( + id=category["id"], + name=category["name"], + ) + + def get_videos(self) -> Iterable[Video]: + for video in self._data["videos"]: + yield Video( + id=video["id"], + # TODO (Jonas, 1/2026): The file_names do not hold the video file extension. Solution required. 
+ filename=Path(video["file_names"][0]).parent.name, + width=int(video["width"]), + height=int(video["height"]), + number_of_frames=int(video["length"]), + ) + + def get_labels(self) -> Iterable[VideoObjectDetectionTrack]: + video_id_to_video = {video.id: video for video in self.get_videos()} + category_id_to_category = { + category.id: category for category in self.get_categories() + } + video_id_to_tracks: Dict[int, List[JsonDict]] = { + video_id: [] for video_id in video_id_to_video.keys() + } + for ann in self._data["annotations"]: + video_id_to_tracks[ann["video_id"]].append(ann) + + for video_id, tracks in video_id_to_tracks.items(): + video = video_id_to_video[video_id] + objects = [] + for track in tracks: + boxes = _get_object_track_boxes(ann=track) + objects.append( + SingleObjectDetectionTrack( + category=category_id_to_category[track["category_id"]], + boxes=boxes, + ) + ) + yield VideoObjectDetectionTrack( + video=video, + objects=objects, + ) + + +def _get_object_track_boxes( + ann: JsonDict, +) -> list[BoundingBox | None]: + boxes: list[BoundingBox | None] = [] + for bbox in ann["bboxes"]: + if bbox is None or len(bbox) == 0: + boxes.append(None) + continue + boxes.append( + BoundingBox.from_format( + bbox=[float(x) for x in bbox], + format=BoundingBoxFormat.XYWH, + ) + ) + return boxes diff --git a/tests/unit/formats/test_youtubevis.py b/tests/unit/formats/test_youtubevis.py new file mode 100644 index 0000000..b5218b7 --- /dev/null +++ b/tests/unit/formats/test_youtubevis.py @@ -0,0 +1,92 @@ +import json +from pathlib import Path + +from labelformat.formats.youtubevis import YouTubeVISObjectDetectionTrackInput +from labelformat.model.bounding_box import BoundingBox +from labelformat.model.category import Category +from labelformat.model.object_detection_track import ( + SingleObjectDetectionTrack, + VideoObjectDetectionTrack, +) +from labelformat.model.video import Video + + +class TestYouTubeVISObjectDetectionTrackInput: + def 
test_get_categories(self, tmp_path: Path) -> None: + input_file = _write_youtube_vis_json(tmp_path) + label_input = YouTubeVISObjectDetectionTrackInput(input_file=input_file) + + assert list(label_input.get_categories()) == [Category(id=1, name="cat")] + + def test_get_videos(self, tmp_path: Path) -> None: + input_file = _write_youtube_vis_json(tmp_path) + label_input = YouTubeVISObjectDetectionTrackInput(input_file=input_file) + + assert list(label_input.get_videos()) == [ + Video( + id=5, + filename="video1", + width=640, + height=480, + number_of_frames=2, + ) + ] + + def test_get_labels(self, tmp_path: Path) -> None: + input_file = _write_youtube_vis_json(tmp_path) + label_input = YouTubeVISObjectDetectionTrackInput(input_file=input_file) + + assert list(label_input.get_labels()) == [ + VideoObjectDetectionTrack( + video=Video( + id=5, + filename="video1", + width=640, + height=480, + number_of_frames=2, + ), + objects=[ + SingleObjectDetectionTrack( + category=Category(id=1, name="cat"), + boxes=[ + BoundingBox( + xmin=10.0, + ymin=20.0, + xmax=40.0, + ymax=60.0, + ), + None, + ], + ) + ], + ) + ] + +def _write_youtube_vis_json(tmp_path: Path) -> Path: + data = { + "categories": [ + {"id": 1, "name": "cat"}, + ], + "videos": [ + { + "id": 5, + "file_names": ["video1/00000.jpg", "video1/00001.jpg"], + "width": 640, + "height": 480, + "length": 2, + } + ], + "annotations": [ + { + "video_id": 5, + "category_id": 1, + "bboxes": [ + [10.0, 20.0, 30.0, 40.0], + None, + ], + } + ], + } + input_file = tmp_path / "instances.json" + input_file.write_text(json.dumps(data)) + return input_file \ No newline at end of file From 1b4afd5527290b75751e6b57918ea5d7885b1edf Mon Sep 17 00:00:00 2001 From: JonasWurst Date: Wed, 14 Jan 2026 15:13:54 +0100 Subject: [PATCH 3/6] Video support: Add youtubevis input format Adding the input format for youtube-vis. 
Note: This PR was inspired by #27 by @fardinayar --- .python-version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.python-version b/.python-version index 36f601f..cc1923a 100644 --- a/.python-version +++ b/.python-version @@ -1 +1 @@ -3.7.16 +3.8 From 198296851bc58d0678a48af6fbcb1f5bfefe6d86 Mon Sep 17 00:00:00 2001 From: JonasWurst Date: Wed, 14 Jan 2026 15:14:38 +0100 Subject: [PATCH 4/6] revert python version --- .python-version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.python-version b/.python-version index cc1923a..36f601f 100644 --- a/.python-version +++ b/.python-version @@ -1 +1 @@ -3.8 +3.7.16 From 230d90757aa2b554e78b8f4d5d5c533b9676f410 Mon Sep 17 00:00:00 2001 From: JonasWurst Date: Thu, 15 Jan 2026 12:44:58 +0100 Subject: [PATCH 5/6] Review comments --- src/labelformat/formats/youtubevis.py | 12 ++++++------ tests/unit/formats/test_youtubevis.py | 11 +++++------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/labelformat/formats/youtubevis.py b/src/labelformat/formats/youtubevis.py index 60108c6..be8c273 100644 --- a/src/labelformat/formats/youtubevis.py +++ b/src/labelformat/formats/youtubevis.py @@ -83,11 +83,11 @@ def _get_object_track_boxes( for bbox in ann["bboxes"]: if bbox is None or len(bbox) == 0: boxes.append(None) - continue - boxes.append( - BoundingBox.from_format( - bbox=[float(x) for x in bbox], - format=BoundingBoxFormat.XYWH, + else: + boxes.append( + BoundingBox.from_format( + bbox=[float(x) for x in bbox], + format=BoundingBoxFormat.XYWH, + ) ) - ) return boxes diff --git a/tests/unit/formats/test_youtubevis.py b/tests/unit/formats/test_youtubevis.py index b5218b7..47ef4ff 100644 --- a/tests/unit/formats/test_youtubevis.py +++ b/tests/unit/formats/test_youtubevis.py @@ -13,13 +13,13 @@ class TestYouTubeVISObjectDetectionTrackInput: def test_get_categories(self, tmp_path: Path) -> None: - input_file = _write_youtube_vis_json(tmp_path) + input_file = 
_write_youtube_vis_json(tmp_path / "instances.json") label_input = YouTubeVISObjectDetectionTrackInput(input_file=input_file) assert list(label_input.get_categories()) == [Category(id=1, name="cat")] def test_get_videos(self, tmp_path: Path) -> None: - input_file = _write_youtube_vis_json(tmp_path) + input_file = _write_youtube_vis_json(tmp_path / "instances.json") label_input = YouTubeVISObjectDetectionTrackInput(input_file=input_file) assert list(label_input.get_videos()) == [ @@ -33,7 +33,7 @@ def test_get_videos(self, tmp_path: Path) -> None: ] def test_get_labels(self, tmp_path: Path) -> None: - input_file = _write_youtube_vis_json(tmp_path) + input_file = _write_youtube_vis_json(tmp_path / "instances.json") label_input = YouTubeVISObjectDetectionTrackInput(input_file=input_file) assert list(label_input.get_labels()) == [ @@ -62,7 +62,7 @@ def test_get_labels(self, tmp_path: Path) -> None: ) ] -def _write_youtube_vis_json(tmp_path: Path) -> Path: +def _write_youtube_vis_json(input_file: Path) -> Path: data = { "categories": [ {"id": 1, "name": "cat"}, @@ -87,6 +87,5 @@ def _write_youtube_vis_json(tmp_path: Path) -> Path: } ], } - input_file = tmp_path / "instances.json" input_file.write_text(json.dumps(data)) - return input_file \ No newline at end of file + return input_file From 59d644150b798c3dcde769e390f0786bad54bc0b Mon Sep 17 00:00:00 2001 From: JonasWurst Date: Thu, 15 Jan 2026 12:47:38 +0100 Subject: [PATCH 6/6] format --- tests/unit/formats/test_youtubevis.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/formats/test_youtubevis.py b/tests/unit/formats/test_youtubevis.py index 47ef4ff..e176720 100644 --- a/tests/unit/formats/test_youtubevis.py +++ b/tests/unit/formats/test_youtubevis.py @@ -62,6 +62,7 @@ def test_get_labels(self, tmp_path: Path) -> None: ) ] + def _write_youtube_vis_json(input_file: Path) -> Path: data = { "categories": [