diff --git a/src/labelformat/formats/__init__.py b/src/labelformat/formats/__init__.py
index 47bbd18..76a513c 100644
--- a/src/labelformat/formats/__init__.py
+++ b/src/labelformat/formats/__init__.py
@@ -65,6 +65,7 @@
     YOLOv26ObjectDetectionInput,
     YOLOv26ObjectDetectionOutput,
 )
+from labelformat.formats.youtubevis import YouTubeVISObjectDetectionTrackInput
 
 __all__ = [
     "COCOInstanceSegmentationInput",
@@ -105,4 +106,5 @@
     "YOLOv26ObjectDetectionInput",
     "YOLOv26ObjectDetectionOutput",
     "MaskPairInstanceSegmentationInput",
+    "YouTubeVISObjectDetectionTrackInput",
 ]
diff --git a/src/labelformat/formats/youtubevis.py b/src/labelformat/formats/youtubevis.py
new file mode 100644
index 0000000..be8c273
--- /dev/null
+++ b/src/labelformat/formats/youtubevis.py
@@ -0,0 +1,104 @@
+from __future__ import annotations
+
+import json
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import Dict, Iterable, List
+
+from labelformat.model.bounding_box import BoundingBox, BoundingBoxFormat
+from labelformat.model.category import Category
+from labelformat.model.object_detection_track import (
+    ObjectDetectionTrackInput,
+    SingleObjectDetectionTrack,
+    VideoObjectDetectionTrack,
+)
+from labelformat.model.video import Video
+from labelformat.types import JsonDict
+
+
+class YouTubeVISObjectDetectionTrackInput(ObjectDetectionTrackInput):
+    """Read object detection tracks from a YouTube-VIS JSON file."""
+
+    @staticmethod
+    def add_cli_arguments(parser: ArgumentParser) -> None:
+        """Register the required ``--input-file`` argument on ``parser``."""
+        parser.add_argument(
+            "--input-file",
+            type=Path,
+            required=True,
+            help="Path to input YouTube-VIS JSON file",
+        )
+
+    def __init__(self, input_file: Path) -> None:
+        """Parse ``input_file`` as JSON and keep the result in memory."""
+        with input_file.open() as file:
+            self._data = json.load(file)
+
+    def get_categories(self) -> Iterable[Category]:
+        """Yield one ``Category`` per entry of the ``categories`` list."""
+        for category in self._data["categories"]:
+            yield Category(
+                id=category["id"],
+                name=category["name"],
+            )
+
+    def get_videos(self) -> Iterable[Video]:
+        """Yield one ``Video`` per entry of the ``videos`` list."""
+        for video in self._data["videos"]:
+            yield Video(
+                id=video["id"],
+                # TODO (Jonas, 1/2026): The file_names do not hold the video file extension. Solution required.
+                filename=Path(video["file_names"][0]).parent.name,
+                width=int(video["width"]),
+                height=int(video["height"]),
+                number_of_frames=int(video["length"]),
+            )
+
+    def get_labels(self) -> Iterable[VideoObjectDetectionTrack]:
+        """Yield the tracks of each video; every annotation is one track."""
+        video_id_to_video = {video.id: video for video in self.get_videos()}
+        category_id_to_category = {
+            category.id: category for category in self.get_categories()
+        }
+        # Seed every video id so videos without annotations are yielded too.
+        video_id_to_tracks: Dict[int, List[JsonDict]] = {
+            video_id: [] for video_id in video_id_to_video
+        }
+        for ann in self._data["annotations"]:
+            video_id_to_tracks[ann["video_id"]].append(ann)
+
+        for video_id, tracks in video_id_to_tracks.items():
+            video = video_id_to_video[video_id]
+            objects = []
+            for track in tracks:
+                boxes = _get_object_track_boxes(ann=track)
+                objects.append(
+                    SingleObjectDetectionTrack(
+                        # Use this track's own category, not the one of the
+                        # last annotation left over from the grouping loop.
+                        category=category_id_to_category[track["category_id"]],
+                        boxes=boxes,
+                    )
+                )
+            yield VideoObjectDetectionTrack(
+                video=video,
+                objects=objects,
+            )
+
+
+def _get_object_track_boxes(
+    ann: JsonDict,
+) -> list[BoundingBox | None]:
+    """Convert ``ann["bboxes"]`` to XYWH boxes; None or empty entries give None."""
+    boxes: list[BoundingBox | None] = []
+    for bbox in ann["bboxes"]:
+        if bbox is None or len(bbox) == 0:
+            boxes.append(None)
+        else:
+            boxes.append(
+                BoundingBox.from_format(
+                    bbox=[float(x) for x in bbox],
+                    format=BoundingBoxFormat.XYWH,
+                )
+            )
+    return boxes
diff --git a/tests/unit/formats/test_youtubevis.py b/tests/unit/formats/test_youtubevis.py
new file mode 100644
index 0000000..e176720
--- /dev/null
+++ b/tests/unit/formats/test_youtubevis.py
@@ -0,0 +1,92 @@
+import json
+from pathlib import Path
+
+from labelformat.formats.youtubevis import YouTubeVISObjectDetectionTrackInput
+from labelformat.model.bounding_box import BoundingBox
+from labelformat.model.category import Category
+from labelformat.model.object_detection_track import (
+    SingleObjectDetectionTrack,
+    VideoObjectDetectionTrack,
+)
+from labelformat.model.video import Video
+
+
+class TestYouTubeVISObjectDetectionTrackInput:
+    def test_get_categories(self, tmp_path: Path) -> None:
+        input_file = _write_youtube_vis_json(tmp_path / "instances.json")
+        label_input = YouTubeVISObjectDetectionTrackInput(input_file=input_file)
+
+        assert list(label_input.get_categories()) == [Category(id=1, name="cat")]
+
+    def test_get_videos(self, tmp_path: Path) -> None:
+        input_file = _write_youtube_vis_json(tmp_path / "instances.json")
+        label_input = YouTubeVISObjectDetectionTrackInput(input_file=input_file)
+
+        assert list(label_input.get_videos()) == [
+            Video(
+                id=5,
+                filename="video1",
+                width=640,
+                height=480,
+                number_of_frames=2,
+            )
+        ]
+
+    def test_get_labels(self, tmp_path: Path) -> None:
+        input_file = _write_youtube_vis_json(tmp_path / "instances.json")
+        label_input = YouTubeVISObjectDetectionTrackInput(input_file=input_file)
+
+        assert list(label_input.get_labels()) == [
+            VideoObjectDetectionTrack(
+                video=Video(
+                    id=5,
+                    filename="video1",
+                    width=640,
+                    height=480,
+                    number_of_frames=2,
+                ),
+                objects=[
+                    SingleObjectDetectionTrack(
+                        category=Category(id=1, name="cat"),
+                        boxes=[
+                            BoundingBox(
+                                xmin=10.0,
+                                ymin=20.0,
+                                xmax=40.0,
+                                ymax=60.0,
+                            ),
+                            None,
+                        ],
+                    )
+                ],
+            )
+        ]
+
+
+def _write_youtube_vis_json(input_file: Path) -> Path:
+    data = {
+        "categories": [
+            {"id": 1, "name": "cat"},
+        ],
+        "videos": [
+            {
+                "id": 5,
+                "file_names": ["video1/00000.jpg", "video1/00001.jpg"],
+                "width": 640,
+                "height": 480,
+                "length": 2,
+            }
+        ],
+        "annotations": [
+            {
+                "video_id": 5,
+                "category_id": 1,
+                "bboxes": [
+                    [10.0, 20.0, 30.0, 40.0],
+                    None,
+                ],
+            }
+        ],
+    }
+    input_file.write_text(json.dumps(data))
+    return input_file