From c2ba4c38370c60da407cb2365bc64696e0cc3631 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Sat, 9 Aug 2025 20:35:05 -0400 Subject: [PATCH 1/7] promoted `get()` with default value from `Mmif` to `MmifObject` (prime superclass) --- mmif/serialize/mmif.py | 19 ------------------- mmif/serialize/model.py | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index 2715bae8..efec7348 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -794,22 +794,3 @@ def __getitem__(self, item: str) \ return self.views.__getitem__(item) except KeyError: raise KeyError(f"Object with ID {item} not found in the MMIF object. ") - - def get(self, obj_id, default=None): - """ - High-level getter for Mmif. This will try to find any object, given - an identifier or an immediate attribute name. When nothing is found, - this will return a default value instead of raising an error. - - :param obj_id: an immediate attribute name or an object identifier - (a document ID, a view ID, or an annotation ID). When - annotation ID is given as a "short" ID (without view - ID prefix), the method will try to find a match from - the first view, and return immediately if found. - :param default: the default value to return if none is found - :return: the object searched for or the default value - """ - try: - return self.__getitem__(obj_id) - except KeyError: - return default diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 70ff9e25..9c232a81 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -314,6 +314,25 @@ def __getitem__(self, key) -> Union['MmifObject', str, datetime]: else: return value + def get(self, obj_id, default=None): + """ + High-level getter for Mmif. This will try to find any object, given + an identifier or an immediate attribute name. When nothing is found, + this will return a default value instead of raising an error. + + :param obj_id: an immediate attribute name or an object identifier + (a document ID, a view ID, or an annotation ID). When + annotation ID is given as a "short" ID (without view + ID prefix), the method will try to find a match from + the first view, and return immediately if found. + :param default: the default value to return if none is found + :return: the object searched for or the default value + """ + try: + return self.__getitem__(obj_id) + except KeyError: + return default + class MmifObjectEncoder(json.JSONEncoder): """ From 6f95b04cec7273cf493d3985237b468a75b9ff24 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Sun, 17 Aug 2025 20:33:57 -0400 Subject: [PATCH 2/7] deprecated `get()` from list-like objs, separate key-index, int-index ... * added `get_by_key()` for string queries * `__getitem__()` now only works with integer/slice indexing --- mmif/serialize/mmif.py | 37 +++++++++++++++++++-------------- mmif/serialize/model.py | 35 ++++++++++++++++++++++--------- mmif/serialize/view.py | 4 ++-- tests/test_serialize.py | 46 ++++++++++++++++++----------------------- 4 files changed, 69 insertions(+), 53 deletions(-) diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index efec7348..a4c4512a 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -266,8 +266,8 @@ def _when_failed(): ## caching alignments if all(map(lambda x: x in alignment_ann.properties, ('source', 'target'))): try: - source_ann = self[alignment_ann.get('source')] - target_ann = self[alignment_ann.get('target')] + source_ann = self.__getitem__(alignment_ann.get('source')) + target_ann = self.__getitem__(alignment_ann.get('target')) if isinstance(source_ann, Annotation) and isinstance(target_ann, Annotation): source_ann._cache_alignment(alignment_ann, target_ann) target_ann._cache_alignment(alignment_ann, source_ann) @@ -586,7 +586,7 @@ def get_alignments(self, at_type1: Union[str, ThingTypesBase], at_type2: Union[s aligned_types = set() for ann_id in [alignment['target'], alignment['source']]: ann_id = cast(str, ann_id) - aligned_type = cast(Annotation, self[ann_id]).at_type + aligned_type = cast(Annotation, self.__getitem__(ann_id)).at_type aligned_types.add(aligned_type) aligned_types = list(aligned_types) # because membership check for sets also checks hash() values if len(aligned_types) == 2 and at_type1 in aligned_types and at_type2 in aligned_types: @@ -745,10 +745,10 @@ def _get_linear_anchor_point(self, ann: Annotation, targets_sorted=False, start: point = math.inf if start else -1 comp = min if start else max for target_id in ann.get_property('targets'): - point = comp(point, self._get_linear_anchor_point(self[target_id], start=start)) + point = comp(point, self._get_linear_anchor_point(self.__getitem__(target_id), start=start)) return point target_id = ann.get_property('targets')[0 if start else -1] - return self._get_linear_anchor_point(self[target_id], start=start) + return self._get_linear_anchor_point(self.__getitem__(target_id), start=start) elif (start and 'start' in props) or (not start and 'end' in props): return ann.get_property('start' if start else 'end') else: @@ -766,8 +766,7 @@ def get_end(self, annotation: Annotation) -> Union[int, float]: """ return self._get_linear_anchor_point(annotation, start=False) - def __getitem__(self, item: str) \ - -> Union[Document, View, Annotation, MmifMetadata, DocumentsList, ViewsList]: + def __getitem__(self, item: str) -> Union[Document, View, Annotation, MmifMetadata]: """ index ([]) implementation for Mmif. This will try to find any object, given an identifier or an immediate attribute name. When nothing is found, this will raise an error rather than returning a None @@ -780,17 +779,25 @@ def __getitem__(self, item: str) \ :raise KeyError: if the item is not found or multiple objects are found with the same ID """ if item in self._named_attributes(): - return self.__dict__[item] + return self.__dict__.__getitem__(item) if self.id_delimiter in item: vid, _ = item.split(self.id_delimiter, 1) - return self.views[vid].annotations[item] + view = self.views.get_by_key(vid) + if view is None: + raise KeyError(f"View with ID {vid} not found in the MMIF object.") + ann = view.annotations.get_by_key(item) + if ann is None: + raise KeyError(f"Annotation with ID {item} not found in the MMIF object.") + return ann else: # search for document first, then views # raise KeyError if nothing is found - try: - return self.documents.__getitem__(item) - except KeyError: - try: - return self.views.__getitem__(item) - except KeyError: + ret = self.documents.get_by_key(item) + if ret is None: + ret = self.views.get_by_key(item) + if ret is None: raise KeyError(f"Object with ID {item} not found in the MMIF object. ") + else: + return ret + else: + return ret diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 9c232a81..671ff228 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -13,8 +13,9 @@ """ import json +import warnings from datetime import datetime -from typing import Union, Any, Dict, Optional, TypeVar, Generic, Generator, Iterator, Type, Set, ClassVar +from typing import Union, Any, Dict, Optional, TypeVar, Generic, Generator, Iterator, Type, Set, ClassVar, List from deepdiff import DeepDiff @@ -302,7 +303,7 @@ def __contains__(self, key: str) -> bool: except (TypeError, AttributeError, KeyError): return False - def __getitem__(self, key) -> Union['MmifObject', str, datetime]: + def __getitem__(self, key) -> Any: if key in self._named_attributes(): value = self.__dict__[key] elif self._unnamed_attributes is None: @@ -394,6 +395,11 @@ def _deserialize(self, input_list: list) -> None: def get(self, key: str, default=None) -> Optional[T]: """ + .. deprecated:: 1.1.3 + Will be removed in 2.0.0. + Use `__getitem__` for integer access or `get_by_key` for string-based + ID or attribute access. + Standard dictionary-style get() method, albeit with no ``default`` parameter. Relies on the implementation of __getitem__. @@ -403,10 +409,17 @@ def get(self, key: str, default=None) -> Optional[T]: :param default: the default value to return if the key is not found :return: the value matching that key """ - try: - return self[key] - except KeyError: - return default + warnings.warn( + "The 'get' method is deprecated and will be removed in a future version. " + "Use `__getitem__` for integer access or `get_by_key` for string-based " \ + "ID or attribute access.", + DeprecationWarning, + stacklevel=2 + ) + return self.get_by_key(key, default) + + def get_by_key(self, key: str, default=None) -> Optional[T]: + return self._items.get(key, default) def _append_with_key(self, key: str, value: T, overwrite=False) -> None: """ @@ -431,11 +444,13 @@ def _append_with_key(self, key: str, value: T, overwrite=False) -> None: def append(self, value, overwrite): raise NotImplementedError() - def __getitem__(self, key: str) -> T: - if key not in self.reserved_names: - return self._items.__getitem__(key) + def __getitem__(self, key: Union[int, slice]) -> Union[T, List[T]]: + if isinstance(key, (int, slice)): + # Python's dicts preserve insertion order since 3.7. + # We can convert values to a list and index it. + return list(self._items.values())[key] else: - raise KeyError("Don't use __getitem__ to access a reserved name") + raise TypeError(f"list indices must be integers or slices, not {type(key).__name__}") def __setitem__(self, key: str, value: T): if key not in self.reserved_names: diff --git a/mmif/serialize/view.py b/mmif/serialize/view.py index d8821c5b..82a624dc 100644 --- a/mmif/serialize/view.py +++ b/mmif/serialize/view.py @@ -287,7 +287,7 @@ def get_document_by_id(self, doc_id) -> Document: "View.get_document_by_id() is deprecated, use view[doc_id] instead.", DeprecationWarning ) - doc_found = self.annotations[doc_id] + doc_found = self.annotations.get_by_key(doc_id) if not isinstance(doc_found, Document): raise KeyError(f"Document \"{doc_id}\" not found in view {self.id}.") return cast(Document, doc_found) @@ -302,7 +302,7 @@ def __getitem__(self, key: str) -> 'Annotation': """ if key in self._named_attributes(): return self.__dict__[key] - anno_result = self.annotations.get(key) + anno_result = self.annotations.get_by_key(key) if not anno_result: raise KeyError("Annotation ID not found: %s" % key) return anno_result diff --git a/tests/test_serialize.py b/tests/test_serialize.py index bec77d28..86159185 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -802,14 +802,14 @@ def test_new_textdocument(self): self.assertTrue(td1.properties.text_value == td1.text_value) self.assertNotEqual(td1.text_language, td2.text_language) self.assertEqual(english_text, td1.text_value) - self.assertEqual(td1, self.view_obj.annotations.get(td1.id)) + self.assertEqual(td1, self.view_obj.annotations.get_by_key(td1.id)) td3 = self.view_obj.new_textdocument(english_text, mime='plain/text') self.assertEqual(td1.text_value, td3.text_value) self.assertEqual(len(td1.properties), len(td3.properties) - 1) def test_parent(self): mmif_obj = Mmif(self.mmif_examples_json['everything']) - self.assertTrue(all(anno.parent == v.id for v in mmif_obj.views for anno in mmif_obj.get_view_by_id(v.id).annotations)) + self.assertTrue(all(anno.parent == v.id for v in mmif_obj.views for anno in mmif_obj[v.id].annotations)) def test_non_existing_parent(self): anno_obj = Annotation(FRACTIONAL_EXAMPLES['doc_only']) @@ -820,20 +820,20 @@ def test_non_existing_parent(self): def test_get_by_id(self): mmif_obj = Mmif(MMIF_EXAMPLES['everything']) - mmif_obj['m1'] - mmif_obj['v4:td1'] + mmif_obj.__getitem__('m1') + mmif_obj.__getitem__('v4:td1') with self.assertRaises(KeyError): - mmif_obj['m55'] + mmif_obj.__getitem__('m55') with self.assertRaises(KeyError): - mmif_obj['v1:td1'] - view_obj = mmif_obj['v4'] - td1 = view_obj['v4:td1'] + mmif_obj.__getitem__('v1:td1') + view_obj = mmif_obj.__getitem__('v4') + td1 = view_obj.__getitem__('v4:td1') self.assertEqual(td1.properties.mime, 'text/plain') - a1 = view_obj['v4:a1'] + a1 = view_obj.__getitem__('v4:a1') self.assertEqual(a1.at_type, AnnotationTypes.Alignment) with self.assertRaises(KeyError): - view_obj['completely-unlikely-annotation-id'] - + view_obj.__getitem__('completely-unlikely-annotation-id') + def test_get_annotations(self): mmif_obj = Mmif(MMIF_EXAMPLES['everything']) # simple search by at_type @@ -913,7 +913,8 @@ def test_error_to_text(self): self.assertTrue(aview.has_error()) self.assertTrue(isinstance(mmif_obj.get_last_error(), str)) err_str = 'custom error as a single long string' - aview.metadata.error = err_str + aview.metadata.error = ErrorDict({'message': err_str}) + print(aview.metadata.error) self.assertTrue(aview.has_error()) self.assertTrue(isinstance(mmif_obj.get_last_error(), str)) self.assertIn(err_str, mmif_obj.get_last_error()) @@ -1039,7 +1040,7 @@ def test_add_property(self): removed_prop_key, removed_prop_value = list(props.items())[-1] props.pop(removed_prop_key) new_mmif = Mmif(datum['json']) - new_mmif.get_view_by_id(view_id).annotations[first_ann_id].add_property(removed_prop_key, removed_prop_value) + new_mmif.get(view_id).annotations[first_ann_id].add_property(removed_prop_key, removed_prop_value) self.assertEqual(json.loads(datum['string'])['views'][j], json.loads(new_mmif.serialize())['views'][j], f'Failed on {i}, {view_id}') @@ -1236,9 +1237,9 @@ def test_document_adding_duplicate_properties(self): doc1.add_property('publisher', 'they') self.assertEqual(2, len(doc1._props_pending)) mmif_roundtrip3 = Mmif(mmif_roundtrip2.serialize()) - r0_v_anns = list(mmif_roundtrip3.views[r0_vid].get_annotations(AnnotationTypes.Annotation)) - r1_v_anns = list(mmif_roundtrip3.views[r1_vid].get_annotations(AnnotationTypes.Annotation)) - r2_v_anns = list(mmif_roundtrip3.views[r2_vid].get_annotations(AnnotationTypes.Annotation)) + r0_v_anns = list(mmif_roundtrip3.views.get_by_key(r0_vid).get_annotations(AnnotationTypes.Annotation)) + r1_v_anns = list(mmif_roundtrip3.views.get_by_key(r1_vid).get_annotations(AnnotationTypes.Annotation)) + r2_v_anns = list(mmif_roundtrip3.views.get_by_key(r2_vid).get_annotations(AnnotationTypes.Annotation)) # two props (`author` and `publisher`) are serialized to one `Annotation` objects self.assertEqual(1, len(r0_v_anns)) self.assertEqual(0, len(r1_v_anns)) @@ -1324,7 +1325,7 @@ def test_capital_annotation_generation_viewfinder(self): mmif[f'doc{i+1}'].add_property('author', authors[i]) mmif_roundtrip = Mmif(mmif.serialize()) for i in range(1, 3): - cap_anns = list(mmif_roundtrip.views[f'v{i}'].get_annotations(AnnotationTypes.Annotation)) + cap_anns = list(mmif_roundtrip.views.get_by_key(f'v{i}').get_annotations(AnnotationTypes.Annotation)) self.assertEqual(1, len(cap_anns)) self.assertEqual(authors[i-1], cap_anns[0].get_property('author')) @@ -1391,7 +1392,7 @@ def test_add_property(self): properties.pop(removed_prop_key) try: new_mmif = Mmif(datum['json']) - new_mmif.get_document_by_id(document_id).add_property(removed_prop_key, removed_prop_value) + new_mmif.get(document_id).add_property(removed_prop_key, removed_prop_value) self.assertEqual(json.loads(datum['string']), json.loads(new_mmif.serialize()), f'Failed on {i}, {document_id}') except ValidationError: continue @@ -1406,13 +1407,6 @@ def test_setitem(self): self.datalist['v1'] = View({'id': 'v1'}) self.datalist['v2'] = View({'id': 'v2'}) - def test_getitem(self): - self.assertIs(self.mmif_obj['v1'], self.datalist['v1']) - - def test_getitem_raises(self): - with self.assertRaises(KeyError): - _ = self.datalist['reserved_names'] - def test_append(self): self.assertTrue('v256' not in self.datalist._items) self.datalist.append(View({'id': 'v256'})) @@ -1462,7 +1456,7 @@ def test_setitem_fail_on_reserved_name(self): self.assertEqual("can't set item on a reserved name", ke.args[0]) def test_get(self): - self.assertEqual(self.datalist['v1'], self.datalist.get('v1')) + self.assertEqual(self.datalist.get_by_key('v1'), self.datalist.get('v1')) def test_update(self): other_contains = """{ From c5ae73a277434dd55821fc6f035764e9d7135257 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Wed, 29 Oct 2025 20:38:33 -0400 Subject: [PATCH 3/7] big updates on docstring based on current `get_by_id` impl --- mmif/serialize/annotation.py | 90 ++++++++++++++----- mmif/serialize/mmif.py | 99 ++++++++++++++++++-- mmif/serialize/model.py | 170 ++++++++++++++++++++++++++++++----- mmif/serialize/view.py | 85 ++++++++++++++++-- 4 files changed, 388 insertions(+), 56 deletions(-) diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index 0b2d3ad6..fe914bb2 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -256,18 +256,39 @@ def __getitem__(self, prop_name: str): def get(self, prop_name: str, default=None): """ - A getter for Annotation, will search for a property by its name, - and return the value if found, or the default value if not found. - This is designed to allow for directly accessing properties without - having to go through the properties object, or view-level - annotation metadata (common properties) encoded in the - ``view.metadata.contains`` dict. Note that the regular properties - will take the priority over the view-level common properties when - there are name conflicts. - - :param prop_name: the name of the property to get - :param default: the value to return if the property is not found - :return: the value of the property + Safe property access with optional default value. + + Searches for an annotation property by name and returns its value, + or a default value if not found. This method searches in multiple + locations with the following priority: + + 1. Direct properties (in ``annotation.properties``) + 2. Ephemeral properties (view-level metadata from ``contains``) + 3. Special fields (``@type``, ``properties``) + + This allows convenient access to properties without explicitly + checking the ``properties`` object or view-level metadata. + + :param prop_name: The name of the property to retrieve + :param default: The value to return if the property is not found (default: None) + :return: The property value, or the default value if not found + + Examples + -------- + >>> # Access annotation properties: + >>> label = annotation.get('label', default='unknown') + >>> start_time = annotation.get('start', default=0) + >>> + >>> # Access @type: + >>> at_type = annotation.get('@type') + >>> + >>> # Safe access with custom default: + >>> targets = annotation.get('targets', default=[]) + + See Also + -------- + __getitem__ : Direct property access that raises KeyError when not found + get_property : Alias for this method """ try: return self.__getitem__(prop_name) @@ -378,13 +399,42 @@ def add_property(self, name: str, def get(self, prop_name, default=None): """ - A special getter for Document properties. The major difference from - the super class's :py:meth:`Annotation.get` method is that Document - class has one more set of *"pending"* properties, that are added after - the Document object is created and will be serialized as a separate - :py:class:`Annotation` object of which ``@type = Annotation``. The - pending properties will take the priority over the regular properties - when there are conflicts. + Safe property access with optional default value for Document objects. + + Searches for a document property by name and returns its value, or a + default value if not found. Documents have a more complex property + hierarchy than regular annotations: + + Priority order (highest to lowest): + 1. Special fields ('id', 'location') + 2. Pending properties (added after creation, to be serialized as ``Annotation`` objects) + 3. Ephemeral properties (from existing ``Annotation`` annotations or view metadata) + 4. Original properties (in ``document.properties``) + + This allows convenient access to all document properties regardless of + where they're stored internally. + + :param prop_name: The name of the property to retrieve + :param default: The value to return if the property is not found (default: None) + :return: The property value, or the default value if not found + + Examples + -------- + >>> # Access document properties: + >>> mime = document.get('mime', default='application/octet-stream') + >>> location = document.get('location') + >>> + >>> # Access properties added after creation (pending): + >>> author = document.get('author', default='anonymous') + >>> publisher = document.get('publisher') + >>> + >>> # Access ephemeral properties from Annotation objects: + >>> sentiment = document.get('sentiment', default='neutral') + + See Also + -------- + add_property : Add a new property to the document + Mmif.generate_capital_annotations : How pending properties are serialized """ if prop_name == 'id': # because all three dicts have `id` key as required field, we need @@ -399,7 +449,7 @@ class has one more set of *"pending"* properties, that are added after elif prop_name in self._props_ephemeral: return self._props_ephemeral[prop_name] else: - return super().get(prop_name) + return super().get(prop_name, default) get_property = get diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index a4c4512a..20f25f11 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -2,7 +2,16 @@ The :mod:`mmif` module contains the classes used to represent a full MMIF file as a live Python object. +The :class:`Mmif` class is a high-level container that provides convenient +string-based access to documents, views, and annotations via ``mmif[id]``. +The underlying ``documents`` and ``views`` attributes are list-like collections +that use integer indexing; use ``get_by_key()`` on those for ID-based access. + See the specification docs and the JSON Schema file for more information. + +.. versionchanged:: 1.1.3 + DocumentsList and ViewsList now only accept integer/slice indexing. + Use ``get_by_key()`` for string ID access on those collections. """ import json @@ -139,8 +148,39 @@ class Mmif(MmifObject): """ MmifObject that represents a full MMIF file. + This is a high-level container object that provides convenient string-based + access to documents, views, and annotations using their IDs. The underlying + collections (``documents`` and ``views``) are list-like and use integer + indexing, but Mmif itself accepts string IDs for convenient access. + :param mmif_obj: the JSON data :param validate: whether to validate the data against the MMIF JSON schema. + + Examples + -------- + Accessing objects by ID (high-level, convenient): + + >>> mmif = Mmif(mmif_json) + >>> doc = mmif['m1'] # Get document by ID + >>> view = mmif['v1'] # Get view by ID + >>> ann = mmif['v1:a1'] # Get annotation by long-form ID + >>> + >>> # Safe access with default: + >>> doc = mmif.get('m99', default=None) + + Accessing via underlying lists (positional access): + + >>> first_doc = mmif.documents[0] # First document + >>> last_view = mmif.views[-1] # Last view + >>> all_views = mmif.views[1:4] # Slice of views + >>> + >>> # Access by ID on lists: + >>> doc = mmif.documents.get_by_key('m1') + >>> view = mmif.views.get_by_key('v1') + + .. versionchanged:: 1.1.3 + Underlying ``documents`` and ``views`` lists now only accept integer + indexing. Use ``get_by_key()`` for string ID access on those lists. """ def __init__(self, mmif_obj: Optional[Union[bytes, str, dict]] = None, *, validate: bool = True) -> None: @@ -768,15 +808,56 @@ def get_end(self, annotation: Annotation) -> Union[int, float]: def __getitem__(self, item: str) -> Union[Document, View, Annotation, MmifMetadata]: """ - index ([]) implementation for Mmif. This will try to find any object, given an identifier or an immediate - attribute name. When nothing is found, this will raise an error rather than returning a None - - :raises KeyError: if the item is not found or if the search results are ambiguous - :param item: an attribute name or an object identifier (a document ID, a view ID, or an annotation ID). When - annotation ID is given as a "short" ID (without view ID prefix), the method will try to find a - match from the first view, and return immediately if found. - :return: the object searched for - :raise KeyError: if the item is not found or multiple objects are found with the same ID + High-level string-based access to MMIF objects by their IDs. + + This method provides convenient access to documents, views, and annotations + using their string identifiers. For long-form annotation IDs (format: ``V:A``), + performs a two-level search through the specified view. + + Note: This is a high-level convenience method on the Mmif container itself. + The underlying ``documents`` and ``views`` collections are list-like and + only support integer indexing - use ``get_by_key()`` on those for ID-based access. + + :param item: An object identifier: + - Document ID (e.g., 'm1', 'd1') + - View ID (e.g., 'v1', 'v_0') + - Annotation ID in long form (e.g., 'v1:a1', 'v1:tf1') + - Attribute name (e.g., 'metadata', 'documents', 'views') + :return: The requested Document, View, Annotation, or attribute object + :raises KeyError: If the item is not found + + Examples + -------- + High-level access by ID: + + >>> mmif = Mmif(mmif_json) + >>> + >>> # Access documents: + >>> doc = mmif['m1'] # Returns Document with ID 'm1' + >>> + >>> # Access views: + >>> view = mmif['v1'] # Returns View with ID 'v1' + >>> + >>> # Access annotations (long-form ID): + >>> ann = mmif['v1:a1'] # Returns Annotation from view v1 + >>> + >>> # Access attributes: + >>> metadata = mmif['metadata'] # Returns MmifMetadata object + >>> + >>> # Will raise KeyError: + >>> doc = mmif['nonexistent'] # KeyError! + + For list-style positional access, use the underlying collections: + + >>> first_doc = mmif.documents[0] # Integer index + >>> second_view = mmif.views[1] # Integer index + >>> doc_by_id = mmif.documents.get_by_key('m1') # ID-based access + + See Also + -------- + get : Safe access with default value instead of raising KeyError + documents.get_by_key : ID-based access on the documents list + views.get_by_key : ID-based access on the views list """ if item in self._named_attributes(): return self.__dict__.__getitem__(item) diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 671ff228..5d6952d3 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -10,6 +10,16 @@ core functionality for deserializing MMIF JSON data into live objects and serializing live objects into MMIF JSON data. Specialized behavior for the different components of MMIF is added in the subclasses. + +This module defines two main collection types: + +- :class:`DataList`: List-like collections that support integer/slice indexing. + Use ``get_by_key()`` for string ID access. +- :class:`DataDict`: Dict-like collections that support string key access. + +.. versionchanged:: 1.1.3 + DataList now only accepts integers/slices in ``__getitem__``. Use + ``get_by_key()`` for string-based ID access. """ import json @@ -317,17 +327,33 @@ def __getitem__(self, key) -> Any: def get(self, obj_id, default=None): """ - High-level getter for Mmif. This will try to find any object, given - an identifier or an immediate attribute name. When nothing is found, - this will return a default value instead of raising an error. + High-level safe getter that returns a default value instead of raising KeyError. + + This method wraps ``__getitem__()`` with exception handling, making it safe + to query for objects that might not exist. Available on all MmifObject subclasses. - :param obj_id: an immediate attribute name or an object identifier - (a document ID, a view ID, or an annotation ID). When - annotation ID is given as a "short" ID (without view - ID prefix), the method will try to find a match from - the first view, and return immediately if found. - :param default: the default value to return if none is found - :return: the object searched for or the default value + :param obj_id: An attribute name or object identifier (document ID, view ID, + annotation ID, or property name depending on the object type). + For Mmif objects: when annotation ID is given as a "short" ID + (without view ID prefix), searches from the first view. + :param default: The value to return if the key is not found (default: None) + :return: The object/value searched for, or the default value if not found + + Examples + -------- + Safe access pattern (works on all MmifObject subclasses): + + >>> # On Mmif objects: + >>> view = mmif.get('v1', default=None) # Returns None if not found + >>> doc = mmif.get('doc1', default=None) + >>> + >>> # On Annotation/Document objects: + >>> label = annotation.get('label', default='unknown') + >>> author = document.get('author', default='anonymous') + + See Also + -------- + __getitem__ : Direct access that raises KeyError when not found """ try: return self.__getitem__(obj_id) @@ -396,29 +422,78 @@ def _deserialize(self, input_list: list) -> None: def get(self, key: str, default=None) -> Optional[T]: """ .. deprecated:: 1.1.3 - Will be removed in 2.0.0. - Use `__getitem__` for integer access or `get_by_key` for string-based - ID or attribute access. + Do not use in new code. Will be removed in 2.0.0. + Use ``get_by_key()`` for string-based ID access or ``[int]`` for + positional access. + + Deprecated method for retrieving list elements by string ID. - Standard dictionary-style get() method, albeit with no ``default`` - parameter. Relies on the implementation of __getitem__. + :param key: The string ID of the element to search for + :param default: The default value to return if the key is not found + :return: The element matching the ID, or the default value - Will return ``None`` if the key is not found. + Examples + -------- + Old pattern (deprecated, do not use): - :param key: the key to search for - :param default: the default value to return if the key is not found - :return: the value matching that key + >>> view = mmif.views.get('v1') # DeprecationWarning! + + New patterns to use instead: + + >>> # For ID-based access: + >>> view = mmif.views.get_by_key('v1') + >>> # For positional access: + >>> view = mmif.views[0] + >>> # For high-level safe access: + >>> view = mmif.get('v1', default=None) + + See Also + -------- + get_by_key : Replacement method for string ID access + __getitem__ : List-style positional access with integers """ warnings.warn( - "The 'get' method is deprecated and will be removed in a future version. " - "Use `__getitem__` for integer access or `get_by_key` for string-based " \ - "ID or attribute access.", + "The 'get' method is deprecated and will be removed in 2.0.0. " + "Use `get_by_key()` for string-based ID access or `[int]` for " + "positional access.", DeprecationWarning, stacklevel=2 ) return self.get_by_key(key, default) def get_by_key(self, key: str, default=None) -> Optional[T]: + """ + .. versionadded:: 1.1.3 + + Retrieve an element from the list by its string ID. + + This is the recommended way to access list elements by their identifier. + Unlike ``__getitem__`` which only accepts integers for list-style access, + this method accepts string IDs and returns a default value if not found. + + :param key: The string ID of the element to retrieve + :param default: The value to return if the ID is not found (default: None) + :return: The element with the matching ID, or the default value + + Examples + -------- + Accessing elements by ID: + + >>> # Get a view by ID: + >>> view = mmif.views.get_by_key('v1') + >>> if view is None: + ... print("View not found") + >>> + >>> # Get with a custom default: + >>> doc = mmif.documents.get_by_key('doc99', default=default_doc) + >>> + >>> # Get annotation from a view: + >>> ann = view.annotations.get_by_key('v1:a1') + + See Also + -------- + __getitem__ : List-style positional access with integers/slices + """ return self._items.get(key, default) def _append_with_key(self, key: str, value: T, overwrite=False) -> None: @@ -445,6 +520,37 @@ def append(self, value, overwrite): raise NotImplementedError() def __getitem__(self, key: Union[int, slice]) -> Union[T, List[T]]: + """ + List-style positional access using integers or slices. + + This method provides pythonic list behavior - it only accepts integers + for positional access or slices for range access. For string-based ID + access, use ``get_by_key()`` instead. + + :param key: An integer index or slice object + :return: The element at the index, or a list of elements for slices + :raises TypeError: If key is not an integer or slice (e.g., if a string is passed) + + Examples + -------- + Positional access (pythonic list behavior): + + >>> # Get first view: + >>> first_view = mmif.views[0] + >>> + >>> # Get last document: + >>> last_doc = mmif.documents[-1] + >>> + >>> # Slice to get multiple elements: + >>> first_three_views = mmif.views[0:3] + >>> + >>> # This will raise TypeError: + >>> view = mmif.views['v1'] # TypeError! Use get_by_key('v1') instead + + See Also + -------- + get_by_key : String-based ID access with optional default value + """ if isinstance(key, (int, slice)): # Python's dicts preserve insertion order since 3.7. # We can convert values to a list and index it. @@ -487,6 +593,26 @@ def _serialize(self, *args, **kwargs) -> dict: return super()._serialize(self._items) def get(self, key: T, default=None) -> Optional[S]: + """ + Dictionary-style safe access with optional default value. + + This method provides pythonic dict behavior - returns the value for + the given key, or a default value if the key is not found. + + :param key: The key to look up + :param default: The value to return if key is not found (default: None) + :return: The value associated with the key, or the default value + + Examples + -------- + >>> # Access contains metadata: + >>> timeframe_meta = view.metadata.contains.get(AnnotationTypes.TimeFrame) + >>> if timeframe_meta is None: + ... print("No TimeFrame annotations in this view") + >>> + >>> # With custom default: + >>> value = some_dict.get('key', default={}) + """ return self._items.get(key, default) def _append_with_key(self, key: T, value: S, overwrite=False) -> None: diff --git a/mmif/serialize/view.py b/mmif/serialize/view.py index 82a624dc..557804a0 100644 --- a/mmif/serialize/view.py +++ b/mmif/serialize/view.py @@ -4,6 +4,15 @@ In MMIF, views are created by apps in a pipeline that are annotating data that was previously present in the MMIF file. + +The :class:`View` class is a high-level container that provides convenient +string-based access to annotations via ``view[id]``. The underlying +``annotations`` attribute is a list-like collection that uses integer indexing; +use ``get_by_key()`` on it for ID-based access. + +.. versionchanged:: 1.1.3 + AnnotationsList now only accepts integer/slice indexing. + Use ``get_by_key()`` for string ID access on that collection. """ import json import warnings @@ -26,9 +35,38 @@ class View(MmifObject): a list of annotations, and potentially a JSON-LD ``@context`` IRI. + This is a high-level container object that provides convenient string-based + access to annotations using their IDs. The underlying ``annotations`` collection + is list-like and uses integer indexing, but View itself accepts string IDs for + convenient access. + If ``view_obj`` is not provided, an empty View will be generated. :param view_obj: the JSON data that defines the view + + Examples + -------- + Accessing annotations by ID (high-level, convenient): + + >>> view = mmif['v1'] + >>> ann = view['v1:a1'] # Get annotation by ID + >>> doc = view['v1:td1'] # Get document by ID + >>> + >>> # Safe access with default: + >>> ann = view.get('v1:a999', default=None) + + Accessing via underlying list (positional access): + + >>> first_ann = view.annotations[0] # First annotation + >>> last_ann = view.annotations[-1] # Last annotation + >>> some_anns = view.annotations[1:5] # Slice of annotations + >>> + >>> # Access by ID on list: + >>> ann = view.annotations.get_by_key('v1:a1') + + .. versionchanged:: 1.1.3 + Underlying ``annotations`` list now only accepts integer indexing. + Use ``get_by_key()`` for string ID access on that list. """ def __init__(self, view_obj: Optional[Union[bytes, str, dict]] = None, parent_mmif=None, *_) -> None: @@ -294,11 +332,48 @@ def get_document_by_id(self, doc_id) -> Document: def __getitem__(self, key: str) -> 'Annotation': """ - index ([]) implementation for View. - - :raises KeyError: if the key is not found - :param key: the search string. - :return: the :class:`mmif.serialize.annotation.Annotation` object searched for + High-level string-based access to annotations by their IDs. + + This method provides convenient access to annotations and documents + within this view using their string identifiers. + + Note: This is a high-level convenience method on the View container itself. + The underlying ``annotations`` collection is list-like and only supports + integer indexing - use ``get_by_key()`` on it for ID-based access. + + :param key: The annotation or document ID (e.g., 'v1:a1', 'v1:td1'), + or an attribute name (e.g., 'metadata', 'annotations') + :return: The requested Annotation, Document, or attribute object + :raises KeyError: If the key is not found + + Examples + -------- + High-level access by ID: + + >>> view = mmif['v1'] + >>> + >>> # Access annotations: + >>> ann = view['v1:a1'] # Returns Annotation with ID 'v1:a1' + >>> + >>> # Access text documents in view: + >>> doc = view['v1:td1'] # Returns Document with ID 'v1:td1' + >>> + >>> # Access attributes: + >>> metadata = view['metadata'] # Returns ViewMetadata object + >>> + >>> # Will raise KeyError: + >>> ann = view['nonexistent'] # KeyError! + + For list-style positional access, use the underlying collection: + + >>> first_ann = view.annotations[0] # Integer index + >>> ann_by_id = view.annotations.get_by_key('v1:a1') # ID-based access + + See Also + -------- + get : Safe access with default value instead of raising KeyError + annotations.get_by_key : ID-based access on the annotations list + get_annotations : Search for annotations by type or properties """ if key in self._named_attributes(): return self.__dict__[key] From 15dd3df77ccfff1b405a7a13dc59c8003512bc17 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Tue, 18 Nov 2025 15:50:08 -0500 Subject: [PATCH 4/7] get rid of intermediate `get_by_key` implementation and deprecating list-like classes' `get` once and for all --- mmif/serialize/annotation.py | 1 + mmif/serialize/mmif.py | 27 ++---- mmif/serialize/model.py | 79 +++++------------ mmif/serialize/view.py | 24 ++---- tests/mmif_examples.py | 42 ++++++++- tests/test_serialize.py | 162 +++++++++++++++++++++++++++++++++-- 6 files changed, 229 insertions(+), 106 deletions(-) diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index f3b23fc8..c7c00f3d 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -232,6 +232,7 @@ def add_property(self, name: str, :param name: the name of the property :param value: the property's desired value + :return: None """ # if self.check_prop_value_is_simple_enough(value): self.properties[name] = value diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index dfd905fe..e8957682 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -5,13 +5,9 @@ The :class:`Mmif` class is a high-level container that provides convenient string-based access to documents, views, and annotations via ``mmif[id]``. The underlying ``documents`` and ``views`` attributes are list-like collections -that use integer indexing; use ``get_by_key()`` on those for ID-based access. +that use integer indexing; use container-level access for ID-based lookups. See the specification docs and the JSON Schema file for more information. - -.. versionchanged:: 1.1.3 - DocumentsList and ViewsList now only accept integer/slice indexing. - Use ``get_by_key()`` for string ID access on those collections. """ import json @@ -173,14 +169,6 @@ class Mmif(MmifObject): >>> first_doc = mmif.documents[0] # First document >>> last_view = mmif.views[-1] # Last view >>> all_views = mmif.views[1:4] # Slice of views - >>> - >>> # Access by ID on lists: - >>> doc = mmif.documents.get_by_key('m1') - >>> view = mmif.views.get_by_key('v1') - - .. versionchanged:: 1.1.3 - Underlying ``documents`` and ``views`` lists now only accept integer - indexing. Use ``get_by_key()`` for string ID access on those lists. """ def __init__(self, mmif_obj: Optional[Union[bytes, str, dict]] = None, *, validate: bool = True) -> None: @@ -821,7 +809,7 @@ def __getitem__(self, item: str) -> Union[Document, View, Annotation, MmifMetada Note: This is a high-level convenience method on the Mmif container itself. The underlying ``documents`` and ``views`` collections are list-like and - only support integer indexing - use ``get_by_key()`` on those for ID-based access. + only support integer indexing. :param item: An object identifier: - Document ID (e.g., 'm1', 'd1') @@ -856,31 +844,28 @@ def __getitem__(self, item: str) -> Union[Document, View, Annotation, MmifMetada >>> first_doc = mmif.documents[0] # Integer index >>> second_view = mmif.views[1] # Integer index - >>> doc_by_id = mmif.documents.get_by_key('m1') # ID-based access See Also -------- get : Safe access with default value instead of raising KeyError - documents.get_by_key : ID-based access on the documents list - views.get_by_key : ID-based access on the views list """ if item in self._named_attributes(): return self.__dict__.__getitem__(item) if self.id_delimiter in item: vid, _ = item.split(self.id_delimiter, 1) - view = self.views.get_by_key(vid) + view = self.views._items.get(vid) if view is None: raise KeyError(f"View with ID {vid} not found in the MMIF object.") - ann = view.annotations.get_by_key(item) + ann = view.annotations._items.get(item) if ann is None: raise KeyError(f"Annotation with ID {item} not found in the MMIF object.") return ann else: # search for document first, then views # raise KeyError if nothing is found - ret = self.documents.get_by_key(item) + ret = self.documents._items.get(item) if ret is None: - ret = self.views.get_by_key(item) + ret = self.views._items.get(item) if ret is None: raise KeyError(f"Object with ID {item} not found in the MMIF object. ") else: diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index bea8db88..3c5e6986 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -13,13 +13,12 @@ This module defines two main collection types: -- :class:`DataList`: List-like collections that support integer/slice indexing. - Use ``get_by_key()`` for string ID access. +- :class:`DataList`: List-like collections that support integer/slice + indexing. For ID-based access, use indexing or ``get`` in the container + level. For example, for DocumentList, use its parent Mmif object's + getter methods to access documents by ID. (e.g., ``mmif['doc1']``). - :class:`DataDict`: Dict-like collections that support string key access. -.. versionchanged:: 1.1.3 - DataList now only accepts integers/slices in ``__getitem__``. Use - ``get_by_key()`` for string-based ID access. """ import json @@ -451,13 +450,13 @@ def get(self, key: str, default=None) -> Optional[T]: """ .. deprecated:: 1.1.3 Do not use in new code. Will be removed in 2.0.0. - Use ``get_by_key()`` for string-based ID access or ``[int]`` for - positional access. + Use container-level access or positional indexing instead. Deprecated method for retrieving list elements by string ID. :param key: the key to search for - :param default: the default value to return if the key is not found (defaults to None) + :param default: the default value to return if the key is not found + (defaults to None) :return: the value matching that key, or the default value if not found Examples @@ -468,60 +467,24 @@ def get(self, key: str, default=None) -> Optional[T]: New patterns to use instead: - >>> # For ID-based access: - >>> view = mmif.views.get_by_key('v1') + >>> # For ID-based access, use container: + >>> view = mmif['v1'] + >>> # Or with safe access: + >>> view = mmif.get('v1', default=None) >>> # For positional access: >>> view = mmif.views[0] - >>> # For high-level safe access: - >>> view = mmif.get('v1', default=None) See Also -------- - get_by_key : Replacement method for string ID access __getitem__ : List-style positional access with integers """ warnings.warn( - "The 'get' method is deprecated and will be removed in 2.0.0. " - "Use `get_by_key()` for string-based ID access or `[int]` for " - "positional access.", + "The 'get' method on list-like collections is deprecated and " + "will be removed in 2.0.0. Use container-level access " + "(e.g., mmif['v1']) or positional indexing (e.g., views[0]).", DeprecationWarning, stacklevel=2 ) - return self.get_by_key(key, default) - - def get_by_key(self, key: str, default=None) -> Optional[T]: - """ - .. versionadded:: 1.1.3 - - Retrieve an element from the list by its string ID. - - This is the recommended way to access list elements by their identifier. - Unlike ``__getitem__`` which only accepts integers for list-style access, - this method accepts string IDs and returns a default value if not found. - - :param key: The string ID of the element to retrieve - :param default: The value to return if the ID is not found (default: None) - :return: The element with the matching ID, or the default value - - Examples - -------- - Accessing elements by ID: - - >>> # Get a view by ID: - >>> view = mmif.views.get_by_key('v1') - >>> if view is None: - ... print("View not found") - >>> - >>> # Get with a custom default: - >>> doc = mmif.documents.get_by_key('doc99', default=default_doc) - >>> - >>> # Get annotation from a view: - >>> ann = view.annotations.get_by_key('v1:a1') - - See Also - -------- - __getitem__ : List-style positional access with integers/slices - """ return self._items.get(key, default) def _append_with_key(self, key: str, value: T, overwrite=False) -> None: @@ -553,11 +516,12 @@ def __getitem__(self, key: Union[int, slice]) -> Union[T, List[T]]: This method provides pythonic list behavior - it only accepts integers for positional access or slices for range access. For string-based ID - access, use ``get_by_key()`` instead. + access, use container-level indexing instead (e.g., ``mmif['v1']``). :param key: An integer index or slice object :return: The element at the index, or a list of elements for slices - :raises TypeError: If key is not an integer or slice (e.g., if a string is passed) + :raises TypeError: If key is not an integer or slice (e.g., if a + string is passed) Examples -------- @@ -573,11 +537,10 @@ def __getitem__(self, key: Union[int, slice]) -> Union[T, List[T]]: >>> first_three_views = mmif.views[0:3] >>> >>> # This will raise TypeError: - >>> view = mmif.views['v1'] # TypeError! Use get_by_key('v1') instead - - See Also - -------- - get_by_key : String-based ID access with optional default value + >>> view = mmif.views['v1'] # TypeError! + >>> + >>> # For ID-based access, use container: + >>> view = mmif['v1'] # Correct way """ if isinstance(key, (int, slice)): # Python's dicts preserve insertion order since 3.7. diff --git a/mmif/serialize/view.py b/mmif/serialize/view.py index d9e853c1..9b419b80 100644 --- a/mmif/serialize/view.py +++ b/mmif/serialize/view.py @@ -8,11 +8,8 @@ The :class:`View` class is a high-level container that provides convenient string-based access to annotations via ``view[id]``. The underlying ``annotations`` attribute is a list-like collection that uses integer indexing; -use ``get_by_key()`` on it for ID-based access. +use container-level access for ID-based lookups. -.. versionchanged:: 1.1.3 - AnnotationsList now only accepts integer/slice indexing. - Use ``get_by_key()`` for string ID access on that collection. """ import json import warnings @@ -60,13 +57,6 @@ class View(MmifObject): >>> first_ann = view.annotations[0] # First annotation >>> last_ann = view.annotations[-1] # Last annotation >>> some_anns = view.annotations[1:5] # Slice of annotations - >>> - >>> # Access by ID on list: - >>> ann = view.annotations.get_by_key('v1:a1') - - .. versionchanged:: 1.1.3 - Underlying ``annotations`` list now only accepts integer indexing. - Use ``get_by_key()`` for string ID access on that list. """ def __init__(self, view_obj: Optional[Union[bytes, str, dict]] = None, parent_mmif=None, *_) -> None: @@ -325,7 +315,7 @@ def get_document_by_id(self, doc_id) -> Document: "View.get_document_by_id() is deprecated, use view[doc_id] instead.", DeprecationWarning ) - doc_found = self.annotations.get_by_key(doc_id) + doc_found = self.annotations._items.get(doc_id) if not isinstance(doc_found, Document): raise KeyError(f"Document \"{doc_id}\" not found in view {self.id}.") return cast(Document, doc_found) @@ -337,9 +327,9 @@ def __getitem__(self, key: str) -> 'Annotation': This method provides convenient access to annotations and documents within this view using their string identifiers. - Note: This is a high-level convenience method on the View container itself. - The underlying ``annotations`` collection is list-like and only supports - integer indexing - use ``get_by_key()`` on it for ID-based access. + Note: This is a high-level convenience method on the View container + itself. The underlying ``annotations`` collection is list-like and + only supports integer indexing. :param key: The annotation or document ID (e.g., 'v1:a1', 'v1:td1'), or an attribute name (e.g., 'metadata', 'annotations') @@ -367,17 +357,15 @@ def __getitem__(self, key: str) -> 'Annotation': For list-style positional access, use the underlying collection: >>> first_ann = view.annotations[0] # Integer index - >>> ann_by_id = view.annotations.get_by_key('v1:a1') # ID-based access See Also -------- get : Safe access with default value instead of raising KeyError - annotations.get_by_key : ID-based access on the annotations list get_annotations : Search for annotations by type or properties """ if key in self._named_attributes(): return self.__dict__[key] - anno_result = self.annotations.get_by_key(key) + anno_result = self.annotations._items.get(key) if not anno_result: raise KeyError("Annotation ID not found: %s" % key) return anno_result diff --git a/tests/mmif_examples.py b/tests/mmif_examples.py index 5635d144..7e860c0a 100644 --- a/tests/mmif_examples.py +++ b/tests/mmif_examples.py @@ -1,4 +1,6 @@ import itertools +import os +import subprocess from string import Template from urllib import request @@ -11,10 +13,44 @@ 'FRACTIONAL_EXAMPLES', ] + +def _load_from_url_or_git(url): + """ + Load content from URL or local git repository. + If LOCALMMIF env var is set, use git show to load from local repo. + LOCALMMIF should be the path to the local mmif repository. + """ + localmmif = os.environ.get('LOCALMMIF') + if localmmif: + # Extract the version/branch and file path from the URL + # URL format: https://raw.githubusercontent.com/clamsproject/mmif/{version}/{filepath} + url_prefix = "https://raw.githubusercontent.com/clamsproject/mmif/" + if url.startswith(url_prefix): + remainder = url[len(url_prefix):] + parts = remainder.split('/', 1) + if len(parts) == 2: + version, filepath = parts + # Use git show to get the file from the specific version + git_ref = f"{version}:{filepath}" + try: + result = subprocess.run( + ['git', 'show', git_ref], + cwd=localmmif, + capture_output=True, + text=True, + check=True + ) + return result.stdout + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to load {git_ref} from local git repo at {localmmif}: {e.stderr}") + + # Fallback to URL loading + return request.urlopen(url).read().decode('utf-8') + everything_file_url = f"https://raw.githubusercontent.com/clamsproject/mmif/{__specver__}/specifications/samples/everything/raw.json" -old_mmif_w_short_id = f"https://raw.githubusercontent.com/clamsproject/mmif/1.0.5/specifications/samples/everything/raw.json" -EVERYTHING_JSON = request.urlopen(everything_file_url).read().decode('utf-8') -OLD_SHORTID_JSON = request.urlopen(old_mmif_w_short_id).read().decode('utf-8') +old_mmif_w_short_id_url = f"https://raw.githubusercontent.com/clamsproject/mmif/1.0.5/specifications/samples/everything/raw.json" +EVERYTHING_JSON = _load_from_url_or_git(everything_file_url) +OLD_SHORTID_JSON = _load_from_url_or_git(old_mmif_w_short_id_url) SWT_1_0_JSON = open('tests/samples/1.0/swt.mmif').read() # for keys and values in chain all typevers in mmif.vocabulary.*_types modules diff --git a/tests/test_serialize.py b/tests/test_serialize.py index f68c2f5b..b0836c5a 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -885,7 +885,7 @@ def test_new_textdocument(self): self.assertTrue(td1.properties.text_value == td1.text_value) self.assertNotEqual(td1.text_language, td2.text_language) self.assertEqual(english_text, td1.text_value) - self.assertEqual(td1, self.view_obj.annotations.get_by_key(td1.id)) + self.assertEqual(td1, self.view_obj[td1.id]) td3 = self.view_obj.new_textdocument(english_text, mime='plain/text') self.assertEqual(td1.text_value, td3.text_value) self.assertEqual(len(td1.properties), len(td3.properties) - 1) @@ -1320,9 +1320,9 @@ def test_document_adding_duplicate_properties(self): doc1.add_property('publisher', 'they') self.assertEqual(2, len(doc1._props_pending)) mmif_roundtrip3 = Mmif(mmif_roundtrip2.serialize()) - r0_v_anns = list(mmif_roundtrip3.views.get_by_key(r0_vid).get_annotations(AnnotationTypes.Annotation)) - r1_v_anns = list(mmif_roundtrip3.views.get_by_key(r1_vid).get_annotations(AnnotationTypes.Annotation)) - r2_v_anns = list(mmif_roundtrip3.views.get_by_key(r2_vid).get_annotations(AnnotationTypes.Annotation)) + r0_v_anns = list(mmif_roundtrip3[r0_vid].get_annotations(AnnotationTypes.Annotation)) + r1_v_anns = list(mmif_roundtrip3[r1_vid].get_annotations(AnnotationTypes.Annotation)) + r2_v_anns = list(mmif_roundtrip3[r2_vid].get_annotations(AnnotationTypes.Annotation)) # two props (`author` and `publisher`) are serialized to one `Annotation` objects self.assertEqual(1, len(r0_v_anns)) self.assertEqual(0, len(r1_v_anns)) @@ -1408,7 +1408,7 @@ def test_capital_annotation_generation_viewfinder(self): mmif[f'doc{i+1}'].add_property('author', authors[i]) mmif_roundtrip = Mmif(mmif.serialize()) for i in range(1, 3): - cap_anns = list(mmif_roundtrip.views.get_by_key(f'v{i}').get_annotations(AnnotationTypes.Annotation)) + cap_anns = list(mmif_roundtrip[f'v{i}'].get_annotations(AnnotationTypes.Annotation)) self.assertEqual(1, len(cap_anns)) self.assertEqual(authors[i-1], cap_anns[0].get_property('author')) @@ -1539,7 +1539,157 @@ def test_setitem_fail_on_reserved_name(self): self.assertEqual("can't set item on a reserved name", ke.args[0]) def test_get(self): - self.assertEqual(self.datalist.get_by_key('v1'), self.datalist.get('v1')) + # Test that get() returns the correct view and returns default for + # non-existent IDs + view = self.datalist.get('v1') + self.assertIsNotNone(view) + self.assertEqual('v1', view.id) + + # Test default value + self.assertIsNone(self.datalist.get('nonexistent')) + self.assertEqual('default', self.datalist.get('nonexistent', 'default')) + + # New tests for pythonic getters (#295) + def test_integer_indexing(self): + """Test that DataList supports integer indexing (list-like behavior).""" + # Positive indexing + first_view = self.datalist[0] + self.assertEqual('v1', first_view.id) + + second_view = self.datalist[1] + self.assertEqual('v2', second_view.id) + + # Negative indexing + last_view = self.datalist[-1] + self.assertEqual('v8', last_view.id) + + second_to_last = self.datalist[-2] + self.assertEqual('v7', second_to_last.id) + + def test_slice_indexing(self): + """Test that DataList supports slice indexing (list-like behavior).""" + # Basic slice + first_three = self.datalist[0:3] + self.assertEqual(3, len(first_three)) + self.assertIsInstance(first_three, list) + self.assertEqual('v1', first_three[0].id) + self.assertEqual('v3', first_three[2].id) + + # Slice with step + every_other = self.datalist[::2] + self.assertEqual(4, len(every_other)) + self.assertEqual('v1', every_other[0].id) + self.assertEqual('v3', every_other[1].id) + self.assertEqual('v5', every_other[2].id) + + # Slice from middle + middle = self.datalist[2:5] + self.assertEqual(3, len(middle)) + self.assertEqual('v3', middle[0].id) + self.assertEqual('v4', middle[1].id) + + # Empty slice + empty = self.datalist[10:20] + self.assertEqual(0, len(empty)) + + def test_string_indexing_raises_typeerror(self): + """Test that DataList raises TypeError for string indexing.""" + # String indexing should raise TypeError + with self.assertRaises(TypeError) as cm: + _ = self.datalist['v1'] + self.assertIn("list indices must be integers or slices", str(cm.exception)) + self.assertIn("not str", str(cm.exception)) + + # Test with documents list too + with self.assertRaises(TypeError) as cm: + _ = self.mmif_obj.documents['m1'] + self.assertIn("list indices must be integers or slices", str(cm.exception)) + + def test_get_deprecated_warning(self): + """Test that get() method raises DeprecationWarning.""" + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = self.datalist.get('v1') + + # Check that warning was raised + self.assertEqual(1, len(w)) + self.assertTrue(issubclass(w[0].category, DeprecationWarning)) + self.assertIn("deprecated", str(w[0].message).lower()) + self.assertIn("2.0.0", str(w[0].message)) + + # But it should still work and return the view + self.assertIsNotNone(result) + self.assertEqual('v1', result.id) + + def test_high_level_mmif_string_access(self): + """Test that high-level Mmif container accepts string IDs.""" + # Mmif should accept string access (container behavior) + view = self.mmif_obj['v1'] + self.assertEqual('v1', view.id) + + doc = self.mmif_obj['m1'] + self.assertEqual('m1', doc.id) + + # Long-form annotation ID + ann = self.mmif_obj['v5:bb1'] + self.assertIsNotNone(ann) + + def test_high_level_view_string_access(self): + """Test that high-level View container accepts string IDs.""" + view = self.mmif_obj['v4'] + + # View should accept string access (container behavior) + ann = view['v4:a1'] + self.assertIsNotNone(ann) + self.assertEqual('v4:a1', ann.id) + + def test_mmif_get_with_default(self): + """Test safe access on Mmif with default values.""" + # Existing object + view = self.mmif_obj.get('v1') + self.assertIsNotNone(view) + self.assertEqual('v1', view.id) + + # Non-existent object with default + result = self.mmif_obj.get('v999', default=None) + self.assertIsNone(result) + + # Non-existent with custom default + default_value = "not found" + result = self.mmif_obj.get('v999', default=default_value) + self.assertEqual(default_value, result) + + def test_mixed_access_patterns(self): + """Test that different access patterns can be used together.""" + # Integer access on list + first_view = self.mmif_obj.views[0] + + # String access on high-level container + specific_view = self.mmif_obj['v5'] + + # All should work + self.assertEqual('v1', first_view.id) + self.assertEqual('v5', specific_view.id) + + def test_datalist_all_collections(self): + """Test that all DataList subclasses behave consistently.""" + # ViewsList + view_by_int = self.mmif_obj.views[0] + view_by_container = self.mmif_obj[view_by_int.id] + self.assertEqual(view_by_int.id, view_by_container.id) + + # DocumentsList + if len(self.mmif_obj.documents) > 0: + doc_by_int = self.mmif_obj.documents[0] + doc_by_container = self.mmif_obj[doc_by_int.id] + self.assertEqual(doc_by_int.id, doc_by_container.id) + + # AnnotationsList + view = self.mmif_obj['v4'] + if len(view.annotations) > 0: + ann_by_int = view.annotations[0] + ann_by_container = view[ann_by_int.id] + self.assertEqual(ann_by_int.id, ann_by_container.id) def test_update(self): other_contains = """{ From d0024a607f7c9f86d840f90147521c6fe0b1663d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 18 Nov 2025 23:16:32 +0000 Subject: [PATCH 5/7] Initial plan From ad8d4e2d3d6eb354a68c1529dd66e6c29c37b308 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 18 Nov 2025 23:21:29 +0000 Subject: [PATCH 6/7] Address PR feedback: simplify returns, use KeyError, add pathlib validation Co-authored-by: keighrim <9062727+keighrim@users.noreply.github.com> --- mmif/serialize/mmif.py | 5 +---- mmif/serialize/model.py | 4 ++-- tests/mmif_examples.py | 10 +++++++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index e8957682..004be991 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -868,7 +868,4 @@ def __getitem__(self, item: str) -> Union[Document, View, Annotation, MmifMetada ret = self.views._items.get(item) if ret is None: raise KeyError(f"Object with ID {item} not found in the MMIF object. ") - else: - return ret - else: - return ret + return ret diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 3c5e6986..b052b0fb 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -337,14 +337,14 @@ def __contains__(self, key: str) -> bool: try: self.__getitem__(key) return True - except (TypeError, AttributeError, KeyError): + except (TypeError, KeyError): return False def __getitem__(self, key) -> Any: if key in self._named_attributes(): value = self.__dict__[key] elif self._unnamed_attributes is None: - raise AttributeError(f"Additional properties are disallowed by {self.__class__}: {key}") + raise KeyError(f"Additional properties are disallowed by {self.__class__}: {key}") else: value = self._unnamed_attributes[key] if key not in self._required_attributes and self.is_empty(value): diff --git a/tests/mmif_examples.py b/tests/mmif_examples.py index 7e860c0a..b19f9d9a 100644 --- a/tests/mmif_examples.py +++ b/tests/mmif_examples.py @@ -1,6 +1,7 @@ import itertools import os import subprocess +from pathlib import Path from string import Template from urllib import request @@ -20,8 +21,11 @@ def _load_from_url_or_git(url): If LOCALMMIF env var is set, use git show to load from local repo. LOCALMMIF should be the path to the local mmif repository. """ - localmmif = os.environ.get('LOCALMMIF') - if localmmif: + localmmif_str = os.environ.get('LOCALMMIF') + if localmmif_str: + localmmif = Path(localmmif_str) + if not localmmif.is_dir(): + raise ValueError(f"LOCALMMIF path is not a valid directory: {localmmif}") # Extract the version/branch and file path from the URL # URL format: https://raw.githubusercontent.com/clamsproject/mmif/{version}/{filepath} url_prefix = "https://raw.githubusercontent.com/clamsproject/mmif/" @@ -35,7 +39,7 @@ def _load_from_url_or_git(url): try: result = subprocess.run( ['git', 'show', git_ref], - cwd=localmmif, + cwd=str(localmmif), capture_output=True, text=True, check=True From e32ec869d1bb500f2ed0773fc0d33f729fb6c86f Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Tue, 18 Nov 2025 19:28:47 -0500 Subject: [PATCH 7/7] fixed syntax error in docstrings --- .gitignore | 2 + mmif/serialize/annotation.py | 42 ++++++++++--------- mmif/serialize/mmif.py | 64 ++++++++++++++++------------- mmif/serialize/model.py | 80 ++++++++++++++++++++---------------- mmif/serialize/view.py | 54 +++++++++++++----------- 5 files changed, 137 insertions(+), 105 deletions(-) diff --git a/.gitignore b/.gitignore index 84afd451..5dd9b09b 100644 --- a/.gitignore +++ b/.gitignore @@ -82,3 +82,5 @@ mmif/vocabulary documentation/_build/ /VERSION +_issues + diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index c7c00f3d..6f1471aa 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -276,15 +276,17 @@ def get(self, prop_name: str, default=None): Examples -------- - >>> # Access annotation properties: - >>> label = annotation.get('label', default='unknown') - >>> start_time = annotation.get('start', default=0) - >>> - >>> # Access @type: - >>> at_type = annotation.get('@type') - >>> - >>> # Safe access with custom default: - >>> targets = annotation.get('targets', default=[]) + .. code-block:: python + + # Access annotation properties: + label = annotation.get('label', default='unknown') + start_time = annotation.get('start', default=0) + + # Access @type: + at_type = annotation.get('@type') + + # Safe access with custom default: + targets = annotation.get('targets', default=[]) See Also -------- @@ -424,16 +426,18 @@ def get(self, prop_name, default=None): Examples -------- - >>> # Access document properties: - >>> mime = document.get('mime', default='application/octet-stream') - >>> location = document.get('location') - >>> - >>> # Access properties added after creation (pending): - >>> author = document.get('author', default='anonymous') - >>> publisher = document.get('publisher') - >>> - >>> # Access ephemeral properties from Annotation objects: - >>> sentiment = document.get('sentiment', default='neutral') + .. code-block:: python + + # Access document properties: + mime = document.get('mime', default='application/octet-stream') + location = document.get('location') + + # Access properties added after creation (pending): + author = document.get('author', default='anonymous') + publisher = document.get('publisher') + + # Access ephemeral properties from Annotation objects: + sentiment = document.get('sentiment', default='neutral') See Also -------- diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index 004be991..2b759765 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -156,19 +156,23 @@ class Mmif(MmifObject): -------- Accessing objects by ID (high-level, convenient): - >>> mmif = Mmif(mmif_json) - >>> doc = mmif['m1'] # Get document by ID - >>> view = mmif['v1'] # Get view by ID - >>> ann = mmif['v1:a1'] # Get annotation by long-form ID - >>> - >>> # Safe access with default: - >>> doc = mmif.get('m99', default=None) + .. code-block:: python + + mmif = Mmif(mmif_json) + doc = mmif['m1'] # Get document by ID + view = mmif['v1'] # Get view by ID + ann = mmif['v1:a1'] # Get annotation by long-form ID + + # Safe access with default: + doc = mmif.get('m99', default=None) Accessing via underlying lists (positional access): - >>> first_doc = mmif.documents[0] # First document - >>> last_view = mmif.views[-1] # Last view - >>> all_views = mmif.views[1:4] # Slice of views + .. code-block:: python + + first_doc = mmif.documents[0] # First document + last_view = mmif.views[-1] # Last view + all_views = mmif.views[1:4] # Slice of views """ def __init__(self, mmif_obj: Optional[Union[bytes, str, dict]] = None, *, validate: bool = True) -> None: @@ -823,27 +827,31 @@ def __getitem__(self, item: str) -> Union[Document, View, Annotation, MmifMetada -------- High-level access by ID: - >>> mmif = Mmif(mmif_json) - >>> - >>> # Access documents: - >>> doc = mmif['m1'] # Returns Document with ID 'm1' - >>> - >>> # Access views: - >>> view = mmif['v1'] # Returns View with ID 'v1' - >>> - >>> # Access annotations (long-form ID): - >>> ann = mmif['v1:a1'] # Returns Annotation from view v1 - >>> - >>> # Access attributes: - >>> metadata = mmif['metadata'] # Returns MmifMetadata object - >>> - >>> # Will raise KeyError: - >>> doc = mmif['nonexistent'] # KeyError! + .. code-block:: python + + mmif = Mmif(mmif_json) + + # Access documents: + doc = mmif['m1'] # Returns Document with ID 'm1' + + # Access views: + view = mmif['v1'] # Returns View with ID 'v1' + + # Access annotations (long-form ID): + ann = mmif['v1:a1'] # Returns Annotation from view v1 + + # Access attributes: + metadata = mmif['metadata'] # Returns MmifMetadata object + + # Will raise KeyError: + doc = mmif['nonexistent'] # KeyError! For list-style positional access, use the underlying collections: - >>> first_doc = mmif.documents[0] # Integer index - >>> second_view = mmif.views[1] # Integer index + .. code-block:: python + + first_doc = mmif.documents[0] # Integer index + second_view = mmif.views[1] # Integer index See Also -------- diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index b052b0fb..1bec7b29 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -370,13 +370,15 @@ def get(self, obj_id, default=None): -------- Safe access pattern (works on all MmifObject subclasses): - >>> # On Mmif objects: - >>> view = mmif.get('v1', default=None) # Returns None if not found - >>> doc = mmif.get('doc1', default=None) - >>> - >>> # On Annotation/Document objects: - >>> label = annotation.get('label', default='unknown') - >>> author = document.get('author', default='anonymous') + .. code-block:: python + + # On Mmif objects: + view = mmif.get('v1', default=None) # Returns None if not found + doc = mmif.get('doc1', default=None) + + # On Annotation/Document objects: + label = annotation.get('label', default='unknown') + author = document.get('author', default='anonymous') See Also -------- @@ -463,16 +465,20 @@ def get(self, key: str, default=None) -> Optional[T]: -------- Old pattern (deprecated, do not use): - >>> view = mmif.views.get('v1') # DeprecationWarning! + .. code-block:: python + + view = mmif.views.get('v1') # DeprecationWarning! New patterns to use instead: - >>> # For ID-based access, use container: - >>> view = mmif['v1'] - >>> # Or with safe access: - >>> view = mmif.get('v1', default=None) - >>> # For positional access: - >>> view = mmif.views[0] + .. code-block:: python + + # For ID-based access, use container: + view = mmif['v1'] + # Or with safe access: + view = mmif.get('v1', default=None) + # For positional access: + view = mmif.views[0] See Also -------- @@ -527,20 +533,22 @@ def __getitem__(self, key: Union[int, slice]) -> Union[T, List[T]]: -------- Positional access (pythonic list behavior): - >>> # Get first view: - >>> first_view = mmif.views[0] - >>> - >>> # Get last document: - >>> last_doc = mmif.documents[-1] - >>> - >>> # Slice to get multiple elements: - >>> first_three_views = mmif.views[0:3] - >>> - >>> # This will raise TypeError: - >>> view = mmif.views['v1'] # TypeError! - >>> - >>> # For ID-based access, use container: - >>> view = mmif['v1'] # Correct way + .. code-block:: python + + # Get first view: + first_view = mmif.views[0] + + # Get last document: + last_doc = mmif.documents[-1] + + # Slice to get multiple elements: + first_three_views = mmif.views[0:3] + + # This will raise TypeError: + view = mmif.views['v1'] # TypeError! + + # For ID-based access, use container: + view = mmif['v1'] # Correct way """ if isinstance(key, (int, slice)): # Python's dicts preserve insertion order since 3.7. @@ -596,13 +604,15 @@ def get(self, key: T, default=None) -> Optional[S]: Examples -------- - >>> # Access contains metadata: - >>> timeframe_meta = view.metadata.contains.get(AnnotationTypes.TimeFrame) - >>> if timeframe_meta is None: - ... print("No TimeFrame annotations in this view") - >>> - >>> # With custom default: - >>> value = some_dict.get('key', default={}) + .. code-block:: python + + # Access contains metadata: + timeframe_meta = view.metadata.contains.get(AnnotationTypes.TimeFrame) + if timeframe_meta is None: + print("No TimeFrame annotations in this view") + + # With custom default: + value = some_dict.get('key', default={}) """ return self._items.get(key, default) diff --git a/mmif/serialize/view.py b/mmif/serialize/view.py index 9b419b80..80b7a65b 100644 --- a/mmif/serialize/view.py +++ b/mmif/serialize/view.py @@ -45,18 +45,22 @@ class View(MmifObject): -------- Accessing annotations by ID (high-level, convenient): - >>> view = mmif['v1'] - >>> ann = view['v1:a1'] # Get annotation by ID - >>> doc = view['v1:td1'] # Get document by ID - >>> - >>> # Safe access with default: - >>> ann = view.get('v1:a999', default=None) + .. code-block:: python + + view = mmif['v1'] + ann = view['v1:a1'] # Get annotation by ID + doc = view['v1:td1'] # Get document by ID + + # Safe access with default: + ann = view.get('v1:a999', default=None) Accessing via underlying list (positional access): - >>> first_ann = view.annotations[0] # First annotation - >>> last_ann = view.annotations[-1] # Last annotation - >>> some_anns = view.annotations[1:5] # Slice of annotations + .. code-block:: python + + first_ann = view.annotations[0] # First annotation + last_ann = view.annotations[-1] # Last annotation + some_anns = view.annotations[1:5] # Slice of annotations """ def __init__(self, view_obj: Optional[Union[bytes, str, dict]] = None, parent_mmif=None, *_) -> None: @@ -340,23 +344,27 @@ def __getitem__(self, key: str) -> 'Annotation': -------- High-level access by ID: - >>> view = mmif['v1'] - >>> - >>> # Access annotations: - >>> ann = view['v1:a1'] # Returns Annotation with ID 'v1:a1' - >>> - >>> # Access text documents in view: - >>> doc = view['v1:td1'] # Returns Document with ID 'v1:td1' - >>> - >>> # Access attributes: - >>> metadata = view['metadata'] # Returns ViewMetadata object - >>> - >>> # Will raise KeyError: - >>> ann = view['nonexistent'] # KeyError! + .. code-block:: python + + view = mmif['v1'] + + # Access annotations: + ann = view['v1:a1'] # Returns Annotation with ID 'v1:a1' + + # Access text documents in view: + doc = view['v1:td1'] # Returns Document with ID 'v1:td1' + + # Access attributes: + metadata = view['metadata'] # Returns ViewMetadata object + + # Will raise KeyError: + ann = view['nonexistent'] # KeyError! For list-style positional access, use the underlying collection: - >>> first_ann = view.annotations[0] # Integer index + .. code-block:: python + + first_ann = view.annotations[0] # Integer index See Also --------