diff --git a/changes/35.feature b/changes/35.feature
new file mode 100644
index 0000000..bc5bae4
--- /dev/null
+++ b/changes/35.feature
@@ -0,0 +1,5 @@
+Handle unbound XML prefixes when strict=False
+
+Allow for unbound XML prefixes when parsing with strict=False.
+This is useful for handling XML documents that may have missing
+namespace declarations.
diff --git a/didl_lite/didl_lite.py b/didl_lite/didl_lite.py
index 13c0d4e..de42326 100644
--- a/didl_lite/didl_lite.py
+++ b/didl_lite/didl_lite.py
@@ -2,6 +2,7 @@
 """DIDL-Lite (Digital Item Declaration Language) tools for Python."""
 # pylint: disable=too-many-lines
 
+import re
 from typing import (
     Any,
     Dict,
@@ -1073,7 +1074,38 @@ def to_xml_string(*objects: DidlObject) -> bytes:
 def from_xml_string(
     xml_string: str, strict: bool = True
 ) -> List[Union[DidlObject, Descriptor]]:
-    """Convert XML string to DIDL Objects."""
+    """Parse a DIDL-Lite XML string into DIDL objects.
+
+    With ``strict=False``, prefixes that appear on element tags without a
+    matching ``xmlns:`` declaration are bound to placeholder namespaces on
+    the root element, so the XML parser accepts the document.
+    """
+    if not strict:
+        ncname = r"[A-Za-z_][\w.-]*"
+
+        # Prefixes used on element tags, e.g. <prefix:tag>.
+        used_prefixes = set(re.findall(rf"<({ncname}):", xml_string))
+
+        # Prefixes declared anywhere in the document, e.g. xmlns:prefix="...".
+        defined_prefixes = set(re.findall(rf"xmlns:({ncname})\s*=", xml_string))
+
+        # The well-known prefixes never get a placeholder namespace.
+        # NOTE(review): a document that uses one of them without declaring it
+        # is therefore still rejected by the parser - confirm this is intended.
+        missing_prefixes = (
+            used_prefixes - defined_prefixes - {"DIDL-Lite", "dc", "upnp", "dlna"}
+        )
+
+        # Declare the missing prefixes on the root element itself rather than
+        # next to one particular xmlns attribute, which may be absent.
+        root_tag = re.search(r"<[A-Za-z_][^\s/>]*", xml_string)
+        if missing_prefixes and root_tag:
+            declarations = "".join(
+                f' xmlns:{prefix}="http://tempuri.org/{prefix}/"'
+                for prefix in sorted(missing_prefixes)
+            )
+            insert_at = root_tag.end()
+            xml_string = xml_string[:insert_at] + declarations + xml_string[insert_at:]
+
     xml_el = defusedxml.ElementTree.fromstring(xml_string)
     return from_xml_el(xml_el, strict)
 
diff --git a/tests/test_didl_lite.py b/tests/test_didl_lite.py
index f16787c..e4627b0 100644
--- a/tests/test_didl_lite.py
+++ b/tests/test_didl_lite.py
@@ -680,3 +680,64 @@ def test_item_improper_class_nesting(self) -> None:
 
         item = items[0]
         assert isinstance(item, didl_lite.MusicTrack)
+
+    def test_from_xml_string_unbound_prefix(self) -> None:
+        """Test from_xml_string with unbound namespace prefix."""
+        # The key fix is adding: xmlns="urn:schemas-upnp-org:metadata-1-0/DIDL-Lite/"
+        broken_xml = (
+            '<DIDL-Lite xmlns="urn:schemas-upnp-org:metadata-1-0/DIDL-Lite/" '
+            'xmlns:dc="http://purl.org/dc/elements/1.1/" '
+            'xmlns:upnp="urn:schemas-upnp-org:metadata-1-0/upnp/" '
+            'xmlns:dlna="urn:schemas-dlna-org:metadata-1-0/">'
+            '<item id="1" parentID="0" restricted="0">'
+            "<dc:title>Test Title</dc:title>"
+            "<song:subTitle>Test Subtitle</song:subTitle>"
+            "<upnp:class>object.item.audioItem.musicTrack</upnp:class>"
+            "</item>"
+            "</DIDL-Lite>"
+        )
+
+        # This call should not throw ParseError when strict=False.
+        objs = didl_lite.from_xml_string(broken_xml, strict=False)
+
+        assert len(objs) == 1
+        assert objs[0].title == "Test Title"
+        # Check that the temporary namespace is correctly assigned.
+        assert "sub_title" in objs[0].__dict__
+        assert objs[0].sub_title == "Test Subtitle"
+        assert isinstance(objs[0], didl_lite.MusicTrack)
+
+    def test_from_xml_string_unbound_prefix_without_dlna(self) -> None:
+        """Test unbound prefix handling when xmlns:dlna is not declared."""
+        broken_xml = (
+            '<DIDL-Lite xmlns="urn:schemas-upnp-org:metadata-1-0/DIDL-Lite/" '
+            'xmlns:dc="http://purl.org/dc/elements/1.1/" '
+            'xmlns:upnp="urn:schemas-upnp-org:metadata-1-0/upnp/">'
+            '<item id="1" parentID="0" restricted="0">'
+            "<dc:title>Test Title</dc:title>"
+            "<song:subTitle>Test Subtitle</song:subTitle>"
+            "<upnp:class>object.item.audioItem.musicTrack</upnp:class>"
+            "</item>"
+            "</DIDL-Lite>"
+        )
+
+        objs = didl_lite.from_xml_string(broken_xml, strict=False)
+
+        assert len(objs) == 1
+        assert objs[0].sub_title == "Test Subtitle"
+
+    def test_music_track_artist_and_genre(self) -> None:
+        """Test MusicTrack artist and genre properties."""
+        track = didl_lite.MusicTrack(
+            id="1",
+            parent_id="0",
+            title="Test",
+            restricted="0",
+            artist="My Artist",
+            genre="My Genre",
+        )
+
+        xml = didl_lite.to_xml_string(track)
+
+        assert b"My Artist" in xml
+        assert b"My Genre" in xml