From 4170b154118e39087df3f88f56b4f88e0af8992a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Thu, 6 Feb 2025 17:31:32 +0100 Subject: [PATCH 1/4] videos: migration required transform rules --- README.rst | 15 + cds_migrator_kit/videos/__init__.py | 2 +- .../videos/weblecture_migration/__init__.py | 2 +- .../videos/weblecture_migration/cli.py | 3 +- .../weblecture_migration/load/__init__.py | 2 +- .../videos/weblecture_migration/load/load.py | 7 - .../videos/weblecture_migration/streams.py | 2 +- .../videos/weblecture_migration/streams.yaml | 20 +- .../transform/__init__.py | 2 +- .../transform/models/__init__.py | 2 +- .../transform/models/base.py | 2 +- .../transform/models/video_lecture.py | 199 ++++++----- .../transform/transform.py | 84 ++++- .../transform/xml_processing/__init__.py | 2 +- .../xml_processing/quality/__init__.py | 2 +- .../xml_processing/quality/contributors.py | 21 +- .../xml_processing/rules/__init__.py | 2 +- .../transform/xml_processing/rules/base.py | 24 +- .../xml_processing/rules/video_lecture.py | 200 +++++++++-- tests/cds-videos/__init__.py | 9 + tests/cds-videos/data/lecture.json | 26 ++ .../cds-videos/test_videos_transform_rules.py | 316 ++++++++++++++++++ tests/helpers.py | 43 +++ 23 files changed, 824 insertions(+), 163 deletions(-) create mode 100644 tests/cds-videos/__init__.py create mode 100644 tests/cds-videos/data/lecture.json create mode 100644 tests/cds-videos/test_videos_transform_rules.py diff --git a/README.rst b/README.rst index df521510..4305c175 100644 --- a/README.rst +++ b/README.rst @@ -7,6 +7,21 @@ cds-migrator-kit ================== + +TODO change here: + + +Default Installation (without RDM or Videos) +pip install . 
+ +Install for RDM + +pip install .[rdm] + +Install for Videos + +pip install .[videos] + To run the interface: ``` gunicorn -b :8080 --timeout 120 --graceful-timeout 60 cds_migrator_kit.app:app diff --git a/cds_migrator_kit/videos/__init__.py b/cds_migrator_kit/videos/__init__.py index d425caae..a9f8adc2 100644 --- a/cds_migrator_kit/videos/__init__.py +++ b/cds_migrator_kit/videos/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2024 CERN. +# Copyright (C) 2025 CERN. # # cds-migrator-kit is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/cds_migrator_kit/videos/weblecture_migration/__init__.py b/cds_migrator_kit/videos/weblecture_migration/__init__.py index 61138b57..31cfe2d5 100644 --- a/cds_migrator_kit/videos/weblecture_migration/__init__.py +++ b/cds_migrator_kit/videos/weblecture_migration/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2024 CERN. +# Copyright (C) 2025 CERN. # # CDS-Videos is free software; you can redistribute it and/or modify it under # the terms of the MIT License; see LICENSE file for more details. diff --git a/cds_migrator_kit/videos/weblecture_migration/cli.py b/cds_migrator_kit/videos/weblecture_migration/cli.py index 13d6e954..64d2f450 100644 --- a/cds_migrator_kit/videos/weblecture_migration/cli.py +++ b/cds_migrator_kit/videos/weblecture_migration/cli.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2022 CERN. +# Copyright (C) 2025 CERN. # # CDS-Videos is free software; you can redistribute it and/or modify it under # the terms of the MIT License; see LICENSE file for more details. 
@@ -44,5 +44,6 @@ def run(dry_run=False): stream_definitions=[RecordStreamDefinition], config_filepath=Path(stream_config).absolute(), dry_run=dry_run, + collection="weblectures", ) runner.run() diff --git a/cds_migrator_kit/videos/weblecture_migration/load/__init__.py b/cds_migrator_kit/videos/weblecture_migration/load/__init__.py index cad2a5c0..7030a9d9 100644 --- a/cds_migrator_kit/videos/weblecture_migration/load/__init__.py +++ b/cds_migrator_kit/videos/weblecture_migration/load/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2024 CERN. +# Copyright (C) 2025 CERN. # # CDS-Videos is free software; you can redistribute it and/or modify it under # the terms of the MIT License; see LICENSE file for more details. diff --git a/cds_migrator_kit/videos/weblecture_migration/load/load.py b/cds_migrator_kit/videos/weblecture_migration/load/load.py index 3ad5d23a..13d6db5d 100644 --- a/cds_migrator_kit/videos/weblecture_migration/load/load.py +++ b/cds_migrator_kit/videos/weblecture_migration/load/load.py @@ -17,17 +17,10 @@ def __init__( db_uri, data_dir, tmp_dir, - existing_data=False, entries=None, dry_run=False, ): """Constructor.""" - self.db_uri = db_uri - - self.data_dir = data_dir - self.tmp_dir = tmp_dir - self.existing_data = existing_data - self.entries = entries self.dry_run = dry_run def _prepare(self, entry): diff --git a/cds_migrator_kit/videos/weblecture_migration/streams.py b/cds_migrator_kit/videos/weblecture_migration/streams.py index 0fb53431..94e11af6 100644 --- a/cds_migrator_kit/videos/weblecture_migration/streams.py +++ b/cds_migrator_kit/videos/weblecture_migration/streams.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2022 CERN. +# Copyright (C) 2025 CERN. # # CDS-Videos is free software; you can redistribute it and/or modify it under # the terms of the MIT License; see LICENSE file for more details. 
diff --git a/cds_migrator_kit/videos/weblecture_migration/streams.yaml b/cds_migrator_kit/videos/weblecture_migration/streams.yaml index 055c93da..5d3eb7c1 100644 --- a/cds_migrator_kit/videos/weblecture_migration/streams.yaml +++ b/cds_migrator_kit/videos/weblecture_migration/streams.yaml @@ -1,12 +1,10 @@ -data_dir: cds_migrator_kit/videos/weblecture_migration/data/ -tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp -state_dir: cds_migrator_kit/videos/weblecture_migration/cache -log_dir: cds_migrator_kit/videos/weblecture_migration/log -db_uri: postgresql://cds-rdm:cds-rdm@localhost:5432/cds-rdm # TODO CHANGE -old_secret_key: CHANGE_ME # TODO CHANGE -new_secret_key: CHANGE_ME # TODO CHANGE +db_uri: postgresql://cds-videos:cds-videos@localhost:5432/cds-videos records: - extract: - dirpath: cds_migrator_kit/videos/weblecture_migration/data/weblectures/dump/ - transform: - files_dump_dir: cds_migrator_kit/videos/weblecture_migration/data/weblectures/files/ + weblectures: + data_dir: cds_migrator_kit/videos/weblecture_migration/data/ + tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp + log_dir: cds_migrator_kit/videos/weblecture_migration/log + extract: + dirpath: cds_migrator_kit/videos/weblecture_migration/data/weblectures/dump/ + transform: + files_dump_dir: cds_migrator_kit/videos/weblecture_migration/data/weblectures/files/ diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/__init__.py index a8216c0e..faebe66e 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/__init__.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2022 CERN. +# Copyright (C) 2025 CERN. # # CDS-Videos is free software; you can redistribute it and/or modify it under # the terms of the MIT License; see LICENSE file for more details. 
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/models/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/models/__init__.py index 6f9f543f..d1a351f4 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/models/__init__.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/models/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2022 CERN. +# Copyright (C) 2025 CERN. # # CDS-Videos is free software; you can redistribute it and/or modify it under # the terms of the MIT License; see LICENSE file for more details. diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/models/base.py b/cds_migrator_kit/videos/weblecture_migration/transform/models/base.py index 5885950c..1b01058c 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/models/base.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/models/base.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2022 CERN. +# Copyright (C) 2025 CERN. # # CDS-Videos is free software; you can redistribute it and/or modify it under # the terms of the MIT License; see LICENSE file for more details. diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py index a2d63dbc..09af17b8 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of CERN Document Server. -# Copyright (C) 2024 CERN. +# Copyright (C) 2025 CERN. 
# # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as @@ -30,130 +30,177 @@ class VideoLecture(CdsOverdo): __query__ = '8567_.x:"Absolute master path" 8567_.d:/mnt/master_share* -980__.C:MIGRATED -980__.c:DELETED -5831_.a:digitized' __ignore_keys__ = { - "110__a", # corporate author - "8567_y", # File description - "111__c", # Video location (indico location) - "518__r", # Video/meeting location + "003", + # 340: Drop streaming video, anything else we copy over to curation field. "340__a", # Physical medium -> curation field "340__d", # Physical medium/recording technique -> curation field + "340__9", # Physical medium/CD-ROM -> curation field + "340__k", # Physical medium/ -> curation field + "340__j", # Physical medium/ -> curation field + "340__8", # Physical medium/id? -> curation field https://cds.cern.ch/record/2234827 + # check with JY "961__x", # Creation Date TODO? check with JY "961__c", # modification Date TODO? check with JY "961__h", # Hour? TODO? check with JY "961__l", # Library? TODO? check with JY + "961__a", # ? TODO? check with JY + "961__b", # ? TODO? check with JY "964__a", # Item owner TODO? check with JY "916__d", # Status week? TODO? check with JY "901__u", # Affiliation at Conversion? TODO? check with JY "583__a", # Action note / curation TODO? check with JY "583__c", # Action note / curation TODO? check with JY "583__z", # Action note / curation TODO? check with JY + "916__n", # Status week TODO? check with JY + "916__s", # Status week TODO? check with JY + "916__w", # Status week TODO? check with JY + "916__y", # Status week TODO? check with JY + "916__a", # Status week TODO? check with JY + "306__a", # ? TODO? check with JY + "336__a", # ? TODO? check with JY + "981__a", # duplicate record id TODO? check with JY + "916__a", # Status week TODO? check with JY + "916__d", # Status week TODO? check with JY + "916__e", # Status week TODO? 
check with JY + "916__s", # Status week TODO? check with JY + "916__w", # Status week TODO? check with JY + "916__y", # Status week TODO? check with JY + "960__a", # Base? + # Category, Collection, Series, Keywords + "980__a", # collection tag + "980__b", # Secondary collection indicator "65027a", # TODO Subject category = Event? + "490__a", # TODO Series + "490__v", # Series: volume + "650172", # subject provenance + "65017a", # subject value + "6531_9", # keyword provenance + "6531_a", # keyword + "690C_a", # collection name + # Conference Information/Indico "111__a", # Title (indico) "111__9", # Start date (indico) "111__g", # Event id (indico) "111__z", # End date (indico) + "111__c", # Video location (indico location) "084__a", # Indico? "084__2", # Indico? - "8567_2", # File system? 'MediaArchive' - "980__b", # Secondary collection indicator + "518__r", # Video/meeting location + "518__g", # Lectures: conference identification + "970__a", # alternative identifier, indico id? + # Copyright/License "542__d", # Copyright holder "542__g", # Copyright date - "490__a", # TODO Series - "8567_u", # File url + "542__3", # Copyright materials + "540__a", # License + "540__b", # License person/organization + "540__u", # License URL + "540__3", # License material + # Alternative identifiers "962__n", # `Presented at` note (conference/linked document) "962__b", # `Presented at` record (conference/linked document) - "518__g", # Lectures: conference identification - "490__v", # Series: volume - "269__b", # Name of publ. "088__9", # Report number (make it alternative identifier with cds reference?) "088__z", # Report number (make it alternative identifier with cds reference?) 
- # Files - "8564_q", # File type (digitized) # TODO this record has both lecturemedia and DM https://cds.cern.ch/record/589875 + "035__9", # Inspire schema (Indico/AgendaMaker) + "035__a", # Inspire id value + "088__a", # Report Number --> alternative identifier with ds reference + # Additional Title, Volume, Note + "246__a", # Additional title + "246__i", # Additional title/display text + "246__b", # Additional title remaining + "246__n", # Volume + "246__p", # Volume + "500__a", # Note (-> internal note) + "500__b", # Note (-> internal note) + "500__9", # Note/type (-> internal note) https://cds.cern.ch/record/1561636 + # Restricted + "5061_f", + "5061_d", + "5061_5", + "5061_a", + "5061_2", + # Location (Shelving/Library) "852__c", # Location (Shelving/Library) + "852__b", # Location (Shelving/?) + "852__8", # Location (Shelving/id?) https://cds.cern.ch/record/2234827 "852__h", # Location (Shelving) example: https://cds.cern.ch/record/254588/ "852__a", # Location (Shelving) example: https://cds.cern.ch/record/558348 "852__x", # Location (Shelving/ type? DVD) example: https://cds.cern.ch/record/690000/ + "852__9", # Location (Shelving/ note?) example: https://cds.cern.ch/record/2233722 + # Date/Extra Reduntant + "260__c", # Redundant (more detailed value is in 269__c imprint.pub_date) + "260__a", + "260__b", + # Contributor? + "110__a", # corporate author + "700__m", # author's email + "270__p", # document contact --> add as a contributor with a correct role + # Internal Note + "595__a", # Internal Note --> curation field + "595__z", # SOME RECORD HAVE UNCL as value, do we keep it? what does UNCL mean + # Collaboration --> add new role to contributors + "710__5", # department / organisation author + "710__a", # organisation author + "710__g", # organisation author + # Accelerator/Facility, Experiment, Project, Study + "693__a", # accelerator, create a custom field? 
+ "693__e", # experiments + "693__p", # project + "693__s", # study + # Submitter + "859__f", # creator's email + "8560_f", # submitter email + # OAI + "0248_a", # oai identifier + "0248_p", # oai identifier + "0248_q", # IGNORE "518__h", # Lectures: Starting time "300__2", # Imprint "300__b", # Imprint + "300__8", # Imprint "300__a", # Number of pages / duration "250__a", # Edition "700__0", # Author id (eg: AUTHOR|(CDS)2067852) "518__l", # Lectures: length of speech - # TODO copied from ssn - "0248_a", # oai identifier, not needed to migrate, TBD - "0248_p", # oai identifier, not needed to migrate, TBD - "0248_q", # full text tag 2778897 - "100__m", # author's email <-- decided not to keep in RDM, - "260__c", # Redundant (more detailed value is in 269__c imprint.pub_date) - "269__a", # imprint place - "270__m", # document contact email - "595__a", # always value CERN EDS, not displayed, TODO: do we keep? - "595__z", # SOME RECORD HAVE UNCL as value, do we keep it? what does UNCL mean - "700__m", # author's email <-- decided not to keep in RDM, - "710__5", # department / organisation author - "710__a", # organisation author - "8564_8", # Files system field - "8564_s", # Files system field - "8564_u", # Files - "8564_x", # Files system field - "8564_y", # Files - "937__c", # modification date - "937__s", # modification person - "960__a", # collection id? usually value 12, to confirm if we ignore - "980__a", # collection tag - "981__a", # duplicate record id - "003", - "035__9", # Inspire schema - "035__a", # Inspire id value - "037__a", # (Report number) alternative identifiers -> scheme "CDS REFERENCE" - "088__a", # RN (manual introduced?) second report number (so the identifiers schemas are not unique!) 
- "246__a", - "246__i", # abbreviation - "246__i", # abbreviation tag, applies to value of 246__A - "270__p", # document contact person name - "500__a", # Note (-> description.type = other) - "562__c", # note - "650172", # subject provenance - "65017a", # subject value - "6531_9", # keyword provenance - "6531_a", # keyword - "690C_a", # collection name, not needed values(to drop: INTNOTE, CERN, otherwise parse PUBL to retrieve the department, if department not present in any other field) - "6931_9", # keyword - "6931_a", # keyword - "693__a", # accelerator, do we create a custom field? - "693__b", # beams recid: 2640381 - "693__e", # custom_fields.cern:experiments - "693__f", # facility, do we create a custom field? - "693__p", # project, do we create a custom field? - "693__s", # study, do we create a custom field? - "710__g", # Collaboration, OK to migrate as corporate contributor (not creator)? - "859__f", # creator's email, to be used to determine the owner - "916__n", - "916__s", - "916__w", - "963__a", - "970__a", # alternative identifier, scheme ALEPH + "100__0", # Author id (eg: AUTHOR|(CDS)2067852) + "240__a", # Decided to drop, (Streaming Video) + "337__a", # Decided to drop, (Video) + "963__a", # values: PUBLIC/RESTRICTED + "8564_8", # File: bibdoc id + "8564_s", # File: file size # IMPLEMENTED # "520__a", # Note (-> description.type = abstract # "001", # "041__a", # languages - # "906__p", # names, is it supervisor? + # "906__p", # event speakers # "100__9", # #BEARD# tag # "100__a", # "100__u", # Author affiliation - # "700__e", # Contributor/Speaker role - # "700__0", # Contributors (cds author id) - TBD if we keep, same with INSPIRE ID + # "700__e", # Contributor/Speaker role + # "700__0", # Contributors (cds author id) # "700__9", # #BEARD# tag # "700__a", # Contributors (full name) # "700__u", # Contributors (affiliation) - # "518__d", # Full date/time - # "269__c", # Date (full date/year) - # "518__a", # date? 
+ # "518__d", # Full date/time + # "269__c", # Date (full date/year) + # "269__b", # CERN (checked for other values) + # "269__a", # Geneva (checked for other values) + # "518__a", # Date + # "906__u", # Contributor Affiliation + # "511__u", # Contributor Affiliation + # "8567_u", # File url + # "8567_y", # File description + # "8567_2", # File system? 'MediaArchive' + # "8564_q", # File type (digitized) + # "8564_8", # Files system field + # "8564_s", # Files system field + # "8564_u", # Files + # "8564_x", # Files system field + # "8564_y", # Files } model = VideoLecture( - bases=(base_model,), - entry_point_group="cds_migrator_kit.videos.rules.video_lecture", + bases=(base_model,), entry_point_group="cds_migrator_kit.videos.rules.video_lecture" ) diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py index f17c2890..dfbee8ad 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py @@ -96,18 +96,52 @@ def _media_files(self, entry): def _metadata(self, entry): """Transform the metadata of a record.""" + def extract_dates(json_data, key, subkey=None): + """Extracts date values from a given key in json_data.""" + items = json_data.get(key, []) + if subkey: + return { + item[subkey]["date"] + for item in items + if isinstance(item, dict) + and subkey in item + and isinstance(item[subkey], dict) + and "date" in item[subkey] + } + + return { + item["date"] + for item in items + if isinstance(item, dict) and "date" in item + } + def reformat_date(json_data): """Reformat the date for the cds-videos data model.""" - dates = json_data.get("date", []) - dates_set = {date for date in dates if date is not None} + # 1. 
Check primary date field + dates_set = {date for date in json_data.get("date", []) if date} - if len(dates_set) == 1: # Should be only one value - return next(iter(dates_set)) # Get the single date from the set - if len(dates_set) > 1: - return next(iter(dates_set)) # return the first + # 2. If no date found, check `indico_links` + if not dates_set: + dates_set = extract_dates(json_data, "url_files", subkey="indico") - raise UnexpectedValue( - "No valid date found in record: {}.".format(json_data.get("recid")) + # 3. If still no date found, check `internal_notes` + if not dates_set: + dates_set = extract_dates(json_data, "internal_notes") + + # 4. Return the valid date if only one is found + if len(dates_set) == 1: + return next(iter(dates_set)) + + # 5. Multiple dates (Must have different indico event videos?) + if len(dates_set) > 1: + raise UnexpectedValue( + f"More than one date found in record: {json_data.get('recid')} dates: {dates_set}.", + stage="transform", + ) + + raise MissingRequiredField( + f"No valid date found in record: {json_data.get('recid')}.", + stage="transform", ) def description(json_data): @@ -116,11 +150,41 @@ def description(json_data): return json_data.get("title").get("title") return json_data.get("description") + def format_contributors(json_data): + """ + Same contributors could be both in tag 700 and 906. + + TODO: Should we keep them both? https://cds.cern.ch/record/2233152/export/xm?ln=en + Removes duplicate contributors based on name, role, and affiliations. 
+ """ + contributors = json_data.get("contributors") + if not contributors: + raise MissingRequiredField( + f"No valid contributor found in record: {json_data.get('recid')}.", + stage="transform", + ) + + unique_contributors = [] + seen = set() + + for contributor in contributors: + # Create a tuple to identify contributors + identifier = ( + contributor["name"], + contributor.get("role"), + tuple(contributor.get("affiliations", [])), + ) + if identifier not in seen: + seen.add(identifier) + unique_contributors.append(contributor) + + return unique_contributors + metadata = { "title": entry["title"], "description": description(entry), - "contributors": entry.get("contributors"), - "languages": entry.get("language"), + "contributors": format_contributors(entry), + "language": entry.get("language"), "date": reformat_date(entry), } # filter empty keys diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/__init__.py index ce5e1662..29b34a6c 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/__init__.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2022 CERN. +# Copyright (C) 2025 CERN. # # CDS-Videos is free software; you can redistribute it and/or modify it under # the terms of the MIT License; see LICENSE file for more details. diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/__init__.py index d6e355a1..bd60b721 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/__init__.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2022 CERN. 
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py
index 9e5a6577..f9fcd76a 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
@@ -38,4 +38,30 @@ def get_contributor_role(subfield, role, raise_unexpected=False):
     return translations[clean_role]
 
 
-# TODO contributor affiliation will be implemented
+def get_contributor(key, value):
+    """Build a contributor dict from a MARC author-like field.
+
+    Shared by the ``creators`` and ``contributors`` (tag 700) rules.
+
+    :param key: MARC key being translated, used only for error reporting.
+    :param value: subfields of the field. ``a`` (name) must be present, the
+        calling rules guarantee it with ``@require(["a"])``. ``e`` (role),
+        ``u`` (affiliation) and ``9`` (``#BEARD#`` marker) are optional.
+    :returns: ``{"name": ...}`` plus ``role`` and ``affiliations`` when the
+        corresponding subfield is not empty.
+    :raises UnexpectedValue: if subfield ``9`` holds anything but ``#BEARD#``.
+    """
+    beard = value.get("9")
+    if beard is not None and beard != "#BEARD#":
+        # checking if anything else stored in this field
+        # historically it was some kind of automatic script tagging
+        # and it should be ignored if value == #BEARD#
+        raise UnexpectedValue(field=key, subfield="9", value=beard)
+    contributor = {"name": value.get("a").strip()}
+    role = value.get("e", "")
+    if role:
+        contributor["role"] = get_contributor_role("e", role)
+    affiliation = value.get("u", "")
+    if affiliation:
+        contributor["affiliations"] = [affiliation]
+    return contributor
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/__init__.py
index aa43604f..72582f04 100644
---
a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/__init__.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2024 CERN. +# Copyright (C) 2025 CERN. # # cds-migrator-kit is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py index fd751c6f..fc10c171 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2022 CERN. +# Copyright (C) 2025 CERN. # # CDS-Videos is free software; you can redistribute it and/or modify it under # the terms of the MIT License; see LICENSE file for more details. 
@@ -21,7 +21,7 @@ ) from ...models.base import model -from ..quality.contributors import get_contributor_role +from ..quality.contributors import get_contributor @model.over("legacy_recid", "^001") @@ -68,26 +68,12 @@ def language(self, key, value): @require(["a"]) def creators(self, key, value): """Translates the creators field.""" - role = value.get("e") - if role: - role = get_contributor_role("e", role) - beard = value.get("9") - if beard is not None and beard != "#BEARD#": - # checking if anything else stored in this field - # historically it was some kind of automatic script tagging - # and it should be ignored if value == #BEARD# - raise UnexpectedValue(field=key, subfield="9", value=beard) - name = value.get("a").strip() - contributor = {"name": name} - if role: - contributor.update({"role": role}) - # TODO contributor affiliation will be implemented - - return contributor + return get_contributor(key, value) @model.over("contributors", "^700__") +@for_each_value @require(["a"]) def contributors(self, key, value): """Translates contributors.""" - return creators(self, key, value) + return get_contributor(key, value) diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py index 205ac8aa..dd807548 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of CERN Document Server. -# Copyright (C) 2024 CERN. +# Copyright (C) 2025 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as @@ -17,8 +17,11 @@ # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
"""Common Videos fields.""" +import re + from dateutil.parser import ParserError, parse +from cds_migrator_kit.errors import UnexpectedValue from cds_migrator_kit.transform.xml_processing.quality.decorators import ( for_each_value, require, @@ -28,39 +31,61 @@ from ...models.video_lecture import model +def parse_date(date_str): + """Parses a date string into 'YYYY-MM-DD' format. + + Returns None if the string is missing, too short, too long, + or if it contains incomplete or ambiguous date information. + + Examples: + - Some values only contain year (e.g., "1998") + - Some values has date range (e.g., "23 - 27 Nov 1998") + """ + if not date_str: + return None + if len(date_str) < 10 or len(date_str) > 13: + # Too short/long to have the full date info + return None + try: + parsed_date = parse(date_str) + return parsed_date.strftime("%Y-%m-%d") + except ParserError: + return + + @model.over("date", "^518__") @for_each_value def date(self, key, value): """Translates date from tag 518.""" - - def parse_date(date_str): - """Parses a date string into 'YYYY-MM-DD' format.""" - try: - if len(date_str) < 10: # Too short to have the full date info - return None - parsed_date = parse(date_str) - return parsed_date.strftime("%Y-%m-%d") - except ParserError: - return - - # List of possible subfields containing dates - possible_date_fields = [ - value.get("d"), # 518 'd' subfield (e.g., '2024-11-19T14:00:00') - value.get("c"), # 269 'c' subfield (e.g., '1993-08-09') - value.get("a"), # 518 'a' subfield (e.g., 'CERN, Geneva, 23 - 27 Nov 1998') - ] - - for date_field in possible_date_fields: - if date_field: - parsed_date = parse_date(date_field) - if parsed_date: # If parsing succeeds, return the formatted date - return parsed_date + # 518 'd' subfield, take the first 10 char, it might have another character (e.g., 2008-03-11T14:00:00Z) + parsed_date = parse_date((value.get("d") or "")[:10]) + if parsed_date: + return parsed_date + # 518 'a' subfield (e.g., 'CERN, Geneva, 23 - 
27 Nov 1998') + parsed_date = parse_date(value.get("a", "").split(",")[-1]) + if parsed_date: + return parsed_date @model.over("date", "^269__") +@for_each_value def imprint(self, key, value): """Translates date from tag 269.""" - return date(self, key, value) + name = value.get("b") + if name and name.strip().upper() != "CERN": + # checking if anything else stored in this field + # and it should be ignored if value == CERN + raise UnexpectedValue(field=key, subfield="b", value=name) + place = value.get("a") + if place and place.strip().upper() != "GENEVA": + # checking if anything else stored in this field + # and it should be ignored if value == Geneva + raise UnexpectedValue(field=key, subfield="a", value=place) + + date_field = value.get("c") # 269 'c' subfield (e.g., '1993-08-09') + parsed_date = parse_date(date_field) + if parsed_date: # If parsing succeeds, return the formatted date + return parsed_date @model.over("contributors", "^511__") @@ -71,7 +96,9 @@ def performer(self, key, value): name = value.get("a").strip() role = value.get("e") contributor = {"name": name, "role": "Performer"} # TODO or "Participant" - # TODO contributor affiliation will be implemented + affiliation = value.get("u", "") + if affiliation: + contributor.update({"affiliations": [affiliation]}) return contributor @@ -82,5 +109,124 @@ def event_speakers(self, key, value): """Translates event_speakers.""" name = value.get("p").strip() contributor = {"name": name, "role": "Speaker"} - # TODO contributor affiliation will be implemented + affiliation = value.get("u", "") + if affiliation: + contributor.update({"affiliations": [affiliation]}) return contributor + + +@model.over("url_files", "^8564_") +@for_each_value +@require(["u"]) +def url_files(self, key, value): + """Detects 8564 files.""" + url = value.get("u") + if "digital-memory" in url: + return { + "digitized": { + "url": url, + "format": value.get("q"), + "link_text": value.get("y"), + "public_note": value.get("z"), + 
"nonpublic_note": value.get("x"), + "md5_checksum": value.get("w"), + "source": value.get("2"), + } + } + elif "indico" in url or "agenda" in url: + indico_link = {"url": url} + + # Try to get event id + match_id = re.search(r"(?:ida=|confId=|event/)([\w\d]+)", url) + if match_id: + event_id = match_id.group(1) + if event_id: + indico_link["event_id"] = event_id + + # Try to get the date from text + text = value.get("y") + if text: + indico_link["text"] = text + match_date = re.search(r"(?:Talk\s*)?(\d{1,2}\s\w{3}\s\d{4})", text) + if match_date: + parsed_date = parse_date(match_date.group(1)) + if parsed_date: + indico_link["date"] = parsed_date + + return {"indico": indico_link} + + url_file = {"url_file": {"url": url}} + text = value.get("y") + if text: + url_file["url_file"]["text"] = text + + nonpublic_note = value.get("x") + if nonpublic_note: + url_file["url_file"]["nonpublic_note"] = nonpublic_note + return url_file + + +@model.over("internal_notes", "^500__") +@for_each_value +@require(["a"]) +def internal_notes(self, key, value): + """Detects internal notes.""" + note = value.get("a").strip() + if value.get("9"): + note = value.get("9").strip() + " : " + value.get("a").strip() + internal_note = {"note": note} + + parts = note.split(",") + match_date = parts[-1].strip() if len(parts) > 1 else "" + if match_date: + parsed_date = parse_date(match_date) + if parsed_date: + internal_note.update({"date": parsed_date}) + + return internal_note + + +@model.over("files", "^8567_") +@for_each_value +def files(self, key, value): + """Detects files.""" + source = value.get("2") + if source and source.strip() != "MediaArchive": + # Check if anything else stored + raise UnexpectedValue(field=key, subfield="2", value=source) + + file = {} + + # Master path + master_path = value.get("d", "").strip() + if master_path: + if master_path.startswith("/mnt/master_share"): + file["master_path"] = master_path + file_type = value.get("x", "").strip() + if file_type and file_type 
!= "Absolute master path": + # Check if anything else stored + raise UnexpectedValue(field=key, subfield="x", value=file_type) + else: + # Raise error if anything else stored + raise UnexpectedValue(field=key, subfield="d", value=master_path) + + # File with url/path + url = value.get("u", "").strip() + if url: + if url.startswith("/"): + file["path"] = url # Relative path + elif url.startswith("https://lecturemedia.cern.ch"): + file["url"] = url + file["path"] = url.replace("https://lecturemedia.cern.ch", "") + else: + # Check if anything else stored + raise UnexpectedValue(field=key, subfield="u", value=url) + file_type = value.get("x") + if file_type: + file["type"] = file_type.strip() + + description = value.get("y") + if description: + file["description"] = description.strip() + + return file diff --git a/tests/cds-videos/__init__.py b/tests/cds-videos/__init__.py new file mode 100644 index 00000000..c001bb54 --- /dev/null +++ b/tests/cds-videos/__init__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +# +# This file is part of CDS. +# Copyright (C) 2025 CERN. +# +# cds-migrator-kit is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Migration tool kit from old CDS to new CDS-Videos - test suite.""" diff --git a/tests/cds-videos/data/lecture.json b/tests/cds-videos/data/lecture.json new file mode 100644 index 00000000..3b0c0e04 --- /dev/null +++ b/tests/cds-videos/data/lecture.json @@ -0,0 +1,26 @@ +[ + { + "files": [], + "collections": null, + "recid": 2233152, + "record": [ + { + "marcxml": "\n 2233152\n 20240626123513.0\n \n oai:cds.cern.ch:2233152\n cerncds:TALK\n \n \n eng\n \n \n CERN. 
Geneva\n \n \n 2016-10-24T10:00:00\n Glimos Instructions for CMS Underground Guiding - in english\n CERN - 513-R-068\n 588590\n 2016-10-24T12:00:00\n \n \n Glimos Instructions for CMS Underground Guiding - in english\n \n \n 2016\n \n \n 2016-10-24\n \n \n 317\n \n \n CERN\n 2016\n \n \n Streaming video\n \n \n e-learning\n \n \n 2016-10-24T10:00:00\n \n \n <!--HTML--><p>In this <strong>presentation in english</strong>, the basic safety rules for CMS underground visits are explained. The trainees are taught how to plan/organize a CMS underground visit along important safety aspects of the CMS underground (Point 5).</p>\n\n<p>Content owners and presenters (CMS safety team) :</p>\n\n<p>Niels Dupont (in french), Michael Brodski (in german), William Esposito (in english)</p>\n\n<p>A pdf document on the subject is available as material from the indico event page. (TO BE DONE from <a href=\"https://twiki.cern.ch/Edutech/CMSGlimosInstructions\">https://twiki.cern.ch/Edutech/CMSGlimosInstructions</a>!)</p>\n\n<p>&nbsp;</p>\n\n<p>Tell us what you think via e-learning.support at cern.ch More tutorials in the e-learning collection of the CERN Document Server (CDS) <a href=\"http://cds.cern.ch/collection/E-learning%20modules?ln=en\">http://cds.cern.ch/collection/E-learning%20modules?ln=en</a></p>\n\n<p>All info about the CERN rapid e-learning project is linked from <a href=\"http://twiki.cern.ch/ELearning\">http://twiki.cern.ch/ELearning</a></p>\n\n<p>&nbsp;</p>\n \n \n e-learning\n \n \n Event\n \n \n TALK\n \n \n CERN\n \n \n movingimages\n \n \n Brodski, Michael\n speaker\n Rheinisch-Westfaelische Tech. Hoch. 
(DE)\n \n \n Dupont, Niels\n speaker\n CERN\n \n \n Esposito, William\n speaker\n CERN\n \n \n https://indico.cern.ch/event/588590/\n Event details\n \n \n MediaArchive\n /mnt/master_share/master_data/2016/588590\n Absolute master path\n \n \n MediaArchive\n /2016/588590/588590_en.vtt\n subtitle\n subtitle English\n \n \n MediaArchive\n /2016/588590/588590_fr.vtt\n subtitle\n subtitle Fran\u00e7ais\n \n \n MediaArchive\n https://lecturemedia.cern.ch/2016/588590/588590-posterframe-640x360-at-5.0-percent.jpg\n pngthumbnail\n thumbnail weblecture\n \n \n MediaArchive\n https://lecturemedia.cern.ch/2016/588590/588590-1000-kbps-853x480-25-fps-audio-96-kbps-44-kHz-stereo.mp4\n video/mp4\n Content: presenter. Resolution: 853x480. Baudrate: 1000\n \n \n MediaArchive\n https://lecturemedia.cern.ch/2016/588590/588590-2000-kbps-1280x720-25-fps-audio-96-kbps-44-kHz-stereo.mp4\n video/mp4\n Content: presenter. Resolution: 1280x720. Baudrate: 2000\n \n \n MediaArchive\n https://lecturemedia.cern.ch/2016/588590/588590-4000-kbps-1920x1080-25-fps-audio-96-kbps-44-kHz-stereo.mp4\n video/mp4\n Content: presenter. Resolution: 1920x1080. Baudrate: 4000\n \n \n MediaArchive\n https://lecturemedia.cern.ch/2016/588590/588590-512-kbps-426x240-25-fps-audio-96-kbps-44-kHz-stereo.mp4\n video/mp4\n Content: presenter. Resolution: 426x240. Baudrate: 512\n \n \n MediaArchive\n https://lecturemedia.cern.ch/2016/588590/588590-800-kbps-640x360-25-fps-audio-96-kbps-44-kHz-stereo.mp4\n video/mp4\n Content: presenter. Resolution: 640x360. Baudrate: 800\n \n \n maria.dimou@cern.ch\n \n \n Brodski, Michael\n Rheinisch-Westfaelische Tech. Hoch. 
(DE)\n \n \n Dupont, Niels\n CERN\n \n \n Esposito, William\n CERN\n \n \n 2016-11-14T14:42:36\n 2016-11-14T14:41:23\n \n \n PUBLIC\n \n \n INDICO.588590\n \n \n Indico\n e-learning\n \n", + "json": null, + "modification_datetime": "2024-06-26T10:35:13+00:00" + } + ] + }, + { + "files": [], + "collections": null, + "recid": 489562, + "record": [ + { + "marcxml": "\n 489562\n SzGeCERN\n 20240626120523.0\n \n oai:cds.cern.ch:489562\n forSciTalks\n cerncds:TALK\n cerncds:FULLTEXT\n DOI\n cerncds:TALK:FULLTEXT\n INIS\n \n \n 2243887CERCER\n \n \n AgendaMaker\n a032617\n \n \n Indico\n a032617\n \n \n Indico\n a032618\n \n \n Indico\n a032619\n \n \n Indico\n a032620\n \n \n Indico\n a032621\n \n \n eng\n \n \n Issues in arms control\n \n \n Geneva\n CERN\n 2001\n \n \n 720x576 4/3, 25\n 4334\n \n \n paper\n \n \n CERN Academic Training Lecture\n 392\n \n \n Regular Lecture Programme\n \n \n CERN, Geneva, 12 - 16 Feb 2001\n \n \n Report\n CC-BY-3.0\n \n \n Report\n CERN\n 2001\n \n \n OA\n \n \n SzGeCERN\n Commerce, Economics, Social Science\n \n \n ACAD\n \n \n CERN\n \n \n movingimages\n \n \n Calogero, Francesco\n speaker\n \n \n TH\n \n \n CERN Central Library\n Acad. Train. 
392\n \n \n http://documents.cern.ch/cgi-bin/setlink?base=cernrep&categ=Yellow_Report&id=2001-004\n Fulltext\n \n \n EOS\n MP4\n https://cern.ch/digital-memory/media-archive/video/open/mp4/ac_C1V1_CM-A00000339-058.mp4\n Video-CM-A00000339-058\n mp4 video format (640x360)\n (PICTURAE:MD5)CM-A00000339-058.mp4;173e693fc8fc12a92eafb02377803058\n Recovered and reviewed by Tom Barthelemy in Novembre 2020\n \n \n EOS\n MP4\n https://cern.ch/digital-memory/media-archive/video/open/mp4/ac_C1V1_CM-A00000339-059.mp4\n Video-CM-A00000339-059\n mp4 video format (640x360)\n (PICTURAE:MD5)CM-A00000339-059.mp4;9fd9d955dd6da0230cb7216874e48716\n Recovered and reviewed by Tom Barthelemy in Novembre 2020\n \n \n EOS\n MP4\n https://cern.ch/digital-memory/media-archive/video/open/mp4/ac_C1V1_CM-A00000339-065.mp4\n Video-CM-A00000339-065\n mp4 video format (640x360)\n (PICTURAE:MD5)CM-A00000339-065.mp4;07ed118d45df8808a4e425db525e618f\n Recovered and reviewed by Tom Barthelemy in Novembre 2020\n \n \n MediaArchive\n /mnt/master_share/master_data/2001/412092\n Absolute master path\n \n \n MediaArchive\n /2001/412092/412092_en.vtt\n subtitle\n subtitle English\n \n \n MediaArchive\n /2001/412092/412092_fr.vtt\n subtitle\n subtitle Français\n \n \n MediaArchive\n https://lecturemedia.cern.ch/2001/412092/real/slides/img006.JPG\n pngthumbnail\n thumbnail weblecture\n \n \n MediaArchive\n /mnt/master_share/master_data/2001/412093\n Absolute master path\n \n \n MediaArchive\n /2001/412093/412093_en.vtt\n subtitle\n subtitle English\n \n \n MediaArchive\n /2001/412093/412093_fr.vtt\n subtitle\n subtitle Français\n \n \n MediaArchive\n https://lecturemedia.cern.ch/2001/412093/real/slides/img013.JPG\n pngthumbnail\n thumbnail weblecture\n \n \n MediaArchive\n /mnt/master_share/master_data/2001/412094\n Absolute master path\n \n \n MediaArchive\n /2001/412094/412094_en.vtt\n subtitle\n subtitle English\n \n \n MediaArchive\n /2001/412094/412094_fr.vtt\n subtitle\n subtitle Français\n \n \n 
MediaArchive\n https://lecturemedia.cern.ch/2001/412094/real/slides/img001.JPG\n pngthumbnail\n thumbnail weblecture\n \n \n MediaArchive\n https://lecturemedia.cern.ch/2001/412092/master.mp4\n video/mp4\n Content: presenter. Resolution: 640x480. Baudrate: 210000\n \n \n MediaArchive\n https://lecturemedia.cern.ch/2001/412093/master.mp4\n video/mp4\n Content: presenter. Resolution: 160x120. Baudrate: 210000\n \n \n MediaArchive\n https://lecturemedia.cern.ch/2001/412094/master.mp4\n video/mp4\n Content: presenter. Resolution: 160x120. Baudrate: 210000\n \n \n n\n 200137\n a2001\n \n \n 20080429\n 2023\n CER01\n 20030512\n \n \n 515422\n cern20000901\n \n \n PUBLIC\n \n \n 0002\n \n \n 002243887CER\n \n \n Indico\n ACAD\n \n", + "json": null, + "modification_datetime": "2024-06-26T10:05:23+00:00" + } + ] + } +] diff --git a/tests/cds-videos/test_videos_transform_rules.py b/tests/cds-videos/test_videos_transform_rules.py new file mode 100644 index 00000000..42d43983 --- /dev/null +++ b/tests/cds-videos/test_videos_transform_rules.py @@ -0,0 +1,316 @@ +# -*- coding: utf-8 -*- +# +# This file is part of CDS. +# Copyright (C) 2025 CERN. +# +# cds-migrator-kit is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. 
+ +"""CDS-Videos migration tests.""" + +from os.path import dirname, join + +import pytest + +from cds_migrator_kit.errors import ( + MissingRequiredField, + UnexpectedValue, +) +from cds_migrator_kit.transform.dumper import CDSRecordDump +from cds_migrator_kit.videos.weblecture_migration.transform import ( + videos_migrator_marc21, +) +from cds_migrator_kit.videos.weblecture_migration.transform.transform import ( + CDSToVideosRecordEntry, +) +from tests.helpers import add_tag_to_marcxml, load_json, remove_tag_from_marcxml + + +@pytest.fixture() +def datadir(): + """Get data directory.""" + return join(dirname(__file__), "data") + + +def test_transform_rules_reqired_metadata(datadir, base_app): + """Test migration rules.""" + with base_app.app_context(): + data = load_json(datadir, "lecture.json") + dump = CDSRecordDump(data=data[0], dojson_model=videos_migrator_marc21) + dump.prepare_revisions() + created_date, res = dump.latest_revision + + assert res["legacy_recid"] == 2233152 + assert res["recid"] == "2233152" + assert res["language"] == "en" + assert res["contributors"] == [ + { + "name": "Brodski, Michael", + "role": "Speaker", + "affiliations": ["Rheinisch-Westfaelische Tech. Hoch. (DE)"], + }, + {"name": "Dupont, Niels", "role": "Speaker", "affiliations": ["CERN"]}, + {"name": "Esposito, William", "role": "Speaker", "affiliations": ["CERN"]}, + { + "name": "Brodski, Michael", + "role": "Speaker", + "affiliations": ["Rheinisch-Westfaelische Tech. Hoch. (DE)"], + }, + {"name": "Dupont, Niels", "role": "Speaker", "affiliations": ["CERN"]}, + {"name": "Esposito, William", "role": "Speaker", "affiliations": ["CERN"]}, + ] + assert res["title"] == { + "title": "Glimos Instructions for CMS Underground Guiding - in english" + } + assert "2016-10-24" in res["date"] + assert res["description"].startswith( + "

In this presentation in english" + ) + + +def test_transform_required_metadata(datadir, base_app): + """Test migration transform.""" + with base_app.app_context(): + data = load_json(datadir, "lecture.json") + dump = CDSRecordDump(data=data[0], dojson_model=videos_migrator_marc21) + dump.prepare_revisions() + created_date, res = dump.latest_revision + + # Transform record + record_entry = CDSToVideosRecordEntry() + metadata = record_entry._metadata(res) + assert metadata["title"] == { + "title": "Glimos Instructions for CMS Underground Guiding - in english" + } + assert metadata["date"] == "2016-10-24" + # It should be same with the title + assert metadata["description"].startswith( + "

In this presentation in english" + ) + assert metadata["contributors"] == [ + { + "name": "Brodski, Michael", + "role": "Speaker", + "affiliations": ["Rheinisch-Westfaelische Tech. Hoch. (DE)"], + }, + {"name": "Dupont, Niels", "role": "Speaker", "affiliations": ["CERN"]}, + {"name": "Esposito, William", "role": "Speaker", "affiliations": ["CERN"]}, + ] + assert metadata["language"] == "en" + + +def test_transform_description(datadir, base_app): + """Test that the description field `520` is correctly transformed.""" + with base_app.app_context(): + # Load test data + data = load_json(datadir, "lecture.json") + + # Remove the 520 tag (description) from MARCXML + modified_data = data[0] + modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml( + modified_data["record"][-1]["marcxml"], "520" + ) + + dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21) + dump.prepare_revisions() + _, res = dump.latest_revision + + # Ensure json_converted_record don't have the description + assert "description" not in res + + # Transform record + record_entry = CDSToVideosRecordEntry() + metadata = record_entry._metadata(res) + + # Ensure description exists and matches the title + assert metadata["description"] == metadata["title"]["title"] + + +def test_transform_date(datadir, base_app): + """Test that the date field is correctly transformed.""" + with base_app.app_context(): + # Load test data + data = load_json(datadir, "lecture.json") + + # Test case: Fail due to multiple dates + modified_data = data[0] + modified_data["record"][-1]["marcxml"] = add_tag_to_marcxml( + modified_data["record"][-1]["marcxml"], "518", {"d": "2025-02-06"} + ) + dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21) + dump.prepare_revisions() + _, res = dump.latest_revision + + # Transform record + record_entry = CDSToVideosRecordEntry() + with pytest.raises(UnexpectedValue): + record_entry._metadata(res) + + # Test case: Fail due to missing dates + 
modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml( + modified_data["record"][-1]["marcxml"], "518" + ) + modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml( + modified_data["record"][-1]["marcxml"], "269" + ) + + dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21) + dump.prepare_revisions() + _, res = dump.latest_revision + + # Transform record + with pytest.raises(MissingRequiredField): + record_entry._metadata(res) + + +def test_transform_contributor(datadir, base_app): + """Test that the date field is correctly transformed.""" + with base_app.app_context(): + # Load test data + data = load_json(datadir, "lecture.json") + + # Test case: Fail due to missing contributor + modified_data = data[0] + modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml( + modified_data["record"][-1]["marcxml"], "700" + ) + modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml( + modified_data["record"][-1]["marcxml"], "906" + ) + + dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21) + dump.prepare_revisions() + _, res = dump.latest_revision + + # Transform record it should fail (no contributor) + record_entry = CDSToVideosRecordEntry() + with pytest.raises(MissingRequiredField): + record_entry._metadata(res) + + +def test_transform_digitized(datadir, base_app): + """Test digitized field is correctly transformed.""" + with base_app.app_context(): + # Load test data + data = load_json(datadir, "lecture.json") + + # Get digitized record and apply rules + entry_data = data[1] + dump = CDSRecordDump(data=entry_data, dojson_model=videos_migrator_marc21) + dump.prepare_revisions() + _, res = dump.latest_revision + + digitized = [ + item["digitized"] for item in res["url_files"] if "digitized" in item + ] + + # Check length + assert len(digitized) == 3, f"Expected 3 digitized items, got {len(digitized)}" + + # Check all URLs contain "digital-memory" + for item in digitized: + assert ( + 
"digital-memory" in item["url"] + ), f"URL {item['url']} does not contain 'digital-memory'" + + # Transform record it should fail (no valid date, it has date range) + record_entry = CDSToVideosRecordEntry() + with pytest.raises(MissingRequiredField): + record_entry._metadata(res) + + +def test_transform_files(datadir, base_app): + """Test files field is correctly transformed.""" + with base_app.app_context(): + # Load test data + data = load_json(datadir, "lecture.json") + + # Get record and apply rules + entry_data = data[1] + dump = CDSRecordDump(data=entry_data, dojson_model=videos_migrator_marc21) + dump.prepare_revisions() + _, res = dump.latest_revision + + # Test master paths + master_paths = [ + item["master_path"] for item in res["files"] if "master_path" in item + ] + assert ( + len(master_paths) == 3 + ), f"Expected 3 master_path items, got {len(master_paths)}" + for path in master_paths: + assert ( + "/mnt/master_share" in path + ), f"Path {path} does not contain '/mnt/master_share'" + + # Test file paths (excluding URLs) + file_paths = [ + item["path"] + for item in res["files"] + if "path" in item and "url" not in item + ] + assert ( + len(file_paths) == 6 + ), f"Expected 6 only path items, got {len(file_paths)}" + for path in file_paths: + assert path.startswith("/"), f"Path {path} does not start with '/'" + + # Test URL files + url_files = [item for item in res["files"] if "url" in item] + assert len(url_files) == 6, f"Expected 6 URL file items, got {len(url_files)}" + for url_file in url_files: + assert "url" in url_file, f"Missing 'url' key in item: {url_file}" + assert "path" in url_file, f"Missing 'path' key in item: {url_file}" + assert ( + "lecturemedia" in url_file["url"] + ), f"URL {url_file['url']} does not contain 'lecturemedia'" + + +def test_transform_internal_note(datadir, base_app): + """Test digitized field is correctly transformed.""" + with base_app.app_context(): + # Load test data + data = load_json(datadir, "lecture.json") + + # 
Get record and apply rules + entry_data = data[1] + dump = CDSRecordDump(data=entry_data, dojson_model=videos_migrator_marc21) + dump.prepare_revisions() + _, res = dump.latest_revision + + # Record has one internal note + assert "internal_notes" in res + notes = [item for item in res["internal_notes"]] + assert notes + assert "date" not in notes[0] # note includes date but it's not valid + + # Transform record it should fail (no valid date, it has date range) + record_entry = CDSToVideosRecordEntry() + with pytest.raises(MissingRequiredField): + record_entry._metadata(res) + + # Test case: Add internal note which has a valid date to record + modified_data = data[1] + # Remove the current internal note + modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml( + modified_data["record"][-1]["marcxml"], "500" + ) + # Add new internal note with a valid date + modified_data["record"][-1]["marcxml"] = add_tag_to_marcxml( + modified_data["record"][-1]["marcxml"], "500", {"a": "Note, 16 Feb 2001"} + ) + dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21) + dump.prepare_revisions() + _, res = dump.latest_revision + + # Record has one internal note + assert "internal_notes" in res + notes = [item for item in res["internal_notes"]] + assert notes + assert "date" in notes[0] # note has a valid date + + # Transform record without failure (it has a valid date) + record_entry = CDSToVideosRecordEntry() + metadata = record_entry._metadata(res) + assert "date" in metadata + assert "2001-02-16" == metadata["date"] diff --git a/tests/helpers.py b/tests/helpers.py index b24b67aa..f40f45f8 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -9,6 +9,7 @@ """Helper functions for usage in tests.""" import json +import xml.etree.ElementTree as ET from os.path import join @@ -19,3 +20,45 @@ def load_json(datadir, filename): with open(filepath, "r") as file_: data = json.load(file_) return data + + +def remove_tag_from_marcxml(marcxml, tag): + """ + 
Removes a specific MARCXML datafield tag to manipulate the record. + + :param marcxml: The MARCXML string. + :param tag: The MARC tag (e.g., "520") to remove. + :return: Modified MARCXML string with the specified tag removed. + """ + root = ET.fromstring(marcxml) + + # Find and remove all elements with the specified tag + for datafield in root.findall(f".//datafield[@tag='{tag}']"): + root.remove(datafield) + + return ET.tostring(root, encoding="unicode") + + +def add_tag_to_marcxml(marcxml, tag, subfields): + """ + Adds a MARCXML datafield tag to manipulate the record. + + :param marcxml: The MARCXML string. + :param tag: The MARC tag (e.g., tag="999", ind1=" ", ind2=" ") to add. + :param subfields: Dictionary of subfields (e.g., {"a": "New Description"}). + :return: Modified MARCXML string with the new tag added. + """ + root = ET.fromstring(marcxml) + + # Create new datafield element + new_datafield = ET.Element("datafield", tag=tag, ind1=" ", ind2=" ") + + # Add subfields + for code, value in subfields.items(): + subfield = ET.SubElement(new_datafield, "subfield", code=code) + subfield.text = value + + # Append the new datafield + root.append(new_datafield) + + return ET.tostring(root, encoding="unicode") From 86825b2136a8d00143de81cb296d729757aff573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Wed, 12 Feb 2025 17:03:54 +0100 Subject: [PATCH 2/4] setup: separate installation for rdm and videos --- .github/workflows/tests.yml | 63 ++++++++++++---- README.rst | 79 ++++++++++++++++++--- cds_migrator_kit/base_config.py | 26 +++++++ cds_migrator_kit/base_minter.py | 25 +++++++ cds_migrator_kit/cli.py | 28 ++++++++ cds_migrator_kit/import_utils.py | 18 +++++ cds_migrator_kit/rdm/migration_config.py | 3 - cds_migrator_kit/videos/migration_config.py | 49 +++++++++++++ run-tests.sh | 6 ++ setup.cfg | 21 +++--- 10 files changed, 284 insertions(+), 34 deletions(-) create mode 100644 cds_migrator_kit/base_config.py create mode 100644 
cds_migrator_kit/base_minter.py create mode 100644 cds_migrator_kit/cli.py create mode 100644 cds_migrator_kit/import_utils.py create mode 100644 cds_migrator_kit/videos/migration_config.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9f8d286d..2590f78d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -14,34 +14,34 @@ on: branches: master schedule: # * is a special character in YAML so you have to quote this string - - cron: '0 3 * * 6' + - cron: "0 3 * * 6" workflow_dispatch: inputs: reason: - description: 'Reason' + description: "Reason" required: false - default: 'Manual trigger' + default: "Manual trigger" jobs: - Tests: + RDMTests: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [3.9] - requirements-level: [pypi] - db-service: [postgresql14] - include: + python-version: [3.9] + requirements-level: [pypi] + db-service: [postgresql14] + include: - db-service: postgresql14 DB_EXTRAS: "postgresql" env: DB: ${{ matrix.db-service }} - EXTRAS: tests + EXTRAS: rdm,tests steps: - name: Install python-ldap dependencies run: | - sudo apt-get update - sudo apt-get install libsasl2-dev python-dev libldap2-dev libssl-dev + sudo apt-get update + sudo apt-get install libsasl2-dev python-dev libldap2-dev libssl-dev - name: Checkout uses: actions/checkout@v4 @@ -61,4 +61,43 @@ jobs: docker compose --version - name: Run tests - run: ./run-tests.sh + run: ./run-tests.sh rdm + VideosTests: + runs-on: ubuntu-20.04 + strategy: + matrix: + python-version: [3.9] + requirements-level: [pypi] + db-service: [postgresql14] + include: + - db-service: postgresql14 + DB_EXTRAS: "postgresql" + + env: + DB: ${{ matrix.db-service }} + EXTRAS: videos,tests + steps: + - name: Install python-ldap dependencies + run: | + sudo apt-get update + sudo apt-get install libsasl2-dev python-dev libldap2-dev libssl-dev + + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: 
actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + cache-dependency-path: setup.cfg + + - name: Install dependencies + run: | + pip install ".[$EXTRAS]" + pip freeze + docker --version + docker compose --version + + - name: Run tests + run: ./run-tests.sh videos diff --git a/README.rst b/README.rst index 4305c175..504b2479 100644 --- a/README.rst +++ b/README.rst @@ -7,22 +7,81 @@ cds-migrator-kit ================== +Installation +============ -TODO change here: +Default Installation (without RDM or Videos) +--------------------------------------------- +To install the package without RDM or videos, run: +.. code-block:: bash -Default Installation (without RDM or Videos) -pip install . + pip install . + +Installation for RDM +---------------------- +To install the package with RDM, run: + +.. code-block:: bash + + pip install ".[rdm]" + +To see available RDM commands, run: + +.. code-block:: bash + + invenio migration --help + +Installation for Videos +----------------------- +To install the package with cds-videos, run: + +.. code-block:: bash + + pip install ".[videos]" + +To see available videos commands, run: + +.. code-block:: bash -Install for RDM + invenio migration videos --help -pip install .[rdm] +Running Tests Locally +===================== -Install for Videos +For RDM +-------- +Install rdm and test dependencies: + +.. code-block:: bash + + pip install ".[rdm,tests]" + + +Run the tests with ignoring `cds-videos` tests: + +.. code-block:: bash + + ./run-tests.sh rdm + +For Videos +---------- +Install videos and test dependencies: + +.. code-block:: bash + + pip install ".[videos,tests]" + +Run the video tests: + +.. code-block:: bash + + ./run-tests.sh videos -pip install .[videos] To run the interface: -``` -gunicorn -b :8080 --timeout 120 --graceful-timeout 60 cds_migrator_kit.app:app -``` +===================== +.. 
code-block:: bash + + gunicorn -b :8080 --timeout 120 --graceful-timeout 60 cds_migrator_kit.app:app + diff --git a/cds_migrator_kit/base_config.py b/cds_migrator_kit/base_config.py new file mode 100644 index 00000000..2e7f4e7e --- /dev/null +++ b/cds_migrator_kit/base_config.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2025 CERN. +# +# cds-migrator-kit is free software; you can redistribute it and/or modify it under +# the terms of the MIT License; see LICENSE file for more details. +"""Migration configuration for CDS Migrator Kit.""" + +from cds_migrator_kit.import_utils import import_module + +selected_config = None + +# Check for `rdm` dependencies +if import_module("cds_rdm.__init__"): + from cds_migrator_kit.rdm import migration_config as selected_config + +# Check for `videos` dependencies +elif import_module("cds.version"): + from cds_migrator_kit.videos import migration_config as selected_config + +# If no valid module is found, use default one +if selected_config is None: + from cds_migrator_kit import config as selected_config + +# Set the selected config module +globals().update(vars(selected_config)) diff --git a/cds_migrator_kit/base_minter.py b/cds_migrator_kit/base_minter.py new file mode 100644 index 00000000..0b18fc33 --- /dev/null +++ b/cds_migrator_kit/base_minter.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2025 CERN. +# +# cds-migrator-kit is free software; you can redistribute it and/or modify it under +# the terms of the MIT License; see LICENSE file for more details. +"""Minter configuration for CDS Migrator Kit.""" + +import importlib +import warnings + +# Default: No minter +selected_minter = None + +# Check if `rdm` is installed and set the minter +try: + importlib.import_module("cds_rdm.__init__") + from cds_rdm.minters import legacy_recid_minter as selected_minter +except ImportError: + warnings.warn( + "No valid PID minter found. 
Ensure `rdm` is installed.", RuntimeWarning + ) + +# Expose the minter function +legacy = selected_minter diff --git a/cds_migrator_kit/cli.py b/cds_migrator_kit/cli.py new file mode 100644 index 00000000..0be7dc8d --- /dev/null +++ b/cds_migrator_kit/cli.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2025 CERN. +# +# cds-migrator-kit is free software; you can redistribute it and/or modify it under +# the terms of the MIT License; see LICENSE file for more details. +"""cds-migrator-kit command line module.""" + +import click + +from cds_migrator_kit.import_utils import import_module + + +@click.group() +def cli(): + """Base CLI command that loads the subcommands.""" + pass + + +# Check for `rdm` dependencies +if import_module("cds_rdm.__init__"): + from cds_migrator_kit.rdm.cli import migration + cli = migration + +# Check for `videos` dependencies +if import_module("cds.version"): + from cds_migrator_kit.videos.weblecture_migration.cli import videos + cli.add_command(videos, "videos") diff --git a/cds_migrator_kit/import_utils.py b/cds_migrator_kit/import_utils.py new file mode 100644 index 00000000..1ec5e870 --- /dev/null +++ b/cds_migrator_kit/import_utils.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2025 CERN. +# +# cds-migrator-kit is free software; you can redistribute it and/or modify it under +# the terms of the MIT License; see LICENSE file for more details. 
+"""Utility function for dynamically checking module availability.""" + +import importlib + + +def import_module(module_name): + """Try to import a module, return True if successful, otherwise False.""" + try: + importlib.import_module(module_name) + return True + except ImportError: + return False \ No newline at end of file diff --git a/cds_migrator_kit/rdm/migration_config.py b/cds_migrator_kit/rdm/migration_config.py index 240d7bbb..e896235a 100644 --- a/cds_migrator_kit/rdm/migration_config.py +++ b/cds_migrator_kit/rdm/migration_config.py @@ -368,9 +368,6 @@ def _(x): # needed to avoid start time failure with lazy strings CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir CDS_MIGRATOR_KIT_STREAM_CONFIG = "cds_migrator_kit/rdm/streams.yaml" -CDS_MIGRATOR_KIT_VIDEOS_STREAM_CONFIG = ( - "cds_migrator_kit/videos/weblecture_migration/streams.yaml" -) RDM_RECORDS_IDENTIFIERS_SCHEMES = { **RDM_RECORDS_IDENTIFIERS_SCHEMES, diff --git a/cds_migrator_kit/videos/migration_config.py b/cds_migrator_kit/videos/migration_config.py new file mode 100644 index 00000000..b4e09447 --- /dev/null +++ b/cds_migrator_kit/videos/migration_config.py @@ -0,0 +1,49 @@ +"""CDS-Videos settings for CDS-Videos project.""" + +import json +import os +from datetime import datetime, timedelta + + +def _(x): # needed to avoid start time failure with lazy strings + return x + + +# Since HAProxy and Nginx route all requests no matter the host header +# provided, the allowed hosts variable is set to localhost. In production it +# should be set to the correct host and it is strongly recommended to only +# route correct hosts to the application. +APP_ALLOWED_HOSTS = ["0.0.0.0", "localhost", "127.0.0.1", "localhost.cern.ch"] + +SQLALCHEMY_DATABASE_URI = ( + "postgresql+psycopg2://cds-videos:cds-videos@localhost/cds-videos" +) + +# SECURITY WARNING: keep the secret key used in production secret! +# Do not commit it to a source code repository. 
+# TODO: Set +SECRET_KEY = "CHANGE_ME" + +# TODO: Set with your own hostname when deploying to production +SITE_UI_URL = "https://127.0.0.1" + +SITE_API_URL = "https://127.0.0.1/api" + + +DATACITE_ENABLED = True +DATACITE_USERNAME = "" +DATACITE_PASSWORD = "" +DATACITE_PREFIX = "10.17181" +DATACITE_TEST_MODE = True +DATACITE_DATACENTER_SYMBOL = "" + +import cds_migrator_kit + +base_path = os.path.dirname(os.path.realpath(cds_migrator_kit.__file__)) +logs_dir = os.path.join(base_path, "tmp/logs/") +CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir +CDS_MIGRATOR_KIT_VIDEOS_STREAM_CONFIG = ( + "cds_migrator_kit/videos/weblecture_migration/streams.yaml" +) + +### CDS MIGRATOR ################################# diff --git a/run-tests.sh b/run-tests.sh index 02b13eb7..fa5ae42f 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -39,6 +39,12 @@ for arg in $@; do -K|--keep-services) keep_services=1 ;; + rdm) + pytest_args+=( "tests/cds-rdm" "tests/test_cds_migrator_kit.py" ) + ;; + videos) + pytest_args+=( "tests/cds-videos" "tests/test_cds_migrator_kit.py" ) + ;; *) pytest_args+=( ${arg} ) ;; diff --git a/setup.cfg b/setup.cfg index dcc1562d..046a50c7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,9 +15,7 @@ packages = find: python_requires = >=3.9 zip_safe = False install_requires = - invenio-app-rdm[opensearch2]>=13.0.0b1.dev4 sentry-sdk>=1.45,<2.0.0 - cds-rdm @ git+https://github.com/CERNDocumentServer/cds-rdm#egg=cds-rdm&subdirectory=site cds-dojson>=0.12.0 invenio-rdm-migrator>=5.0.0 lxml>=4.6.5 @@ -28,14 +26,20 @@ install_requires = flask-mail>=0.9.0,<0.10.0 fuzzywuzzy>=0.18.0 python-Levenshtein>=0.25.1 - invenio-preservation-sync==0.1.0 - invenio-cern-sync @ git+https://github.com/cerndocumentserver/invenio-cern-sync@v0.1.2#egg=invenio-cern-sync - # needed to run the server gunicorn [options.extras_require] +rdm = + invenio-app-rdm[opensearch2]>=13.0.0b1.dev4 + cds-rdm @ git+https://github.com/CERNDocumentServer/cds-rdm#egg=cds-rdm&subdirectory=site + 
invenio-preservation-sync==0.1.0 + invenio-cern-sync @ git+https://github.com/cerndocumentserver/invenio-cern-sync@v0.1.2#egg=invenio-cern-sync + +videos = + cds @ git+https://github.com/CERNDocumentServer/cds-videos#egg=cds + tests = pytest-black>=0.3.0 pytest-invenio>=2.1.0,<3.0.0 @@ -46,12 +50,11 @@ tests = console_scripts = migrator = invenio_app.cli:cli flask.commands = - migration = cds_migrator_kit.rdm.cli:migration - videos = cds_migrator_kit.videos.weblecture_migration.cli:videos + migration = cds_migrator_kit.cli:cli invenio_base.apps = cds_migrator_kit = cds_migrator_kit:CdsMigratorKit invenio_config.module = - invenio_app_rdm = cds_migrator_kit.rdm.migration_config + invenio_app_rdm = cds_migrator_kit.base_config invenio_base.blueprints = cds_migrator_kit_views = cds_migrator_kit.reports.views:blueprint cds_migrator_kit.migrator.affiliations.model = @@ -81,7 +84,7 @@ cds_migrator_kit.migrator.rules.thesis = cds_migrator_kit.migrator.rules.people = people = cds_migrator_kit.rdm.users.transform.xml_processing.rules.people invenio_pidstore.minters = - legacy = cds_rdm.minters:legacy_recid_minter + legacy = cds_migrator_kit.base_minter:legacy # videos migration cds_migrator_kit.videos.models = video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.models.video_lecture:model From e3e77effe21c40227dc88b910557beb0c22e07f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Mon, 17 Feb 2025 17:48:14 +0100 Subject: [PATCH 3/4] videos: improve transform rules --- .../transform/transform.py | 43 ++++++++++++------- .../xml_processing/quality/contributors.py | 14 +++--- .../xml_processing/rules/video_lecture.py | 22 ++++------ 3 files changed, 45 insertions(+), 34 deletions(-) diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py index dfbee8ad..947aaae5 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py 
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py @@ -96,43 +96,56 @@ def _media_files(self, entry): def _metadata(self, entry): """Transform the metadata of a record.""" - def extract_dates(json_data, key, subkey=None): - """Extracts date values from a given key in json_data.""" + def guess_dates(json_data, key, subkey=None): + """Try to get `date` from other fields. + + ### Examples: + 1. **8564 tag**: may include digitized file information, indico information (link, date) or any url file + json_data = {"url_files": [{"indico": {"url": "http://agenda.cern.ch/..", "date": "2002-03-18"}}], ...} + Calling the method with `key="url_files", subkey="indico"` + Returns all the possible: + json_data["url_files"]["indico"]["date"] + + 2. **500__ tag**: internal notes that may contain date information + json_data = {"internal_notes": [{"note": "note, 1 Jun 2025", "date": "2025-06-01"}], ...} + Calling the method with `key="internal_notes"` + Returns all the possible: + json_data["internal_notes"]["date"] + + ### Returns: + - `set[str]`: A set of date strings. + """ items = json_data.get(key, []) if subkey: return { item[subkey]["date"] for item in items - if isinstance(item, dict) - and subkey in item - and isinstance(item[subkey], dict) + if subkey in item and "date" in item[subkey] } return { item["date"] for item in items - if isinstance(item, dict) and "date" in item + if "date" in item } def reformat_date(json_data): """Reformat the date for the cds-videos data model.""" - # 1. Check primary date field + # Check primary date field dates_set = {date for date in json_data.get("date", []) if date} - # 2. If no date found, check `indico_links` - if not dates_set: - dates_set = extract_dates(json_data, "url_files", subkey="indico") - - # 3. 
If still no date found, check `internal_notes` + # If no date found, check `indico_links` and `internal_notes` if not dates_set: - dates_set = extract_dates(json_data, "internal_notes") + indico_dates = guess_dates(json_data, "url_files", subkey="indico") + note_dates = guess_dates(json_data, "internal_notes") + dates_set.update(indico_dates, note_dates) - # 4. Return the valid date if only one is found + # Return the valid date if only one is found if len(dates_set) == 1: return next(iter(dates_set)) - # 5. Multiple dates (Must have different indico event videos?) + # Multiple dates (Must have different indico event videos?) if len(dates_set) > 1: raise UnexpectedValue( f"More than one date found in record: {json_data.get('recid')} dates: {dates_set}.", diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py index f9fcd76a..cfcf7156 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py @@ -38,7 +38,7 @@ def get_contributor_role(subfield, role, raise_unexpected=False): return translations[clean_role] -def get_contributor(key, value): +def get_contributor(key, value, contributor_role="", name=""): """Create contributor json for tag 518 and 269.""" beard = value.get("9") if beard is not None and beard != "#BEARD#": @@ -46,13 +46,15 @@ def get_contributor(key, value): # historically it was some kind of automatic script tagging # and it should be ignored if value == #BEARD# raise UnexpectedValue(field=key, subfield="9", value=beard) - name = value.get("a").strip() + if not name: + name = value.get("a").strip() affiliation = value.get("u", "") - role = value.get("e", "") contributor = {"name": name} - if role: - role = get_contributor_role("e", role) - 
contributor.update({"role": role}) if affiliation: contributor.update({"affiliations": [affiliation]}) + if contributor_role: + contributor.update({"role": contributor_role}) + elif value.get("e", ""): + role = get_contributor_role("e", value.get("e", "")) + contributor.update({"role": role}) return contributor diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py index dd807548..febf9dc8 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py @@ -26,6 +26,9 @@ for_each_value, require, ) +from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.contributors import ( + get_contributor, +) # ATTENTION when COPYING! important which model you use as decorator from ...models.video_lecture import model @@ -92,14 +95,12 @@ def imprint(self, key, value): @for_each_value @require(["a"]) def performer(self, key, value): - """Translates performer.""" - name = value.get("a").strip() + """Translates performer/Participant.""" role = value.get("e") - contributor = {"name": name, "role": "Performer"} # TODO or "Participant" - affiliation = value.get("u", "") - if affiliation: - contributor.update({"affiliations": [affiliation]}) - return contributor + if role and role.strip().lower() != "speaker": + # checking if anything else stored in this field + raise UnexpectedValue("Different role found", field=key, subfield="e", value=role) + return get_contributor(key, value, contributor_role="Performer") @model.over("contributors", "^906__") @@ -107,12 +108,7 @@ def performer(self, key, value): @require(["p"]) def event_speakers(self, key, value): """Translates event_speakers.""" - name = value.get("p").strip() - contributor = {"name": name, "role": "Speaker"} - 
affiliation = value.get("u", "") - if affiliation: - contributor.update({"affiliations": [affiliation]}) - return contributor + return get_contributor(key, value, contributor_role="Speaker", name=value.get("p").strip()) @model.over("url_files", "^8564_") From 21e0ba24108aaef671304297bc8b6fe3564971e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Mon, 17 Feb 2025 17:51:36 +0100 Subject: [PATCH 4/4] global: runner module, separated tests for rdm/videos --- cds_migrator_kit/base_minter.py | 8 ++++---- cds_migrator_kit/extract/__init__.py | 2 +- cds_migrator_kit/rdm/cli.py | 2 +- cds_migrator_kit/runner/__init__.py | 8 ++++++++ cds_migrator_kit/{rdm => runner}/runner.py | 4 ++-- cds_migrator_kit/videos/weblecture_migration/cli.py | 2 +- tests/{ => cds-rdm}/data/all_fields.json | 0 tests/{ => cds-rdm}/data/summer_note.json | 0 tests/cds-rdm/test_full_migration.py | 4 ++-- tests/{ => cds-rdm}/test_json_translation_rules.py | 0 tests/{ => cds-videos}/conftest.py | 0 11 files changed, 19 insertions(+), 11 deletions(-) create mode 100644 cds_migrator_kit/runner/__init__.py rename cds_migrator_kit/{rdm => runner}/runner.py (97%) rename tests/{ => cds-rdm}/data/all_fields.json (100%) rename tests/{ => cds-rdm}/data/summer_note.json (100%) rename tests/{ => cds-rdm}/test_json_translation_rules.py (100%) rename tests/{ => cds-videos}/conftest.py (100%) diff --git a/cds_migrator_kit/base_minter.py b/cds_migrator_kit/base_minter.py index 0b18fc33..49ee1c3f 100644 --- a/cds_migrator_kit/base_minter.py +++ b/cds_migrator_kit/base_minter.py @@ -6,17 +6,17 @@ # the terms of the MIT License; see LICENSE file for more details. 
"""Minter configuration for CDS Migrator Kit.""" -import importlib import warnings +from cds_migrator_kit.import_utils import import_module + # Default: No minter selected_minter = None # Check if `rdm` is installed and set the minter -try: - importlib.import_module("cds_rdm.__init__") +if import_module("cds_rdm.__init__"): from cds_rdm.minters import legacy_recid_minter as selected_minter -except ImportError: +else: warnings.warn( "No valid PID minter found. Ensure `rdm` is installed.", RuntimeWarning ) diff --git a/cds_migrator_kit/extract/__init__.py b/cds_migrator_kit/extract/__init__.py index 20f79dd5..39a24dc5 100644 --- a/cds_migrator_kit/extract/__init__.py +++ b/cds_migrator_kit/extract/__init__.py @@ -2,7 +2,7 @@ # # Copyright (C) 2025 CERN. # -# CDS-Videos is free software; you can redistribute it and/or modify it under +# cds-migrator-kit is free software; you can redistribute it and/or modify it under # the terms of the MIT License; see LICENSE file for more details. """Extract module.""" diff --git a/cds_migrator_kit/rdm/cli.py b/cds_migrator_kit/rdm/cli.py index e7548e2c..b66f0416 100644 --- a/cds_migrator_kit/rdm/cli.py +++ b/cds_migrator_kit/rdm/cli.py @@ -19,7 +19,6 @@ from cds_migrator_kit.rdm.records.streams import ( # UserStreamDefinition, RecordStreamDefinition, ) -from cds_migrator_kit.rdm.runner import Runner from cds_migrator_kit.rdm.stats.runner import RecordStatsRunner from cds_migrator_kit.rdm.stats.streams import RecordStatsStreamDefinition from cds_migrator_kit.rdm.users.runner import PeopleAuthorityRunner, SubmitterRunner @@ -30,6 +29,7 @@ from cds_migrator_kit.rdm.users.transform.xml_processing.models.people import ( PeopleAuthority, ) +from cds_migrator_kit.runner.runner import Runner cli_logger = logging.getLogger("migrator") diff --git a/cds_migrator_kit/runner/__init__.py b/cds_migrator_kit/runner/__init__.py new file mode 100644 index 00000000..e22f5897 --- /dev/null +++ b/cds_migrator_kit/runner/__init__.py @@ -0,0 +1,8 @@ +# 
-*- coding: utf-8 -*- +# +# Copyright (C) 2025 CERN. +# +# cds-migrator-kit is free software; you can redistribute it and/or modify it under +# the terms of the MIT License; see LICENSE file for more details. + +"""Runner module.""" diff --git a/cds_migrator_kit/rdm/runner.py b/cds_migrator_kit/runner/runner.py similarity index 97% rename from cds_migrator_kit/rdm/runner.py rename to cds_migrator_kit/runner/runner.py index 73b0d5b2..c63798ae 100644 --- a/cds_migrator_kit/rdm/runner.py +++ b/cds_migrator_kit/runner/runner.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2022 CERN. +# Copyright (C) 2022-2025 CERN. # -# Invenio-RDM-Migrator is free software; you can redistribute it and/or modify +# cds-migrator-kit is free software; you can redistribute it and/or modify # it under the terms of the MIT License; see LICENSE file for more details. """InvenioRDM migration streams runner.""" diff --git a/cds_migrator_kit/videos/weblecture_migration/cli.py b/cds_migrator_kit/videos/weblecture_migration/cli.py index 64d2f450..981f8fcb 100644 --- a/cds_migrator_kit/videos/weblecture_migration/cli.py +++ b/cds_migrator_kit/videos/weblecture_migration/cli.py @@ -13,7 +13,7 @@ from flask import current_app from flask.cli import with_appcontext -from cds_migrator_kit.rdm.runner import Runner +from cds_migrator_kit.runner.runner import Runner from cds_migrator_kit.videos.weblecture_migration.streams import RecordStreamDefinition cli_logger = logging.getLogger("migrator") diff --git a/tests/data/all_fields.json b/tests/cds-rdm/data/all_fields.json similarity index 100% rename from tests/data/all_fields.json rename to tests/cds-rdm/data/all_fields.json diff --git a/tests/data/summer_note.json b/tests/cds-rdm/data/summer_note.json similarity index 100% rename from tests/data/summer_note.json rename to tests/cds-rdm/data/summer_note.json diff --git a/tests/cds-rdm/test_full_migration.py b/tests/cds-rdm/test_full_migration.py index b924e8ed..9468818c 100644 --- 
a/tests/cds-rdm/test_full_migration.py +++ b/tests/cds-rdm/test_full_migration.py @@ -25,9 +25,9 @@ from invenio_vocabularies.contrib.names.models import NamesMetadata from cds_migrator_kit.rdm.records.streams import RecordStreamDefinition -from cds_migrator_kit.rdm.runner import Runner from cds_migrator_kit.rdm.users.runner import SubmitterRunner from cds_migrator_kit.rdm.users.streams import SubmitterStreamDefinition +from cds_migrator_kit.runner.runner import Runner def suite_multi_field(record): @@ -356,7 +356,7 @@ def test_full_migration_stream( Name.index.refresh() mocker.patch( - "cds_migrator_kit.rdm.runner.Runner._read_config", + "cds_migrator_kit.runner.runner.Runner._read_config", return_value={ "db_uri": "postgresql://cds-rdm-migration:cds-rdm-migration@localhost:5432/cds-rdm-migration", "records": { diff --git a/tests/test_json_translation_rules.py b/tests/cds-rdm/test_json_translation_rules.py similarity index 100% rename from tests/test_json_translation_rules.py rename to tests/cds-rdm/test_json_translation_rules.py diff --git a/tests/conftest.py b/tests/cds-videos/conftest.py similarity index 100% rename from tests/conftest.py rename to tests/cds-videos/conftest.py