From 4170b154118e39087df3f88f56b4f88e0af8992a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= <zubeydeecivelek@gmail.com>
Date: Thu, 6 Feb 2025 17:31:32 +0100
Subject: [PATCH 1/4] videos: migration required transform rules

---
 README.rst                                    |  15 +
 cds_migrator_kit/videos/__init__.py           |   2 +-
 .../videos/weblecture_migration/__init__.py   |   2 +-
 .../videos/weblecture_migration/cli.py        |   3 +-
 .../weblecture_migration/load/__init__.py     |   2 +-
 .../videos/weblecture_migration/load/load.py  |   7 -
 .../videos/weblecture_migration/streams.py    |   2 +-
 .../videos/weblecture_migration/streams.yaml  |  20 +-
 .../transform/__init__.py                     |   2 +-
 .../transform/models/__init__.py              |   2 +-
 .../transform/models/base.py                  |   2 +-
 .../transform/models/video_lecture.py         | 199 ++++++-----
 .../transform/transform.py                    |  84 ++++-
 .../transform/xml_processing/__init__.py      |   2 +-
 .../xml_processing/quality/__init__.py        |   2 +-
 .../xml_processing/quality/contributors.py    |  21 +-
 .../xml_processing/rules/__init__.py          |   2 +-
 .../transform/xml_processing/rules/base.py    |  24 +-
 .../xml_processing/rules/video_lecture.py     | 200 +++++++++--
 tests/cds-videos/__init__.py                  |   9 +
 tests/cds-videos/data/lecture.json            |  26 ++
 .../cds-videos/test_videos_transform_rules.py | 316 ++++++++++++++++++
 tests/helpers.py                              |  43 +++
 23 files changed, 824 insertions(+), 163 deletions(-)
 create mode 100644 tests/cds-videos/__init__.py
 create mode 100644 tests/cds-videos/data/lecture.json
 create mode 100644 tests/cds-videos/test_videos_transform_rules.py

diff --git a/README.rst b/README.rst
index df521510..4305c175 100644
--- a/README.rst
+++ b/README.rst
@@ -7,6 +7,21 @@
  cds-migrator-kit
 ==================
 
+
+TODO change here:
+
+
+Default Installation (without RDM or Videos)
+pip install .
+
+Install for RDM
+
+pip install .[rdm]
+
+Install for Videos
+
+pip install .[videos]
+
 To run the interface:
 ```
 gunicorn -b :8080 --timeout 120 --graceful-timeout 60 cds_migrator_kit.app:app
diff --git a/cds_migrator_kit/videos/__init__.py b/cds_migrator_kit/videos/__init__.py
index d425caae..a9f8adc2 100644
--- a/cds_migrator_kit/videos/__init__.py
+++ b/cds_migrator_kit/videos/__init__.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of Invenio.
-# Copyright (C) 2024 CERN.
+# Copyright (C) 2025 CERN.
 #
 # cds-migrator-kit is free software; you can redistribute it and/or modify it
 # under the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/__init__.py b/cds_migrator_kit/videos/weblecture_migration/__init__.py
index 61138b57..31cfe2d5 100644
--- a/cds_migrator_kit/videos/weblecture_migration/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2024 CERN.
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/cli.py b/cds_migrator_kit/videos/weblecture_migration/cli.py
index 13d6e954..64d2f450 100644
--- a/cds_migrator_kit/videos/weblecture_migration/cli.py
+++ b/cds_migrator_kit/videos/weblecture_migration/cli.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
@@ -44,5 +44,6 @@ def run(dry_run=False):
         stream_definitions=[RecordStreamDefinition],
         config_filepath=Path(stream_config).absolute(),
         dry_run=dry_run,
+        collection="weblectures",
     )
     runner.run()
diff --git a/cds_migrator_kit/videos/weblecture_migration/load/__init__.py b/cds_migrator_kit/videos/weblecture_migration/load/__init__.py
index cad2a5c0..7030a9d9 100644
--- a/cds_migrator_kit/videos/weblecture_migration/load/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/load/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2024 CERN.
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/load/load.py b/cds_migrator_kit/videos/weblecture_migration/load/load.py
index 3ad5d23a..13d6db5d 100644
--- a/cds_migrator_kit/videos/weblecture_migration/load/load.py
+++ b/cds_migrator_kit/videos/weblecture_migration/load/load.py
@@ -17,17 +17,10 @@ def __init__(
         db_uri,
         data_dir,
         tmp_dir,
-        existing_data=False,
         entries=None,
         dry_run=False,
     ):
         """Constructor."""
-        self.db_uri = db_uri
-
-        self.data_dir = data_dir
-        self.tmp_dir = tmp_dir
-        self.existing_data = existing_data
-        self.entries = entries
         self.dry_run = dry_run
 
     def _prepare(self, entry):
diff --git a/cds_migrator_kit/videos/weblecture_migration/streams.py b/cds_migrator_kit/videos/weblecture_migration/streams.py
index 0fb53431..94e11af6 100644
--- a/cds_migrator_kit/videos/weblecture_migration/streams.py
+++ b/cds_migrator_kit/videos/weblecture_migration/streams.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/streams.yaml b/cds_migrator_kit/videos/weblecture_migration/streams.yaml
index 055c93da..5d3eb7c1 100644
--- a/cds_migrator_kit/videos/weblecture_migration/streams.yaml
+++ b/cds_migrator_kit/videos/weblecture_migration/streams.yaml
@@ -1,12 +1,10 @@
-data_dir: cds_migrator_kit/videos/weblecture_migration/data/
-tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp
-state_dir: cds_migrator_kit/videos/weblecture_migration/cache
-log_dir: cds_migrator_kit/videos/weblecture_migration/log
-db_uri: postgresql://cds-rdm:cds-rdm@localhost:5432/cds-rdm # TODO CHANGE
-old_secret_key: CHANGE_ME # TODO CHANGE
-new_secret_key: CHANGE_ME # TODO CHANGE
+db_uri: postgresql://cds-videos:cds-videos@localhost:5432/cds-videos
 records:
-  extract:
-    dirpath: cds_migrator_kit/videos/weblecture_migration/data/weblectures/dump/
-  transform:
-    files_dump_dir: cds_migrator_kit/videos/weblecture_migration/data/weblectures/files/
+  weblectures:
+    data_dir: cds_migrator_kit/videos/weblecture_migration/data/
+    tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp
+    log_dir: cds_migrator_kit/videos/weblecture_migration/log
+    extract:
+      dirpath: cds_migrator_kit/videos/weblecture_migration/data/weblectures/dump/
+    transform:
+      files_dump_dir: cds_migrator_kit/videos/weblecture_migration/data/weblectures/files/
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/__init__.py
index a8216c0e..faebe66e 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/models/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/models/__init__.py
index 6f9f543f..d1a351f4 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/models/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/models/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/models/base.py b/cds_migrator_kit/videos/weblecture_migration/transform/models/base.py
index 5885950c..1b01058c 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/models/base.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/models/base.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py
index a2d63dbc..09af17b8 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of CERN Document Server.
-# Copyright (C) 2024 CERN.
+# Copyright (C) 2025 CERN.
 #
 # Invenio is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -30,130 +30,177 @@ class VideoLecture(CdsOverdo):
     __query__ = '8567_.x:"Absolute master path" 8567_.d:/mnt/master_share* -980__.C:MIGRATED -980__.c:DELETED -5831_.a:digitized'
 
     __ignore_keys__ = {
-        "110__a",  # corporate author
-        "8567_y",  # File description
-        "111__c",  # Video location (indico location)
-        "518__r",  # Video/meeting location
+        "003",
+        # 340: Drop streaming video, anything else we copy over to curation field.
         "340__a",  # Physical medium -> curation field
         "340__d",  # Physical medium/recording technique -> curation field
+        "340__9",  # Physical medium/CD-ROM -> curation field
+        "340__k",  # Physical medium/ -> curation field
+        "340__j",  # Physical medium/ -> curation field
+        "340__8",  # Physical medium/id? -> curation field https://cds.cern.ch/record/2234827
+        # check with JY
         "961__x",  # Creation Date TODO? check with JY
         "961__c",  # modification Date TODO? check with JY
         "961__h",  # Hour? TODO? check with JY
         "961__l",  # Library? TODO? check with JY
+        "961__a",  # ? TODO? check with JY
+        "961__b",  # ? TODO? check with JY
         "964__a",  # Item owner TODO? check with JY
         "916__d",  # Status week? TODO? check with JY
         "901__u",  # Affiliation at Conversion? TODO? check with JY
         "583__a",  # Action note / curation TODO? check with JY
         "583__c",  # Action note / curation TODO? check with JY
         "583__z",  # Action note / curation TODO? check with JY
+        "916__n",  # Status week TODO? check with JY
+        "916__s",  # Status week TODO? check with JY
+        "916__w",  # Status week TODO? check with JY
+        "916__y",  # Status week TODO? check with JY
+        "916__a",  # Status week TODO? check with JY
+        "306__a",  # ? TODO? check with JY
+        "336__a",  # ? TODO? check with JY
+        "981__a",  # duplicate record id TODO? check with JY
+        "916__a",  # Status week TODO? check with JY
+        "916__d",  # Status week TODO? check with JY
+        "916__e",  # Status week TODO? check with JY
+        "916__s",  # Status week TODO? check with JY
+        "916__w",  # Status week TODO? check with JY
+        "916__y",  # Status week TODO? check with JY
+        "960__a",  # Base?
+        # Category, Collection, Series, Keywords
+        "980__a",  # collection tag
+        "980__b",  # Secondary collection indicator
         "65027a",  # TODO Subject category = Event?
+        "490__a",  # TODO Series
+        "490__v",  # Series: volume
+        "650172",  # subject provenance
+        "65017a",  # subject value
+        "6531_9",  # keyword provenance
+        "6531_a",  # keyword
+        "690C_a",  # collection name
+        # Conference Information/Indico
         "111__a",  # Title (indico)
         "111__9",  # Start date (indico)
         "111__g",  # Event id (indico)
         "111__z",  # End date (indico)
+        "111__c",  # Video location (indico location)
         "084__a",  # Indico?
         "084__2",  # Indico?
-        "8567_2",  # File system? 'MediaArchive'
-        "980__b",  # Secondary collection indicator
+        "518__r",  # Video/meeting location
+        "518__g",  # Lectures: conference identification
+        "970__a",  # alternative identifier, indico id?
+        # Copyright/License
         "542__d",  # Copyright holder
         "542__g",  # Copyright date
-        "490__a",  # TODO Series
-        "8567_u",  # File url
+        "542__3",  # Copyright materials
+        "540__a",  # License
+        "540__b",  # License person/organization
+        "540__u",  # License URL
+        "540__3",  # License material
+        # Alternative identifiers
         "962__n",  # `Presented at` note (conference/linked document)
         "962__b",  # `Presented at` record (conference/linked document)
-        "518__g",  # Lectures: conference identification
-        "490__v",  # Series: volume
-        "269__b",  # Name of publ.
         "088__9",  # Report number (make it alternative identifier with cds reference?)
         "088__z",  # Report number (make it alternative identifier with cds reference?)
-        # Files
-        "8564_q",  # File type (digitized) # TODO this record has both lecturemedia and DM https://cds.cern.ch/record/589875
+        "035__9",  # Inspire schema (Indico/AgendaMaker)
+        "035__a",  # Inspire id value
+        "088__a",  # Report Number --> alternative identifier with cds reference
+        # Additional Title, Volume, Note
+        "246__a",  # Additional title
+        "246__i",  # Additional title/display text
+        "246__b",  # Additional title remaining
+        "246__n",  # Volume
+        "246__p",  # Volume
+        "500__a",  # Note (-> internal note)
+        "500__b",  # Note (-> internal note)
+        "500__9",  # Note/type (-> internal note) https://cds.cern.ch/record/1561636
+        # Restricted
+        "5061_f",
+        "5061_d",
+        "5061_5",
+        "5061_a",
+        "5061_2",
+        # Location (Shelving/Library)
         "852__c",  # Location (Shelving/Library)
+        "852__b",  # Location (Shelving/?)
+        "852__8",  # Location (Shelving/id?) https://cds.cern.ch/record/2234827
         "852__h",  # Location (Shelving) example: https://cds.cern.ch/record/254588/
         "852__a",  # Location (Shelving) example: https://cds.cern.ch/record/558348
         "852__x",  # Location (Shelving/ type? DVD) example: https://cds.cern.ch/record/690000/
+        "852__9",  # Location (Shelving/ note?) example: https://cds.cern.ch/record/2233722
+        # Date/Extra Redundant
+        "260__c",  # Redundant (more detailed value is in 269__c imprint.pub_date)
+        "260__a",
+        "260__b",
+        # Contributor?
+        "110__a",  # corporate author
+        "700__m",  # author's email
+        "270__p",  # document contact --> add as a contributor with a correct role
+        # Internal Note
+        "595__a",  # Internal Note --> curation field
+        "595__z",  # SOME RECORD HAVE UNCL as value, do we keep it? what does UNCL mean
+        # Collaboration --> add new role to contributors
+        "710__5",  # department / organisation author
+        "710__a",  # organisation author
+        "710__g",  # organisation author
+        # Accelerator/Facility, Experiment, Project, Study
+        "693__a",  # accelerator, create a custom field?
+        "693__e",  # experiments
+        "693__p",  # project
+        "693__s",  # study
+        # Submitter
+        "859__f",  # creator's email
+        "8560_f",  # submitter email
+        # OAI
+        "0248_a",  # oai identifier
+        "0248_p",  # oai identifier
+        "0248_q",
         # IGNORE
         "518__h",  # Lectures: Starting time
         "300__2",  # Imprint
         "300__b",  # Imprint
+        "300__8",  # Imprint
         "300__a",  # Number of pages / duration
         "250__a",  # Edition
         "700__0",  # Author id (eg: AUTHOR|(CDS)2067852)
         "518__l",  # Lectures: length of speech
-        # TODO copied from ssn
-        "0248_a",  # oai identifier, not needed to migrate, TBD
-        "0248_p",  # oai identifier, not needed to migrate, TBD
-        "0248_q",  # full text tag 2778897
-        "100__m",  # author's email <-- decided not to keep in RDM,
-        "260__c",  # Redundant (more detailed value is in 269__c imprint.pub_date)
-        "269__a",  # imprint place
-        "270__m",  # document contact email
-        "595__a",  # always value CERN EDS, not displayed, TODO: do we keep?
-        "595__z",  # SOME RECORD HAVE UNCL as value, do we keep it? what does UNCL mean
-        "700__m",  # author's email <-- decided not to keep in RDM,
-        "710__5",  # department / organisation author
-        "710__a",  # organisation author
-        "8564_8",  # Files system field
-        "8564_s",  # Files system field
-        "8564_u",  # Files
-        "8564_x",  # Files system field
-        "8564_y",  # Files
-        "937__c",  # modification date
-        "937__s",  # modification person
-        "960__a",  # collection id? usually value 12, to confirm if we ignore
-        "980__a",  # collection tag
-        "981__a",  # duplicate record id
-        "003",
-        "035__9",  # Inspire schema
-        "035__a",  # Inspire id value
-        "037__a",  # (Report number) alternative identifiers -> scheme "CDS REFERENCE"
-        "088__a",  # RN (manual introduced?) second report number (so the identifiers schemas are not unique!)
-        "246__a",
-        "246__i",  # abbreviation
-        "246__i",  # abbreviation tag, applies to value of 246__A
-        "270__p",  # document contact person name
-        "500__a",  # Note (-> description.type = other)
-        "562__c",  # note
-        "650172",  # subject provenance
-        "65017a",  # subject value
-        "6531_9",  # keyword provenance
-        "6531_a",  # keyword
-        "690C_a",  # collection name, not needed values(to drop: INTNOTE, CERN, otherwise parse PUBL to retrieve the department, if department not present in any other field)
-        "6931_9",  # keyword
-        "6931_a",  # keyword
-        "693__a",  # accelerator, do we create a custom field?
-        "693__b",  # beams recid: 2640381
-        "693__e",  # custom_fields.cern:experiments
-        "693__f",  # facility, do we create a custom field?
-        "693__p",  # project, do we create a custom field?
-        "693__s",  # study,  do we create a custom field?
-        "710__g",  # Collaboration, OK to migrate as corporate contributor (not creator)?
-        "859__f",  # creator's email, to be used to determine the owner
-        "916__n",
-        "916__s",
-        "916__w",
-        "963__a",
-        "970__a",  # alternative identifier, scheme ALEPH
+        "100__0",  # Author id (eg: AUTHOR|(CDS)2067852)
+        "240__a",  # Decided to drop, (Streaming Video)
+        "337__a",  # Decided to drop, (Video)
+        "963__a",  # values: PUBLIC/RESTRICTED
+        "8564_8",  # File: bibdoc id
+        "8564_s",  # File: file size
         # IMPLEMENTED
         # "520__a",  # Note (-> description.type = abstract
         # "001",
         # "041__a",  # languages
-        # "906__p",  # names, is it supervisor?
+        # "906__p",  # event speakers
         # "100__9",  # #BEARD# tag
         # "100__a",
         # "100__u",  # Author affiliation
-        # "700__e", # Contributor/Speaker role
-        # "700__0",  # Contributors (cds author id) - TBD if we keep, same with INSPIRE ID
+        # "700__e",  # Contributor/Speaker role
+        # "700__0",  # Contributors (cds author id)
         # "700__9",  # #BEARD# tag
         # "700__a",  # Contributors (full name)
         # "700__u",  # Contributors (affiliation)
-        # "518__d", # Full date/time
-        # "269__c", # Date (full date/year)
-        # "518__a", # date?
+        # "518__d",  # Full date/time
+        # "269__c",  # Date (full date/year)
+        # "269__b",  # CERN (checked for other values)
+        # "269__a",  # Geneva (checked for other values)
+        # "518__a",  # Date
+        # "906__u", # Contributor Affiliation
+        # "511__u",  # Contributor Affiliation
+        # "8567_u",  # File url
+        # "8567_y",  # File description
+        # "8567_2",  # File system? 'MediaArchive'
+        # "8564_q",  # File type (digitized)
+        # "8564_8",  # Files system field
+        # "8564_s",  # Files system field
+        # "8564_u",  # Files
+        # "8564_x",  # Files system field
+        # "8564_y",  # Files
     }
 
 
 model = VideoLecture(
-    bases=(base_model,),
-    entry_point_group="cds_migrator_kit.videos.rules.video_lecture",
+    bases=(base_model,), entry_point_group="cds_migrator_kit.videos.rules.video_lecture"
 )
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py
index f17c2890..dfbee8ad 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py
@@ -96,18 +96,52 @@ def _media_files(self, entry):
     def _metadata(self, entry):
         """Transform the metadata of a record."""
 
+        def extract_dates(json_data, key, subkey=None):
+            """Extracts date values from a given key in json_data."""
+            items = json_data.get(key, [])
+            if subkey:
+                return {
+                    item[subkey]["date"]
+                    for item in items
+                    if isinstance(item, dict)
+                    and subkey in item
+                    and isinstance(item[subkey], dict)
+                    and "date" in item[subkey]
+                }
+
+            return {
+                item["date"]
+                for item in items
+                if isinstance(item, dict) and "date" in item
+            }
+
         def reformat_date(json_data):
             """Reformat the date for the cds-videos data model."""
-            dates = json_data.get("date", [])
-            dates_set = {date for date in dates if date is not None}
+            # 1. Check primary date field
+            dates_set = {date for date in json_data.get("date", []) if date}
 
-            if len(dates_set) == 1:  # Should be only one value
-                return next(iter(dates_set))  # Get the single date from the set
-            if len(dates_set) > 1:
-                return next(iter(dates_set))  # return the first
+            # 2. If no date found, check the `indico` entries in `url_files`
+            if not dates_set:
+                dates_set = extract_dates(json_data, "url_files", subkey="indico")
 
-            raise UnexpectedValue(
-                "No valid date found in record: {}.".format(json_data.get("recid"))
+            # 3. If still no date found, check `internal_notes`
+            if not dates_set:
+                dates_set = extract_dates(json_data, "internal_notes")
+
+            # 4. Return the valid date if only one is found
+            if len(dates_set) == 1:
+                return next(iter(dates_set))
+
+            # 5. Multiple dates (Must have different indico event videos?)
+            if len(dates_set) > 1:
+                raise UnexpectedValue(
+                    f"More than one date found in record: {json_data.get('recid')} dates: {dates_set}.",
+                    stage="transform",
+                )
+
+            raise MissingRequiredField(
+                f"No valid date found in record: {json_data.get('recid')}.",
+                stage="transform",
             )
 
         def description(json_data):
@@ -116,11 +150,41 @@ def description(json_data):
                 return json_data.get("title").get("title")
             return json_data.get("description")
 
+        def format_contributors(json_data):
+            """
+            Same contributors could be both in tag 700 and 906.
+
+            TODO: Should we keep them both? https://cds.cern.ch/record/2233152/export/xm?ln=en
+            Removes duplicate contributors based on name, role, and affiliations.
+            """
+            contributors = json_data.get("contributors")
+            if not contributors:
+                raise MissingRequiredField(
+                    f"No valid contributor found in record: {json_data.get('recid')}.",
+                    stage="transform",
+                )
+
+            unique_contributors = []
+            seen = set()
+
+            for contributor in contributors:
+                # Create a tuple to identify contributors
+                identifier = (
+                    contributor["name"],
+                    contributor.get("role"),
+                    tuple(contributor.get("affiliations", [])),
+                )
+                if identifier not in seen:
+                    seen.add(identifier)
+                    unique_contributors.append(contributor)
+
+            return unique_contributors
+
         metadata = {
             "title": entry["title"],
             "description": description(entry),
-            "contributors": entry.get("contributors"),
-            "languages": entry.get("language"),
+            "contributors": format_contributors(entry),
+            "language": entry.get("language"),
             "date": reformat_date(entry),
         }
         # filter empty keys
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/__init__.py
index ce5e1662..29b34a6c 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/__init__.py
index d6e355a1..bd60b721 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py
index 9e5a6577..f9fcd76a 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
@@ -38,4 +38,21 @@ def get_contributor_role(subfield, role, raise_unexpected=False):
     return translations[clean_role]
 
 
-# TODO contributor affiliation will be implemented
+def get_contributor(key, value):
+    """Create contributor json from an author field (creators/contributors rules)."""
+    beard = value.get("9")
+    if beard is not None and beard != "#BEARD#":
+        # checking if anything else stored in this field
+        # historically it was some kind of automatic script tagging
+        # and it should be ignored if value == #BEARD#
+        raise UnexpectedValue(field=key, subfield="9", value=beard)
+    name = value.get("a").strip()
+    affiliation = value.get("u", "")
+    role = value.get("e", "")
+    contributor = {"name": name}
+    if role:
+        role = get_contributor_role("e", role)
+        contributor.update({"role": role})
+    if affiliation:
+        contributor.update({"affiliations": [affiliation]})
+    return contributor
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/__init__.py
index aa43604f..72582f04 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/__init__.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of Invenio.
-# Copyright (C) 2024 CERN.
+# Copyright (C) 2025 CERN.
 #
 # cds-migrator-kit is free software; you can redistribute it and/or modify it
 # under the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py
index fd751c6f..fc10c171 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
 #
 # CDS-Videos is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
@@ -21,7 +21,7 @@
 )
 
 from ...models.base import model
-from ..quality.contributors import get_contributor_role
+from ..quality.contributors import get_contributor
 
 
 @model.over("legacy_recid", "^001")
@@ -68,26 +68,12 @@ def language(self, key, value):
 @require(["a"])
 def creators(self, key, value):
     """Translates the creators field."""
-    role = value.get("e")
-    if role:
-        role = get_contributor_role("e", role)
-    beard = value.get("9")
-    if beard is not None and beard != "#BEARD#":
-        # checking if anything else stored in this field
-        # historically it was some kind of automatic script tagging
-        # and it should be ignored if value == #BEARD#
-        raise UnexpectedValue(field=key, subfield="9", value=beard)
-    name = value.get("a").strip()
-    contributor = {"name": name}
-    if role:
-        contributor.update({"role": role})
-    # TODO contributor affiliation will be implemented
-
-    return contributor
+    return get_contributor(key, value)
 
 
 @model.over("contributors", "^700__")
+@for_each_value
 @require(["a"])
 def contributors(self, key, value):
     """Translates contributors."""
-    return creators(self, key, value)
+    return get_contributor(key, value)
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py
index 205ac8aa..dd807548 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of CERN Document Server.
-# Copyright (C) 2024 CERN.
+# Copyright (C) 2025 CERN.
 #
 # Invenio is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -17,8 +17,11 @@
 # along with Invenio; if not, write to the Free Software Foundation, Inc.,
 # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 """Common Videos fields."""
+import re
+
 from dateutil.parser import ParserError, parse
 
+from cds_migrator_kit.errors import UnexpectedValue
 from cds_migrator_kit.transform.xml_processing.quality.decorators import (
     for_each_value,
     require,
@@ -28,39 +31,61 @@
 from ...models.video_lecture import model
 
 
+def parse_date(date_str):
+    """Parse a date string and return it formatted as 'YYYY-MM-DD'.
+
+    Returns None if the string is missing, too short, too long,
+    or if it contains incomplete or ambiguous date information.
+
+    Examples of values that yield None:
+    - Some values only contain the year (e.g., "1998")
+    - Some values hold a date range (e.g., "23 - 27 Nov 1998")
+    """
+    date_str = (date_str or "").strip()  # padding must not count as content
+    if not date_str:
+        return None
+    # Too short/long (under 10 or over 13 chars) to hold one full date
+    if not 10 <= len(date_str) <= 13:
+        return None
+    try:
+        return parse(date_str).strftime("%Y-%m-%d")
+    except (ParserError, OverflowError):  # unparsable or out-of-range value
+        return None
+
+
 @model.over("date", "^518__")
 @for_each_value
 def date(self, key, value):
     """Translates date from tag 518."""
-
-    def parse_date(date_str):
-        """Parses a date string into 'YYYY-MM-DD' format."""
-        try:
-            if len(date_str) < 10:  # Too short to have the full date info
-                return None
-            parsed_date = parse(date_str)
-            return parsed_date.strftime("%Y-%m-%d")
-        except ParserError:
-            return
-
-    # List of possible subfields containing dates
-    possible_date_fields = [
-        value.get("d"),  # 518 'd' subfield (e.g., '2024-11-19T14:00:00')
-        value.get("c"),  # 269 'c' subfield (e.g., '1993-08-09')
-        value.get("a"),  # 518 'a' subfield (e.g., 'CERN, Geneva, 23 - 27 Nov 1998')
-    ]
-
-    for date_field in possible_date_fields:
-        if date_field:
-            parsed_date = parse_date(date_field)
-            if parsed_date:  # If parsing succeeds, return the formatted date
-                return parsed_date
+    # 518 'd': first 10 chars only, dropping any time part (e.g., 2008-03-11T14:00:00Z)
+    parsed_date = parse_date((value.get("d") or "")[:10])
+    if parsed_date:
+        return parsed_date
+    # 518 'a': parse only what follows the last comma ('CERN, Geneva, 23 - 27 Nov 1998')
+    parsed_date = parse_date(value.get("a", "").split(",")[-1])
+    if parsed_date:
+        return parsed_date
 
 
 @model.over("date", "^269__")
+@for_each_value
 def imprint(self, key, value):
     """Translates date from tag 269."""
-    return date(self, key, value)
+    name = value.get("b")
+    if name and name.strip().upper() != "CERN":
+        # 'b' is not migrated: "CERN" (any case/padding) is silently dropped,
+        # while any other value is reported as unexpected
+        raise UnexpectedValue(field=key, subfield="b", value=name)
+    place = value.get("a")
+    if place and place.strip().upper() != "GENEVA":
+        # 'a' is not migrated: "Geneva" (any case/padding) is silently dropped,
+        # while any other value is reported as unexpected
+        raise UnexpectedValue(field=key, subfield="a", value=place)
+
+    date_field = value.get("c")  # 269 'c' subfield (e.g., '1993-08-09')
+    parsed_date = parse_date(date_field)
+    if parsed_date:  # otherwise fall through and implicitly return None
+        return parsed_date
 
 
 @model.over("contributors", "^511__")
@@ -71,7 +96,9 @@ def performer(self, key, value):
     name = value.get("a").strip()
     role = value.get("e")
     contributor = {"name": name, "role": "Performer"}  # TODO or "Participant"
-    # TODO contributor affiliation will be implemented
+    affiliation = value.get("u", "")
+    if affiliation:
+        contributor.update({"affiliations": [affiliation]})
     return contributor
 
 
@@ -82,5 +109,124 @@ def event_speakers(self, key, value):
     """Translates event_speakers."""
     name = value.get("p").strip()
     contributor = {"name": name, "role": "Speaker"}
-    # TODO contributor affiliation will be implemented
+    affiliation = value.get("u", "")
+    if affiliation:
+        contributor.update({"affiliations": [affiliation]})
     return contributor
+
+
+@model.over("url_files", "^8564_")
+@for_each_value
+@require(["u"])
+def url_files(self, key, value):
+    """Classify an 8564 link as digitized file, Indico link or plain URL file."""
+    url = value.get("u")
+    if "digital-memory" in url:
+        return {
+            "digitized": {
+                "url": url,
+                "format": value.get("q"),
+                "link_text": value.get("y"),
+                "public_note": value.get("z"),
+                "nonpublic_note": value.get("x"),
+                "md5_checksum": value.get("w"),
+                "source": value.get("2"),
+            }
+        }
+    elif "indico" in url or "agenda" in url:
+        indico_link = {"url": url}
+
+        # Try to get event id (the capture group can never match an empty string)
+        match_id = re.search(r"(?:ida=|confId=|event/)([\w\d]+)", url)
+        if match_id:
+            indico_link["event_id"] = match_id.group(1)
+
+        # Try to get the date from text
+        text = value.get("y")
+        if text:
+            indico_link["text"] = text
+        # 'y' is optional: search "" rather than None, which re.search rejects
+        match_date = re.search(r"(?:Talk\s*)?(\d{1,2}\s\w{3}\s\d{4})", text or "")
+        if match_date:
+            parsed_date = parse_date(match_date.group(1))
+            if parsed_date:
+                indico_link["date"] = parsed_date
+
+        return {"indico": indico_link}
+
+    # Any other link is kept as a plain URL file
+    url_file = {"url_file": {"url": url}}
+    text = value.get("y")
+    if text:
+        url_file["url_file"]["text"] = text
+
+    nonpublic_note = value.get("x")
+    if nonpublic_note:
+        url_file["url_file"]["nonpublic_note"] = nonpublic_note
+    return url_file
+
+
+@model.over("internal_notes", "^500__")
+@for_each_value
+@require(["a"])
+def internal_notes(self, key, value):
+    """Build an internal note from 500 'a', prefixed by '9' when present."""
+    text = value.get("a").strip()
+    prefix = value.get("9")
+    if prefix:
+        text = f"{prefix.strip()} : {text}"
+    result = {"note": text}
+
+    # A date is only looked for in the part following the last comma
+    _, comma, tail = text.rpartition(",")
+    candidate = parse_date(tail.strip()) if comma else None
+    if candidate:
+        result["date"] = candidate
+
+    return result
+
+
+@model.over("files", "^8567_")
+@for_each_value
+def files(self, key, value):
+    """Translate an 8567 entry into a dict with master path and/or file info."""
+    source = value.get("2")
+    if source and source.strip() != "MediaArchive":
+        # Only "MediaArchive" is accepted as source; report anything else
+        raise UnexpectedValue(field=key, subfield="2", value=source)
+
+    file = {}
+
+    # Master path
+    master_path = value.get("d", "").strip()
+    if master_path:
+        if master_path.startswith("/mnt/master_share"):
+            file["master_path"] = master_path
+            file_type = value.get("x", "").strip()
+            if file_type and file_type != "Absolute master path":
+                # A master path entry may only be typed "Absolute master path"
+                raise UnexpectedValue(field=key, subfield="x", value=file_type)
+        else:
+            # Master paths outside /mnt/master_share are reported
+            raise UnexpectedValue(field=key, subfield="d", value=master_path)
+
+    # File with url/path
+    url = value.get("u", "").strip()
+    if url:
+        if url.startswith("/"):
+            file["path"] = url  # Relative path
+        elif url.startswith("https://lecturemedia.cern.ch"):
+            file["url"] = url
+            # count=1: strip only the leading host, never a later occurrence
+            file["path"] = url.replace("https://lecturemedia.cern.ch", "", 1)
+        else:
+            # Neither a relative path nor a lecturemedia URL: report it
+            raise UnexpectedValue(field=key, subfield="u", value=url)
+        file_type = value.get("x")
+        if file_type:
+            file["type"] = file_type.strip()
+        description = value.get("y")
+        if description:
+            file["description"] = description.strip()
+
+    return file
diff --git a/tests/cds-videos/__init__.py b/tests/cds-videos/__init__.py
new file mode 100644
index 00000000..c001bb54
--- /dev/null
+++ b/tests/cds-videos/__init__.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of CDS.
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it
+# under the terms of the MIT License; see LICENSE file for more details.
+
+"""Migration tool kit from old CDS to new CDS-Videos - test suite."""
diff --git a/tests/cds-videos/data/lecture.json b/tests/cds-videos/data/lecture.json
new file mode 100644
index 00000000..3b0c0e04
--- /dev/null
+++ b/tests/cds-videos/data/lecture.json
@@ -0,0 +1,26 @@
+[
+  {
+    "files": [],
+    "collections": null,
+    "recid": 2233152,
+    "record": [
+      {
+        "marcxml": "<record>\n  <controlfield tag=\"001\">2233152</controlfield>\n  <controlfield tag=\"005\">20240626123513.0</controlfield>\n  <datafield tag=\"024\" ind1=\"8\" ind2=\" \">\n    <subfield code=\"a\">oai:cds.cern.ch:2233152</subfield>\n    <subfield code=\"p\">cerncds:TALK</subfield>\n  </datafield>\n  <datafield tag=\"041\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">eng</subfield>\n  </datafield>\n  <datafield tag=\"110\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">CERN. Geneva</subfield>\n  </datafield>\n  <datafield tag=\"111\" ind1=\" \" ind2=\" \">\n    <subfield code=\"9\">2016-10-24T10:00:00</subfield>\n    <subfield code=\"a\">Glimos Instructions for CMS Underground Guiding - in english</subfield>\n    <subfield code=\"c\">CERN - 513-R-068</subfield>\n    <subfield code=\"g\">588590</subfield>\n    <subfield code=\"z\">2016-10-24T12:00:00</subfield>\n  </datafield>\n  <datafield tag=\"245\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">Glimos Instructions for CMS Underground Guiding - in english</subfield>\n  </datafield>\n  <datafield tag=\"260\" ind1=\" \" ind2=\" \">\n    <subfield code=\"c\">2016</subfield>\n  </datafield>\n  <datafield tag=\"269\" ind1=\" \" ind2=\" \">\n    <subfield code=\"c\">2016-10-24</subfield>\n  </datafield>\n  <datafield tag=\"300\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">317</subfield>\n  </datafield>\n  <datafield tag=\"542\" ind1=\" \" ind2=\" \">\n    <subfield code=\"d\">CERN</subfield>\n    <subfield code=\"g\">2016</subfield>\n  </datafield>\n  <datafield tag=\"340\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">Streaming video</subfield>\n  </datafield>\n  <datafield tag=\"490\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">e-learning</subfield>\n  </datafield>\n  <datafield tag=\"518\" ind1=\" \" ind2=\" \">\n    <subfield code=\"d\">2016-10-24T10:00:00</subfield>\n  </datafield>\n  <datafield tag=\"520\" ind1=\" \" ind2=\" \">\n    <subfield 
code=\"a\">&lt;!--HTML-->&lt;p>In this &lt;strong>presentation in english&lt;/strong>, the basic safety rules for CMS underground visits are explained. The trainees are taught how to plan/organize a CMS underground visit along important safety aspects of the CMS underground (Point 5).&lt;/p>\n\n&lt;p>Content owners and presenters (CMS safety team) :&lt;/p>\n\n&lt;p>Niels Dupont (in french), Michael Brodski (in german), William Esposito (in english)&lt;/p>\n\n&lt;p>A pdf document on the subject is available as material from the indico event page. (TO BE DONE from &lt;a href=\"https://twiki.cern.ch/Edutech/CMSGlimosInstructions\">https://twiki.cern.ch/Edutech/CMSGlimosInstructions&lt;/a>!)&lt;/p>\n\n&lt;p>&amp;nbsp;&lt;/p>\n\n&lt;p>Tell us what you think via e-learning.support at cern.ch More tutorials in the e-learning collection of the CERN Document Server (CDS) &lt;a href=\"http://cds.cern.ch/collection/E-learning%20modules?ln=en\">http://cds.cern.ch/collection/E-learning%20modules?ln=en&lt;/a>&lt;/p>\n\n&lt;p>All info about the CERN rapid e-learning project is linked from &lt;a href=\"http://twiki.cern.ch/ELearning\">http://twiki.cern.ch/ELearning&lt;/a>&lt;/p>\n\n&lt;p>&amp;nbsp;&lt;/p></subfield>\n  </datafield>\n  <datafield tag=\"650\" ind1=\"1\" ind2=\"7\">\n    <subfield code=\"a\">e-learning</subfield>\n  </datafield>\n  <datafield tag=\"650\" ind1=\"2\" ind2=\"7\">\n    <subfield code=\"a\">Event</subfield>\n  </datafield>\n  <datafield tag=\"690\" ind1=\"C\" ind2=\" \">\n    <subfield code=\"a\">TALK</subfield>\n  </datafield>\n  <datafield tag=\"690\" ind1=\"C\" ind2=\" \">\n    <subfield code=\"a\">CERN</subfield>\n  </datafield>\n  <datafield tag=\"690\" ind1=\"C\" ind2=\" \">\n    <subfield code=\"a\">movingimages</subfield>\n  </datafield>\n  <datafield tag=\"700\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">Brodski, Michael</subfield>\n    <subfield code=\"e\">speaker</subfield>\n    <subfield code=\"u\">Rheinisch-Westfaelische Tech. Hoch. 
(DE)</subfield>\n  </datafield>\n  <datafield tag=\"700\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">Dupont, Niels</subfield>\n    <subfield code=\"e\">speaker</subfield>\n    <subfield code=\"u\">CERN</subfield>\n  </datafield>\n  <datafield tag=\"700\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">Esposito, William</subfield>\n    <subfield code=\"e\">speaker</subfield>\n    <subfield code=\"u\">CERN</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"4\" ind2=\" \">\n    <subfield code=\"u\">https://indico.cern.ch/event/588590/</subfield>\n    <subfield code=\"y\">Event details</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"d\">/mnt/master_share/master_data/2016/588590</subfield>\n    <subfield code=\"x\">Absolute master path</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">/2016/588590/588590_en.vtt</subfield>\n    <subfield code=\"x\">subtitle</subfield>\n    <subfield code=\"y\">subtitle English</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">/2016/588590/588590_fr.vtt</subfield>\n    <subfield code=\"x\">subtitle</subfield>\n    <subfield code=\"y\">subtitle Fran\u00e7ais</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">https://lecturemedia.cern.ch/2016/588590/588590-posterframe-640x360-at-5.0-percent.jpg</subfield>\n    <subfield code=\"x\">pngthumbnail</subfield>\n    <subfield code=\"y\">thumbnail weblecture</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield 
code=\"u\">https://lecturemedia.cern.ch/2016/588590/588590-1000-kbps-853x480-25-fps-audio-96-kbps-44-kHz-stereo.mp4</subfield>\n    <subfield code=\"x\">video/mp4</subfield>\n    <subfield code=\"y\">Content: presenter. Resolution: 853x480. Baudrate: 1000</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">https://lecturemedia.cern.ch/2016/588590/588590-2000-kbps-1280x720-25-fps-audio-96-kbps-44-kHz-stereo.mp4</subfield>\n    <subfield code=\"x\">video/mp4</subfield>\n    <subfield code=\"y\">Content: presenter. Resolution: 1280x720. Baudrate: 2000</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">https://lecturemedia.cern.ch/2016/588590/588590-4000-kbps-1920x1080-25-fps-audio-96-kbps-44-kHz-stereo.mp4</subfield>\n    <subfield code=\"x\">video/mp4</subfield>\n    <subfield code=\"y\">Content: presenter. Resolution: 1920x1080. Baudrate: 4000</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">https://lecturemedia.cern.ch/2016/588590/588590-512-kbps-426x240-25-fps-audio-96-kbps-44-kHz-stereo.mp4</subfield>\n    <subfield code=\"x\">video/mp4</subfield>\n    <subfield code=\"y\">Content: presenter. Resolution: 426x240. Baudrate: 512</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">https://lecturemedia.cern.ch/2016/588590/588590-800-kbps-640x360-25-fps-audio-96-kbps-44-kHz-stereo.mp4</subfield>\n    <subfield code=\"x\">video/mp4</subfield>\n    <subfield code=\"y\">Content: presenter. Resolution: 640x360. 
Baudrate: 800</subfield>\n  </datafield>\n  <datafield tag=\"859\" ind1=\" \" ind2=\" \">\n    <subfield code=\"f\">maria.dimou@cern.ch</subfield>\n  </datafield>\n  <datafield tag=\"906\" ind1=\" \" ind2=\" \">\n    <subfield code=\"p\">Brodski, Michael</subfield>\n    <subfield code=\"u\">Rheinisch-Westfaelische Tech. Hoch. (DE)</subfield>\n  </datafield>\n  <datafield tag=\"906\" ind1=\" \" ind2=\" \">\n    <subfield code=\"p\">Dupont, Niels</subfield>\n    <subfield code=\"u\">CERN</subfield>\n  </datafield>\n  <datafield tag=\"906\" ind1=\" \" ind2=\" \">\n    <subfield code=\"p\">Esposito, William</subfield>\n    <subfield code=\"u\">CERN</subfield>\n  </datafield>\n  <datafield tag=\"961\" ind1=\" \" ind2=\" \">\n    <subfield code=\"c\">2016-11-14T14:42:36</subfield>\n    <subfield code=\"x\">2016-11-14T14:41:23</subfield>\n  </datafield>\n  <datafield tag=\"963\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">PUBLIC</subfield>\n  </datafield>\n  <datafield tag=\"970\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">INDICO.588590</subfield>\n  </datafield>\n  <datafield tag=\"980\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">Indico</subfield>\n    <subfield code=\"b\">e-learning</subfield>\n  </datafield>\n</record>",
+        "json": null,
+        "modification_datetime": "2024-06-26T10:35:13+00:00"
+      }
+    ]
+  },
+  {
+    "files": [],
+    "collections": null,
+    "recid": 489562,
+    "record": [
+      {
+        "marcxml": "<record>\n  <controlfield tag=\"001\">489562</controlfield>\n  <controlfield tag=\"003\">SzGeCERN</controlfield>\n  <controlfield tag=\"005\">20240626120523.0</controlfield>\n  <datafield tag=\"024\" ind1=\"8\" ind2=\" \">\n    <subfield code=\"a\">oai:cds.cern.ch:489562</subfield>\n    <subfield code=\"p\">forSciTalks</subfield>\n    <subfield code=\"p\">cerncds:TALK</subfield>\n    <subfield code=\"q\">cerncds:FULLTEXT</subfield>\n    <subfield code=\"q\">DOI</subfield>\n    <subfield code=\"q\">cerncds:TALK:FULLTEXT</subfield>\n    <subfield code=\"q\">INIS</subfield>\n  </datafield>\n  <datafield tag=\"035\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">2243887CERCER</subfield>\n  </datafield>\n  <datafield tag=\"035\" ind1=\" \" ind2=\" \">\n    <subfield code=\"9\">AgendaMaker</subfield>\n    <subfield code=\"a\">a032617</subfield>\n  </datafield>\n  <datafield tag=\"035\" ind1=\" \" ind2=\" \">\n    <subfield code=\"9\">Indico</subfield>\n    <subfield code=\"a\">a032617</subfield>\n  </datafield>\n  <datafield tag=\"035\" ind1=\" \" ind2=\" \">\n    <subfield code=\"9\">Indico</subfield>\n    <subfield code=\"a\">a032618</subfield>\n  </datafield>\n  <datafield tag=\"035\" ind1=\" \" ind2=\" \">\n    <subfield code=\"9\">Indico</subfield>\n    <subfield code=\"a\">a032619</subfield>\n  </datafield>\n  <datafield tag=\"035\" ind1=\" \" ind2=\" \">\n    <subfield code=\"9\">Indico</subfield>\n    <subfield code=\"a\">a032620</subfield>\n  </datafield>\n  <datafield tag=\"035\" ind1=\" \" ind2=\" \">\n    <subfield code=\"9\">Indico</subfield>\n    <subfield code=\"a\">a032621</subfield>\n  </datafield>\n  <datafield tag=\"041\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">eng</subfield>\n  </datafield>\n  <datafield tag=\"245\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">Issues in arms control</subfield>\n  </datafield>\n  <datafield tag=\"260\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">Geneva</subfield>\n    <subfield 
code=\"b\">CERN</subfield>\n    <subfield code=\"c\">2001</subfield>\n  </datafield>\n  <datafield tag=\"300\" ind1=\" \" ind2=\" \">\n    <subfield code=\"b\">720x576 4/3, 25</subfield>\n    <subfield code=\"a\">4334</subfield>\n  </datafield>\n  <datafield tag=\"340\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">paper</subfield>\n  </datafield>\n  <datafield tag=\"490\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">CERN Academic Training Lecture</subfield>\n    <subfield code=\"v\">392</subfield>\n  </datafield>\n  <datafield tag=\"490\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">Regular Lecture Programme</subfield>\n  </datafield>\n  <datafield tag=\"500\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">CERN, Geneva, 12 - 16 Feb 2001</subfield>\n  </datafield>\n  <datafield tag=\"540\" ind1=\" \" ind2=\" \">\n    <subfield code=\"3\">Report</subfield>\n    <subfield code=\"a\">CC-BY-3.0</subfield>\n  </datafield>\n  <datafield tag=\"542\" ind1=\" \" ind2=\" \">\n    <subfield code=\"3\">Report</subfield>\n    <subfield code=\"d\">CERN</subfield>\n    <subfield code=\"g\">2001</subfield>\n  </datafield>\n  <datafield tag=\"595\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">OA</subfield>\n  </datafield>\n  <datafield tag=\"650\" ind1=\"1\" ind2=\"7\">\n    <subfield code=\"2\">SzGeCERN</subfield>\n    <subfield code=\"a\">Commerce, Economics, Social Science</subfield>\n  </datafield>\n  <datafield tag=\"690\" ind1=\"C\" ind2=\" \">\n    <subfield code=\"a\">ACAD</subfield>\n  </datafield>\n  <datafield tag=\"690\" ind1=\"C\" ind2=\" \">\n    <subfield code=\"a\">CERN</subfield>\n  </datafield>\n  <datafield tag=\"690\" ind1=\"C\" ind2=\" \">\n    <subfield code=\"a\">movingimages</subfield>\n  </datafield>\n  <datafield tag=\"700\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">Calogero, Francesco</subfield>\n    <subfield code=\"e\">speaker</subfield>\n  </datafield>\n  <datafield tag=\"710\" ind1=\" \" ind2=\" \">\n    <subfield 
code=\"5\">TH</subfield>\n  </datafield>\n  <datafield tag=\"852\" ind1=\" \" ind2=\" \">\n    <subfield code=\"c\">CERN Central Library</subfield>\n    <subfield code=\"h\">Acad. Train. 392</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"4\" ind2=\" \">\n    <subfield code=\"u\">http://documents.cern.ch/cgi-bin/setlink?base=cernrep&amp;categ=Yellow_Report&amp;id=2001-004</subfield>\n    <subfield code=\"y\">Fulltext</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"4\" ind2=\" \">\n    <subfield code=\"2\">EOS</subfield>\n    <subfield code=\"q\">MP4</subfield>\n    <subfield code=\"u\">https://cern.ch/digital-memory/media-archive/video/open/mp4/ac_C1V1_CM-A00000339-058.mp4</subfield>\n    <subfield code=\"y\">Video-CM-A00000339-058</subfield>\n    <subfield code=\"z\">mp4 video format (640x360)</subfield>\n    <subfield code=\"w\">(PICTURAE:MD5)CM-A00000339-058.mp4;173e693fc8fc12a92eafb02377803058</subfield>\n    <subfield code=\"x\">Recovered and reviewed by Tom Barthelemy in Novembre 2020</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"4\" ind2=\" \">\n    <subfield code=\"2\">EOS</subfield>\n    <subfield code=\"q\">MP4</subfield>\n    <subfield code=\"u\">https://cern.ch/digital-memory/media-archive/video/open/mp4/ac_C1V1_CM-A00000339-059.mp4</subfield>\n    <subfield code=\"y\">Video-CM-A00000339-059</subfield>\n    <subfield code=\"z\">mp4 video format (640x360)</subfield>\n    <subfield code=\"w\">(PICTURAE:MD5)CM-A00000339-059.mp4;9fd9d955dd6da0230cb7216874e48716</subfield>\n    <subfield code=\"x\">Recovered and reviewed by Tom Barthelemy in Novembre 2020</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"4\" ind2=\" \">\n    <subfield code=\"2\">EOS</subfield>\n    <subfield code=\"q\">MP4</subfield>\n    <subfield code=\"u\">https://cern.ch/digital-memory/media-archive/video/open/mp4/ac_C1V1_CM-A00000339-065.mp4</subfield>\n    <subfield code=\"y\">Video-CM-A00000339-065</subfield>\n    <subfield 
code=\"z\">mp4 video format (640x360)</subfield>\n    <subfield code=\"w\">(PICTURAE:MD5)CM-A00000339-065.mp4;07ed118d45df8808a4e425db525e618f</subfield>\n    <subfield code=\"x\">Recovered and reviewed by Tom Barthelemy in Novembre 2020</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"d\">/mnt/master_share/master_data/2001/412092</subfield>\n    <subfield code=\"x\">Absolute master path</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">/2001/412092/412092_en.vtt</subfield>\n    <subfield code=\"x\">subtitle</subfield>\n    <subfield code=\"y\">subtitle English</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">/2001/412092/412092_fr.vtt</subfield>\n    <subfield code=\"x\">subtitle</subfield>\n    <subfield code=\"y\">subtitle Français</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">https://lecturemedia.cern.ch/2001/412092/real/slides/img006.JPG</subfield>\n    <subfield code=\"x\">pngthumbnail</subfield>\n    <subfield code=\"y\">thumbnail weblecture</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"d\">/mnt/master_share/master_data/2001/412093</subfield>\n    <subfield code=\"x\">Absolute master path</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">/2001/412093/412093_en.vtt</subfield>\n    <subfield code=\"x\">subtitle</subfield>\n    <subfield code=\"y\">subtitle English</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield 
code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">/2001/412093/412093_fr.vtt</subfield>\n    <subfield code=\"x\">subtitle</subfield>\n    <subfield code=\"y\">subtitle Français</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">https://lecturemedia.cern.ch/2001/412093/real/slides/img013.JPG</subfield>\n    <subfield code=\"x\">pngthumbnail</subfield>\n    <subfield code=\"y\">thumbnail weblecture</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"d\">/mnt/master_share/master_data/2001/412094</subfield>\n    <subfield code=\"x\">Absolute master path</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">/2001/412094/412094_en.vtt</subfield>\n    <subfield code=\"x\">subtitle</subfield>\n    <subfield code=\"y\">subtitle English</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">/2001/412094/412094_fr.vtt</subfield>\n    <subfield code=\"x\">subtitle</subfield>\n    <subfield code=\"y\">subtitle Français</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">https://lecturemedia.cern.ch/2001/412094/real/slides/img001.JPG</subfield>\n    <subfield code=\"x\">pngthumbnail</subfield>\n    <subfield code=\"y\">thumbnail weblecture</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">https://lecturemedia.cern.ch/2001/412092/master.mp4</subfield>\n    <subfield code=\"x\">video/mp4</subfield>\n    <subfield code=\"y\">Content: presenter. Resolution: 640x480. 
Baudrate: 210000</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">https://lecturemedia.cern.ch/2001/412093/master.mp4</subfield>\n    <subfield code=\"x\">video/mp4</subfield>\n    <subfield code=\"y\">Content: presenter. Resolution: 160x120. Baudrate: 210000</subfield>\n  </datafield>\n  <datafield tag=\"856\" ind1=\"7\" ind2=\" \">\n    <subfield code=\"2\">MediaArchive</subfield>\n    <subfield code=\"u\">https://lecturemedia.cern.ch/2001/412094/master.mp4</subfield>\n    <subfield code=\"x\">video/mp4</subfield>\n    <subfield code=\"y\">Content: presenter. Resolution: 160x120. Baudrate: 210000</subfield>\n  </datafield>\n  <datafield tag=\"916\" ind1=\" \" ind2=\" \">\n    <subfield code=\"s\">n</subfield>\n    <subfield code=\"w\">200137</subfield>\n    <subfield code=\"y\">a2001</subfield>\n  </datafield>\n  <datafield tag=\"961\" ind1=\" \" ind2=\" \">\n    <subfield code=\"c\">20080429</subfield>\n    <subfield code=\"h\">2023</subfield>\n    <subfield code=\"l\">CER01</subfield>\n    <subfield code=\"x\">20030512</subfield>\n  </datafield>\n  <datafield tag=\"962\" ind1=\" \" ind2=\" \">\n    <subfield code=\"b\">515422</subfield>\n    <subfield code=\"n\">cern20000901</subfield>\n  </datafield>\n  <datafield tag=\"963\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">PUBLIC</subfield>\n  </datafield>\n  <datafield tag=\"964\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">0002</subfield>\n  </datafield>\n  <datafield tag=\"970\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">002243887CER</subfield>\n  </datafield>\n  <datafield tag=\"980\" ind1=\" \" ind2=\" \">\n    <subfield code=\"a\">Indico</subfield>\n    <subfield code=\"b\">ACAD</subfield>\n  </datafield>\n</record>",
+        "json": null,
+        "modification_datetime": "2024-06-26T10:05:23+00:00"
+      }
+    ]
+  }
+]
diff --git a/tests/cds-videos/test_videos_transform_rules.py b/tests/cds-videos/test_videos_transform_rules.py
new file mode 100644
index 00000000..42d43983
--- /dev/null
+++ b/tests/cds-videos/test_videos_transform_rules.py
@@ -0,0 +1,316 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of CDS.
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it
+# under the terms of the MIT License; see LICENSE file for more details.
+
+"""CDS-Videos migration tests."""
+
+from os.path import dirname, join
+
+import pytest
+
+from cds_migrator_kit.errors import (
+    MissingRequiredField,
+    UnexpectedValue,
+)
+from cds_migrator_kit.transform.dumper import CDSRecordDump
+from cds_migrator_kit.videos.weblecture_migration.transform import (
+    videos_migrator_marc21,
+)
+from cds_migrator_kit.videos.weblecture_migration.transform.transform import (
+    CDSToVideosRecordEntry,
+)
+from tests.helpers import add_tag_to_marcxml, load_json, remove_tag_from_marcxml
+
+
+@pytest.fixture()
+def datadir():
+    """Get data directory."""
+    return join(dirname(__file__), "data")
+
+
+def test_transform_rules_reqired_metadata(datadir, base_app):
+    """Test migration rules."""
+    with base_app.app_context():
+        data = load_json(datadir, "lecture.json")
+        dump = CDSRecordDump(data=data[0], dojson_model=videos_migrator_marc21)
+        dump.prepare_revisions()
+        created_date, res = dump.latest_revision
+
+        assert res["legacy_recid"] == 2233152
+        assert res["recid"] == "2233152"
+        assert res["language"] == "en"
+        assert res["contributors"] == [
+            {
+                "name": "Brodski, Michael",
+                "role": "Speaker",
+                "affiliations": ["Rheinisch-Westfaelische Tech. Hoch. (DE)"],
+            },
+            {"name": "Dupont, Niels", "role": "Speaker", "affiliations": ["CERN"]},
+            {"name": "Esposito, William", "role": "Speaker", "affiliations": ["CERN"]},
+            {
+                "name": "Brodski, Michael",
+                "role": "Speaker",
+                "affiliations": ["Rheinisch-Westfaelische Tech. Hoch. (DE)"],
+            },
+            {"name": "Dupont, Niels", "role": "Speaker", "affiliations": ["CERN"]},
+            {"name": "Esposito, William", "role": "Speaker", "affiliations": ["CERN"]},
+        ]
+        assert res["title"] == {
+            "title": "Glimos Instructions for CMS Underground Guiding - in english"
+        }
+        assert "2016-10-24" in res["date"]
+        assert res["description"].startswith(
+            "<!--HTML--><p>In this <strong>presentation in english</strong>"
+        )
+
+
+def test_transform_required_metadata(datadir, base_app):
+    """Test migration transform."""
+    with base_app.app_context():
+        data = load_json(datadir, "lecture.json")
+        dump = CDSRecordDump(data=data[0], dojson_model=videos_migrator_marc21)
+        dump.prepare_revisions()
+        created_date, res = dump.latest_revision
+
+        # Transform record
+        record_entry = CDSToVideosRecordEntry()
+        metadata = record_entry._metadata(res)
+        assert metadata["title"] == {
+            "title": "Glimos Instructions for CMS Underground Guiding - in english"
+        }
+        assert metadata["date"] == "2016-10-24"
+        # Description comes from the 520 tag here, not from the title fallback
+        assert metadata["description"].startswith(
+            "<!--HTML--><p>In this <strong>presentation in english</strong>"
+        )
+        assert metadata["contributors"] == [
+            {
+                "name": "Brodski, Michael",
+                "role": "Speaker",
+                "affiliations": ["Rheinisch-Westfaelische Tech. Hoch. (DE)"],
+            },
+            {"name": "Dupont, Niels", "role": "Speaker", "affiliations": ["CERN"]},
+            {"name": "Esposito, William", "role": "Speaker", "affiliations": ["CERN"]},
+        ]
+        assert metadata["language"] == "en"
+
+
+def test_transform_description(datadir, base_app):
+    """Test that the description field `520` is correctly transformed."""
+    with base_app.app_context():
+        # Load test data
+        data = load_json(datadir, "lecture.json")
+
+        # Remove the 520 tag (description) from MARCXML
+        modified_data = data[0]
+        modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml(
+            modified_data["record"][-1]["marcxml"], "520"
+        )
+
+        dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21)
+        dump.prepare_revisions()
+        _, res = dump.latest_revision
+
+        # Ensure the converted record does not have a description
+        assert "description" not in res
+
+        # Transform record
+        record_entry = CDSToVideosRecordEntry()
+        metadata = record_entry._metadata(res)
+
+        # Ensure description exists and matches the title
+        assert metadata["description"] == metadata["title"]["title"]
+
+
+def test_transform_date(datadir, base_app):
+    """Test that the date field is correctly transformed."""
+    with base_app.app_context():
+        # Load test data
+        data = load_json(datadir, "lecture.json")
+
+        # Test case: Fail due to multiple dates
+        modified_data = data[0]
+        modified_data["record"][-1]["marcxml"] = add_tag_to_marcxml(
+            modified_data["record"][-1]["marcxml"], "518", {"d": "2025-02-06"}
+        )
+        dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21)
+        dump.prepare_revisions()
+        _, res = dump.latest_revision
+
+        # Transform record
+        record_entry = CDSToVideosRecordEntry()
+        with pytest.raises(UnexpectedValue):
+            record_entry._metadata(res)
+
+        # Test case: Fail due to missing dates
+        modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml(
+            modified_data["record"][-1]["marcxml"], "518"
+        )
+        modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml(
+            modified_data["record"][-1]["marcxml"], "269"
+        )
+
+        dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21)
+        dump.prepare_revisions()
+        _, res = dump.latest_revision
+
+        # Transform record
+        with pytest.raises(MissingRequiredField):
+            record_entry._metadata(res)
+
+
+def test_transform_contributor(datadir, base_app):
+    """Test that a record without contributors fails the transform."""
+    with base_app.app_context():
+        # Load test data
+        data = load_json(datadir, "lecture.json")
+
+        # Test case: Fail due to missing contributor
+        modified_data = data[0]
+        modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml(
+            modified_data["record"][-1]["marcxml"], "700"
+        )
+        modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml(
+            modified_data["record"][-1]["marcxml"], "906"
+        )
+
+        dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21)
+        dump.prepare_revisions()
+        _, res = dump.latest_revision
+
+        # Transforming the record should fail (no contributor)
+        record_entry = CDSToVideosRecordEntry()
+        with pytest.raises(MissingRequiredField):
+            record_entry._metadata(res)
+
+
+def test_transform_digitized(datadir, base_app):
+    """Test digitized field is correctly transformed."""
+    with base_app.app_context():
+        # Load test data
+        data = load_json(datadir, "lecture.json")
+
+        # Get digitized record and apply rules
+        entry_data = data[1]
+        dump = CDSRecordDump(data=entry_data, dojson_model=videos_migrator_marc21)
+        dump.prepare_revisions()
+        _, res = dump.latest_revision
+
+        digitized = [
+            item["digitized"] for item in res["url_files"] if "digitized" in item
+        ]
+
+        # Check length
+        assert len(digitized) == 3, f"Expected 3 digitized items, got {len(digitized)}"
+
+        # Check all URLs contain "digital-memory"
+        for item in digitized:
+            assert (
+                "digital-memory" in item["url"]
+            ), f"URL {item['url']} does not contain 'digital-memory'"
+
+        # Transform should fail (no valid date; the record has a date range)
+        record_entry = CDSToVideosRecordEntry()
+        with pytest.raises(MissingRequiredField):
+            record_entry._metadata(res)
+
+
+def test_transform_files(datadir, base_app):
+    """Test files field is correctly transformed."""
+    with base_app.app_context():
+        # Load test data
+        data = load_json(datadir, "lecture.json")
+
+        # Get record and apply rules
+        entry_data = data[1]
+        dump = CDSRecordDump(data=entry_data, dojson_model=videos_migrator_marc21)
+        dump.prepare_revisions()
+        _, res = dump.latest_revision
+
+        # Test master paths
+        master_paths = [
+            item["master_path"] for item in res["files"] if "master_path" in item
+        ]
+        assert (
+            len(master_paths) == 3
+        ), f"Expected 3 master_path items, got {len(master_paths)}"
+        for path in master_paths:
+            assert (
+                "/mnt/master_share" in path
+            ), f"Path {path} does not contain '/mnt/master_share'"
+
+        # Test file paths (excluding URLs)
+        file_paths = [
+            item["path"]
+            for item in res["files"]
+            if "path" in item and "url" not in item
+        ]
+        assert (
+            len(file_paths) == 6
+        ), f"Expected 6 only path items, got {len(file_paths)}"
+        for path in file_paths:
+            assert path.startswith("/"), f"Path {path} does not start with '/'"
+
+        # Test URL files
+        url_files = [item for item in res["files"] if "url" in item]
+        assert len(url_files) == 6, f"Expected 6 URL file items, got {len(url_files)}"
+        for url_file in url_files:
+            assert "url" in url_file, f"Missing 'url' key in item: {url_file}"
+            assert "path" in url_file, f"Missing 'path' key in item: {url_file}"
+            assert (
+                "lecturemedia" in url_file["url"]
+            ), f"URL {url_file['url']} does not contain 'lecturemedia'"
+
+
+def test_transform_internal_note(datadir, base_app):
+    """Test that the internal notes field is correctly transformed."""
+    with base_app.app_context():
+        # Load test data
+        data = load_json(datadir, "lecture.json")
+
+        # Get record and apply rules
+        entry_data = data[1]
+        dump = CDSRecordDump(data=entry_data, dojson_model=videos_migrator_marc21)
+        dump.prepare_revisions()
+        _, res = dump.latest_revision
+
+        # Record has one internal note
+        assert "internal_notes" in res
+        notes = [item for item in res["internal_notes"]]
+        assert notes
+        assert "date" not in notes[0]  # note includes date but it's not valid
+
+        # Transform should fail (no valid date; the record has a date range)
+        record_entry = CDSToVideosRecordEntry()
+        with pytest.raises(MissingRequiredField):
+            record_entry._metadata(res)
+
+        # Test case: Add internal note which has a valid date to record
+        modified_data = data[1]
+        # Remove the current internal note
+        modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml(
+            modified_data["record"][-1]["marcxml"], "500"
+        )
+        # Add new internal note with a valid date
+        modified_data["record"][-1]["marcxml"] = add_tag_to_marcxml(
+            modified_data["record"][-1]["marcxml"], "500", {"a": "Note, 16 Feb 2001"}
+        )
+        dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21)
+        dump.prepare_revisions()
+        _, res = dump.latest_revision
+
+        # Record has one internal note
+        assert "internal_notes" in res
+        notes = [item for item in res["internal_notes"]]
+        assert notes
+        assert "date" in notes[0]  # note has a valid date
+
+        # Transform record without failure (it has a valid date)
+        record_entry = CDSToVideosRecordEntry()
+        metadata = record_entry._metadata(res)
+        assert "date" in metadata
+        assert "2001-02-16" == metadata["date"]
diff --git a/tests/helpers.py b/tests/helpers.py
index b24b67aa..f40f45f8 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -9,6 +9,7 @@
 """Helper functions for usage in tests."""
 
 import json
+import xml.etree.ElementTree as ET
 from os.path import join
 
 
@@ -19,3 +20,45 @@ def load_json(datadir, filename):
     with open(filepath, "r") as file_:
         data = json.load(file_)
     return data
+
+
+def remove_tag_from_marcxml(marcxml, tag):
+    """
+    Removes a specific MARCXML datafield tag to manipulate the record.
+
+    :param marcxml: The MARCXML string.
+    :param tag: The MARC tag (e.g., "520") to remove.
+    :return: Modified MARCXML string with the specified tag removed.
+    """
+    root = ET.fromstring(marcxml)
+
+    # Find and remove all <datafield> elements with the specified tag
+    for datafield in root.findall(f".//datafield[@tag='{tag}']"):
+        root.remove(datafield)
+
+    return ET.tostring(root, encoding="unicode")
+
+
+def add_tag_to_marcxml(marcxml, tag, subfields):
+    """
+    Adds a MARCXML datafield tag to manipulate the record.
+
+    :param marcxml: The MARCXML string.
+    :param tag: The MARC tag (e.g., "999") to add; both indicators are set to blank.
+    :param subfields: Dictionary of subfields (e.g., {"a": "New Description"}).
+    :return: Modified MARCXML string with the new tag added.
+    """
+    root = ET.fromstring(marcxml)
+
+    # Create new datafield element
+    new_datafield = ET.Element("datafield", tag=tag, ind1=" ", ind2=" ")
+
+    # Add subfields
+    for code, value in subfields.items():
+        subfield = ET.SubElement(new_datafield, "subfield", code=code)
+        subfield.text = value
+
+    # Append the new datafield
+    root.append(new_datafield)
+
+    return ET.tostring(root, encoding="unicode")

From 86825b2136a8d00143de81cb296d729757aff573 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= <zubeydeecivelek@gmail.com>
Date: Wed, 12 Feb 2025 17:03:54 +0100
Subject: [PATCH 2/4] setup: separate installation for rdm and videos

---
 .github/workflows/tests.yml                 | 63 ++++++++++++----
 README.rst                                  | 79 ++++++++++++++++++---
 cds_migrator_kit/base_config.py             | 26 +++++++
 cds_migrator_kit/base_minter.py             | 25 +++++++
 cds_migrator_kit/cli.py                     | 28 ++++++++
 cds_migrator_kit/import_utils.py            | 18 +++++
 cds_migrator_kit/rdm/migration_config.py    |  3 -
 cds_migrator_kit/videos/migration_config.py | 49 +++++++++++++
 run-tests.sh                                |  6 ++
 setup.cfg                                   | 21 +++---
 10 files changed, 284 insertions(+), 34 deletions(-)
 create mode 100644 cds_migrator_kit/base_config.py
 create mode 100644 cds_migrator_kit/base_minter.py
 create mode 100644 cds_migrator_kit/cli.py
 create mode 100644 cds_migrator_kit/import_utils.py
 create mode 100644 cds_migrator_kit/videos/migration_config.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9f8d286d..2590f78d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -14,34 +14,34 @@ on:
     branches: master
   schedule:
     # * is a special character in YAML so you have to quote this string
-    - cron:  '0 3 * * 6'
+    - cron: "0 3 * * 6"
   workflow_dispatch:
     inputs:
       reason:
-        description: 'Reason'
+        description: "Reason"
         required: false
-        default: 'Manual trigger'
+        default: "Manual trigger"
 
 jobs:
-  Tests:
+  RDMTests:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-          python-version: [3.9]
-          requirements-level: [pypi]
-          db-service: [postgresql14]
-          include:
+        python-version: [3.9]
+        requirements-level: [pypi]
+        db-service: [postgresql14]
+        include:
           - db-service: postgresql14
             DB_EXTRAS: "postgresql"
 
     env:
       DB: ${{ matrix.db-service }}
-      EXTRAS: tests
+      EXTRAS: rdm,tests
     steps:
       - name: Install python-ldap dependencies
         run: |
-            sudo apt-get update
-            sudo apt-get install libsasl2-dev python-dev libldap2-dev libssl-dev
+          sudo apt-get update
+          sudo apt-get install libsasl2-dev python-dev libldap2-dev libssl-dev
 
       - name: Checkout
         uses: actions/checkout@v4
@@ -61,4 +61,43 @@ jobs:
           docker compose --version
 
       - name: Run tests
-        run: ./run-tests.sh
+        run: ./run-tests.sh rdm
+  VideosTests:
+    runs-on: ubuntu-20.04
+    strategy:
+      matrix:
+        python-version: [3.9]
+        requirements-level: [pypi]
+        db-service: [postgresql14]
+        include:
+          - db-service: postgresql14
+            DB_EXTRAS: "postgresql"
+
+    env:
+      DB: ${{ matrix.db-service }}
+      EXTRAS: videos,tests
+    steps:
+      - name: Install python-ldap dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install libsasl2-dev python-dev libldap2-dev libssl-dev
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+          cache-dependency-path: setup.cfg
+
+      - name: Install dependencies
+        run: |
+          pip install ".[$EXTRAS]"
+          pip freeze
+          docker --version
+          docker compose --version
+
+      - name: Run tests
+        run: ./run-tests.sh videos
diff --git a/README.rst b/README.rst
index 4305c175..504b2479 100644
--- a/README.rst
+++ b/README.rst
@@ -7,22 +7,81 @@
  cds-migrator-kit
 ==================
 
+Installation
+============
 
-TODO change here:
+Default Installation (without RDM or Videos)
+---------------------------------------------
+To install the package without RDM or videos, run:
 
+.. code-block:: bash
 
-Default Installation (without RDM or Videos)
-pip install .
+    pip install .
+
+Installation for RDM
+----------------------
+To install the package with RDM, run:
+
+.. code-block:: bash
+
+    pip install ".[rdm]"
+
+To see available RDM commands, run:
+
+.. code-block:: bash
+
+    invenio migration --help
+
+Installation for Videos
+-----------------------
+To install the package with cds-videos, run:
+
+.. code-block:: bash
+
+    pip install ".[videos]"
+
+To see available videos commands, run:
+
+.. code-block:: bash
 
-Install for RDM
+    invenio migration videos --help
 
-pip install .[rdm]
+Running Tests Locally
+=====================
 
-Install for Videos
+For RDM
+--------
+Install rdm and test dependencies:
+
+.. code-block:: bash
+
+    pip install ".[rdm,tests]"
+
+
+Run the tests, ignoring the `cds-videos` tests:
+
+.. code-block:: bash
+
+    ./run-tests.sh rdm
+
+For Videos
+----------
+Install videos and test dependencies:
+
+.. code-block:: bash
+
+    pip install ".[videos,tests]"
+
+Run the video tests:
+
+.. code-block:: bash
+
+    ./run-tests.sh videos
 
-pip install .[videos]
 
 To run the interface:
-```
-gunicorn -b :8080 --timeout 120 --graceful-timeout 60 cds_migrator_kit.app:app
-```
+=====================
+.. code-block:: bash
+    
+    gunicorn -b :8080 --timeout 120 --graceful-timeout 60 cds_migrator_kit.app:app
+
diff --git a/cds_migrator_kit/base_config.py b/cds_migrator_kit/base_config.py
new file mode 100644
index 00000000..2e7f4e7e
--- /dev/null
+++ b/cds_migrator_kit/base_config.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+"""Migration configuration for CDS Migrator Kit."""
+
+from cds_migrator_kit.import_utils import import_module
+
+selected_config = None
+
+# Check for `rdm` dependencies
+if import_module("cds_rdm.__init__"):
+    from cds_migrator_kit.rdm import migration_config as selected_config
+
+# Check for `videos` dependencies
+elif import_module("cds.version"):
+    from cds_migrator_kit.videos import migration_config as selected_config
+
+# If no valid module is found, use default one
+if selected_config is None:
+    from cds_migrator_kit import config as selected_config
+
+# Copy the selected config module's attributes into this module
+globals().update(vars(selected_config))
diff --git a/cds_migrator_kit/base_minter.py b/cds_migrator_kit/base_minter.py
new file mode 100644
index 00000000..0b18fc33
--- /dev/null
+++ b/cds_migrator_kit/base_minter.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+"""Minter configuration for CDS Migrator Kit."""
+
+import importlib
+import warnings
+
+# Default: No minter
+selected_minter = None
+
+# Check if `rdm` is installed and set the minter
+try:
+    importlib.import_module("cds_rdm.__init__")
+    from cds_rdm.minters import legacy_recid_minter as selected_minter
+except ImportError:
+    warnings.warn(
+        "No valid PID minter found. Ensure `rdm` is installed.", RuntimeWarning
+    )
+
+# Expose the minter function
+legacy = selected_minter
diff --git a/cds_migrator_kit/cli.py b/cds_migrator_kit/cli.py
new file mode 100644
index 00000000..0be7dc8d
--- /dev/null
+++ b/cds_migrator_kit/cli.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+"""cds-migrator-kit command line module."""
+
+import click
+
+from cds_migrator_kit.import_utils import import_module
+
+
+@click.group()
+def cli():
+    """Base CLI command that loads the subcommands."""
+    pass
+
+
+# Check for `rdm` dependencies
+if import_module("cds_rdm.__init__"):
+    from cds_migrator_kit.rdm.cli import migration
+    cli = migration
+
+# Check for `videos` dependencies
+if import_module("cds.version"):
+    from cds_migrator_kit.videos.weblecture_migration.cli import videos
+    cli.add_command(videos, "videos")
diff --git a/cds_migrator_kit/import_utils.py b/cds_migrator_kit/import_utils.py
new file mode 100644
index 00000000..1ec5e870
--- /dev/null
+++ b/cds_migrator_kit/import_utils.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+"""Utility function for dynamically checking module availability."""
+
+import importlib
+
+
+def import_module(module_name):
+    """Try to import a module, return True if successful, otherwise False."""
+    try:
+        importlib.import_module(module_name)
+        return True
+    except ImportError:
+        return False
\ No newline at end of file
diff --git a/cds_migrator_kit/rdm/migration_config.py b/cds_migrator_kit/rdm/migration_config.py
index 240d7bbb..e896235a 100644
--- a/cds_migrator_kit/rdm/migration_config.py
+++ b/cds_migrator_kit/rdm/migration_config.py
@@ -368,9 +368,6 @@ def _(x):  # needed to avoid start time failure with lazy strings
 CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir
 
 CDS_MIGRATOR_KIT_STREAM_CONFIG = "cds_migrator_kit/rdm/streams.yaml"
-CDS_MIGRATOR_KIT_VIDEOS_STREAM_CONFIG = (
-    "cds_migrator_kit/videos/weblecture_migration/streams.yaml"
-)
 
 RDM_RECORDS_IDENTIFIERS_SCHEMES = {
     **RDM_RECORDS_IDENTIFIERS_SCHEMES,
diff --git a/cds_migrator_kit/videos/migration_config.py b/cds_migrator_kit/videos/migration_config.py
new file mode 100644
index 00000000..b4e09447
--- /dev/null
+++ b/cds_migrator_kit/videos/migration_config.py
@@ -0,0 +1,49 @@
+"""CDS-Videos settings for CDS-Videos project."""
+
+import json
+import os
+from datetime import datetime, timedelta
+
+
+def _(x):  # needed to avoid start time failure with lazy strings
+    return x
+
+
+# Since HAProxy and Nginx route all requests no matter the host header
+# provided, the allowed hosts variable is set to localhost. In production it
+# should be set to the correct host and it is strongly recommended to only
+# route correct hosts to the application.
+APP_ALLOWED_HOSTS = ["0.0.0.0", "localhost", "127.0.0.1", "localhost.cern.ch"]
+
+SQLALCHEMY_DATABASE_URI = (
+    "postgresql+psycopg2://cds-videos:cds-videos@localhost/cds-videos"
+)
+
+# SECURITY WARNING: keep the secret key used in production secret!
+# Do not commit it to a source code repository.
+# TODO: Set
+SECRET_KEY = "CHANGE_ME"
+
+# TODO: Set with your own hostname when deploying to production
+SITE_UI_URL = "https://127.0.0.1"
+
+SITE_API_URL = "https://127.0.0.1/api"
+
+
+DATACITE_ENABLED = True
+DATACITE_USERNAME = ""
+DATACITE_PASSWORD = ""
+DATACITE_PREFIX = "10.17181"
+DATACITE_TEST_MODE = True
+DATACITE_DATACENTER_SYMBOL = ""
+
+import cds_migrator_kit
+
+base_path = os.path.dirname(os.path.realpath(cds_migrator_kit.__file__))
+logs_dir = os.path.join(base_path, "tmp/logs/")
+CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir
+CDS_MIGRATOR_KIT_VIDEOS_STREAM_CONFIG = (
+    "cds_migrator_kit/videos/weblecture_migration/streams.yaml"
+)
+
+### CDS MIGRATOR #################################
diff --git a/run-tests.sh b/run-tests.sh
index 02b13eb7..fa5ae42f 100755
--- a/run-tests.sh
+++ b/run-tests.sh
@@ -39,6 +39,12 @@ for arg in $@; do
 		-K|--keep-services)
 			keep_services=1
 			;;
+		rdm)
+            pytest_args+=( "tests/cds-rdm" "tests/test_cds_migrator_kit.py" )
+            ;;
+        videos)
+            pytest_args+=( "tests/cds-videos" "tests/test_cds_migrator_kit.py" )
+            ;;
 		*)
 			pytest_args+=( ${arg} )
 			;;
diff --git a/setup.cfg b/setup.cfg
index dcc1562d..046a50c7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -15,9 +15,7 @@ packages = find:
 python_requires = >=3.9
 zip_safe = False
 install_requires =
-    invenio-app-rdm[opensearch2]>=13.0.0b1.dev4
     sentry-sdk>=1.45,<2.0.0
-    cds-rdm @ git+https://github.com/CERNDocumentServer/cds-rdm#egg=cds-rdm&subdirectory=site
     cds-dojson>=0.12.0
     invenio-rdm-migrator>=5.0.0
     lxml>=4.6.5
@@ -28,14 +26,20 @@ install_requires =
     flask-mail>=0.9.0,<0.10.0
     fuzzywuzzy>=0.18.0
     python-Levenshtein>=0.25.1
-    invenio-preservation-sync==0.1.0
-    invenio-cern-sync @ git+https://github.com/cerndocumentserver/invenio-cern-sync@v0.1.2#egg=invenio-cern-sync
-
 
     # needed to run the server
     gunicorn
 
 [options.extras_require]
+rdm = 
+    invenio-app-rdm[opensearch2]>=13.0.0b1.dev4
+    cds-rdm @ git+https://github.com/CERNDocumentServer/cds-rdm#egg=cds-rdm&subdirectory=site
+    invenio-preservation-sync==0.1.0
+    invenio-cern-sync @ git+https://github.com/cerndocumentserver/invenio-cern-sync@v0.1.2#egg=invenio-cern-sync
+
+videos = 
+    cds @ git+https://github.com/CERNDocumentServer/cds-videos#egg=cds
+        
 tests =
     pytest-black>=0.3.0
     pytest-invenio>=2.1.0,<3.0.0
@@ -46,12 +50,11 @@ tests =
 console_scripts =
     migrator = invenio_app.cli:cli
 flask.commands =
-    migration = cds_migrator_kit.rdm.cli:migration
-    videos = cds_migrator_kit.videos.weblecture_migration.cli:videos
+    migration = cds_migrator_kit.cli:cli
 invenio_base.apps =
     cds_migrator_kit = cds_migrator_kit:CdsMigratorKit
 invenio_config.module =
-    invenio_app_rdm = cds_migrator_kit.rdm.migration_config
+    invenio_app_rdm = cds_migrator_kit.base_config
 invenio_base.blueprints =
     cds_migrator_kit_views = cds_migrator_kit.reports.views:blueprint
 cds_migrator_kit.migrator.affiliations.model =
@@ -81,7 +84,7 @@ cds_migrator_kit.migrator.rules.thesis =
 cds_migrator_kit.migrator.rules.people =
     people = cds_migrator_kit.rdm.users.transform.xml_processing.rules.people
 invenio_pidstore.minters =
-    legacy = cds_rdm.minters:legacy_recid_minter
+    legacy = cds_migrator_kit.base_minter:legacy
 # videos migration
 cds_migrator_kit.videos.models =
     video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.models.video_lecture:model

From e3e77effe21c40227dc88b910557beb0c22e07f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= <zubeydeecivelek@gmail.com>
Date: Mon, 17 Feb 2025 17:48:14 +0100
Subject: [PATCH 3/4] videos: improve transform rules

---
 .../transform/transform.py                    | 43 ++++++++++++-------
 .../xml_processing/quality/contributors.py    | 14 +++---
 .../xml_processing/rules/video_lecture.py     | 22 ++++------
 3 files changed, 45 insertions(+), 34 deletions(-)

diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py
index dfbee8ad..947aaae5 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py
@@ -96,43 +96,56 @@ def _media_files(self, entry):
     def _metadata(self, entry):
         """Transform the metadata of a record."""
 
-        def extract_dates(json_data, key, subkey=None):
-            """Extracts date values from a given key in json_data."""
+        def guess_dates(json_data, key, subkey=None):
+            """Try to get `date` from other fields.
+            
+            ### Examples:
+            1. 8564 tag: may include digitized file information, indico information (link, date) or any url file
+                json_data = {"url_files": [{"indico": {"url": "http://agenda.cern.ch/..", "date": "2002-03-18"}}], ...}
+                Calling the method with `key="url_files", subkey="indico"`
+                returns every available:
+                json_data["url_files"][i]["indico"]["date"]
+                
+            2. 500__ tag: internal notes that may contain date information
+                json_data = {"internal_notes": [{"note": "note, 1 Jun 2025", "date": "2025-06-01"}], ...}
+                Calling the method with `key="internal_notes"`
+                returns every available:
+                json_data["internal_notes"][i]["date"]
+            
+            ### Returns:
+            - `set[str]`: A set of date strings.
+            """
             items = json_data.get(key, [])
             if subkey:
                 return {
                     item[subkey]["date"]
                     for item in items
-                    if isinstance(item, dict)
-                    and subkey in item
-                    and isinstance(item[subkey], dict)
+                    if subkey in item
                     and "date" in item[subkey]
                 }
 
             return {
                 item["date"]
                 for item in items
-                if isinstance(item, dict) and "date" in item
+                if "date" in item
             }
 
         def reformat_date(json_data):
             """Reformat the date for the cds-videos data model."""
-            # 1. Check primary date field
+            # Check primary date field
             dates_set = {date for date in json_data.get("date", []) if date}
 
-            # 2. If no date found, check `indico_links`
-            if not dates_set:
-                dates_set = extract_dates(json_data, "url_files", subkey="indico")
-
-            # 3. If still no date found, check `internal_notes`
+            # If no date found, check the `indico` entries of `url_files` and `internal_notes`
             if not dates_set:
-                dates_set = extract_dates(json_data, "internal_notes")
+                indico_dates = guess_dates(json_data, "url_files", subkey="indico")
+                note_dates = guess_dates(json_data, "internal_notes")
+                dates_set.update(indico_dates, note_dates)
 
-            # 4. Return the valid date if only one is found
+            # Return the valid date if only one is found
             if len(dates_set) == 1:
                 return next(iter(dates_set))
 
-            # 5. Multiple dates (Must have different indico event videos?)
+            # Multiple dates (Must have different indico event videos?)
             if len(dates_set) > 1:
                 raise UnexpectedValue(
                     f"More than one date found in record: {json_data.get('recid')} dates: {dates_set}.",
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py
index f9fcd76a..cfcf7156 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py
@@ -38,7 +38,7 @@ def get_contributor_role(subfield, role, raise_unexpected=False):
     return translations[clean_role]
 
 
-def get_contributor(key, value):
+def get_contributor(key, value, contributor_role="", name=""):
     """Create contributor json for tag 518 and 269."""
     beard = value.get("9")
     if beard is not None and beard != "#BEARD#":
@@ -46,13 +46,15 @@ def get_contributor(key, value):
         # historically it was some kind of automatic script tagging
         # and it should be ignored if value == #BEARD#
         raise UnexpectedValue(field=key, subfield="9", value=beard)
-    name = value.get("a").strip()
+    if not name:
+        name = value.get("a").strip()
     affiliation = value.get("u", "")
-    role = value.get("e", "")
     contributor = {"name": name}
-    if role:
-        role = get_contributor_role("e", role)
-        contributor.update({"role": role})
     if affiliation:
         contributor.update({"affiliations": [affiliation]})
+    if contributor_role:
+        contributor.update({"role": contributor_role})
+    elif value.get("e", ""):
+        role = get_contributor_role("e", value.get("e", ""))
+        contributor.update({"role": role})
     return contributor
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py
index dd807548..febf9dc8 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py
@@ -26,6 +26,9 @@
     for_each_value,
     require,
 )
+from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.contributors import (
+    get_contributor,
+)
 
 # ATTENTION when COPYING! important which model you use as decorator
 from ...models.video_lecture import model
@@ -92,14 +95,14 @@ def imprint(self, key, value):
 @for_each_value
 @require(["a"])
 def performer(self, key, value):
-    """Translates performer."""
-    name = value.get("a").strip()
+    """Translates performer into a contributor with the "Performer" role."""
     role = value.get("e")
-    contributor = {"name": name, "role": "Performer"}  # TODO or "Participant"
-    affiliation = value.get("u", "")
-    if affiliation:
-        contributor.update({"affiliations": [affiliation]})
-    return contributor
+    if role and role.strip().lower() != "speaker":
+        # Only "speaker" is expected in subfield e; flag any other stored role.
+        raise UnexpectedValue(
+            "Different role found", field=key, subfield="e", value=role
+        )
+    return get_contributor(key, value, contributor_role="Performer")
 
 
 @model.over("contributors", "^906__")
@@ -107,12 +110,9 @@ def performer(self, key, value):
 @require(["p"])
 def event_speakers(self, key, value):
-    """Translates event_speakers."""
-    name = value.get("p").strip()
-    contributor = {"name": name, "role": "Speaker"}
-    affiliation = value.get("u", "")
-    if affiliation:
-        contributor.update({"affiliations": [affiliation]})
-    return contributor
+    """Translates event speakers (subfield p) into "Speaker" contributors."""
+    return get_contributor(
+        key, value, contributor_role="Speaker", name=value.get("p").strip()
+    )
 
 
 @model.over("url_files", "^8564_")

From 21e0ba24108aaef671304297bc8b6fe3564971e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= <zubeydeecivelek@gmail.com>
Date: Mon, 17 Feb 2025 17:51:36 +0100
Subject: [PATCH 4/4] global: runner module, separated tests for rdm/videos

---
 cds_migrator_kit/base_minter.py                     | 8 ++++----
 cds_migrator_kit/extract/__init__.py                | 2 +-
 cds_migrator_kit/rdm/cli.py                         | 2 +-
 cds_migrator_kit/runner/__init__.py                 | 8 ++++++++
 cds_migrator_kit/{rdm => runner}/runner.py          | 4 ++--
 cds_migrator_kit/videos/weblecture_migration/cli.py | 2 +-
 tests/{ => cds-rdm}/data/all_fields.json            | 0
 tests/{ => cds-rdm}/data/summer_note.json           | 0
 tests/cds-rdm/test_full_migration.py                | 4 ++--
 tests/{ => cds-rdm}/test_json_translation_rules.py  | 0
 tests/{ => cds-videos}/conftest.py                  | 0
 11 files changed, 19 insertions(+), 11 deletions(-)
 create mode 100644 cds_migrator_kit/runner/__init__.py
 rename cds_migrator_kit/{rdm => runner}/runner.py (97%)
 rename tests/{ => cds-rdm}/data/all_fields.json (100%)
 rename tests/{ => cds-rdm}/data/summer_note.json (100%)
 rename tests/{ => cds-rdm}/test_json_translation_rules.py (100%)
 rename tests/{ => cds-videos}/conftest.py (100%)

diff --git a/cds_migrator_kit/base_minter.py b/cds_migrator_kit/base_minter.py
index 0b18fc33..49ee1c3f 100644
--- a/cds_migrator_kit/base_minter.py
+++ b/cds_migrator_kit/base_minter.py
@@ -6,17 +6,17 @@
 # the terms of the MIT License; see LICENSE file for more details.
 """Minter configuration for CDS Migrator Kit."""
 
-import importlib
 import warnings
 
+from cds_migrator_kit.import_utils import import_module
+
 # Default: No minter
 selected_minter = None
 
 # Check if `rdm` is installed and set the minter
-try:
-    importlib.import_module("cds_rdm.__init__")
+if import_module("cds_rdm.__init__"):
     from cds_rdm.minters import legacy_recid_minter as selected_minter
-except ImportError:
+else:
     warnings.warn(
         "No valid PID minter found. Ensure `rdm` is installed.", RuntimeWarning
     )
diff --git a/cds_migrator_kit/extract/__init__.py b/cds_migrator_kit/extract/__init__.py
index 20f79dd5..39a24dc5 100644
--- a/cds_migrator_kit/extract/__init__.py
+++ b/cds_migrator_kit/extract/__init__.py
@@ -2,7 +2,7 @@
 #
 # Copyright (C) 2025 CERN.
 #
-# CDS-Videos is free software; you can redistribute it and/or modify it under
+# cds-migrator-kit is free software; you can redistribute it and/or modify it under
 # the terms of the MIT License; see LICENSE file for more details.
 
 """Extract module."""
diff --git a/cds_migrator_kit/rdm/cli.py b/cds_migrator_kit/rdm/cli.py
index e7548e2c..b66f0416 100644
--- a/cds_migrator_kit/rdm/cli.py
+++ b/cds_migrator_kit/rdm/cli.py
@@ -19,7 +19,6 @@
 from cds_migrator_kit.rdm.records.streams import (  # UserStreamDefinition,
     RecordStreamDefinition,
 )
-from cds_migrator_kit.rdm.runner import Runner
 from cds_migrator_kit.rdm.stats.runner import RecordStatsRunner
 from cds_migrator_kit.rdm.stats.streams import RecordStatsStreamDefinition
 from cds_migrator_kit.rdm.users.runner import PeopleAuthorityRunner, SubmitterRunner
@@ -30,6 +29,7 @@
 from cds_migrator_kit.rdm.users.transform.xml_processing.models.people import (
     PeopleAuthority,
 )
+from cds_migrator_kit.runner.runner import Runner
 
 cli_logger = logging.getLogger("migrator")
 
diff --git a/cds_migrator_kit/runner/__init__.py b/cds_migrator_kit/runner/__init__.py
new file mode 100644
index 00000000..e22f5897
--- /dev/null
+++ b/cds_migrator_kit/runner/__init__.py
@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+
+"""Runner module."""
diff --git a/cds_migrator_kit/rdm/runner.py b/cds_migrator_kit/runner/runner.py
similarity index 97%
rename from cds_migrator_kit/rdm/runner.py
rename to cds_migrator_kit/runner/runner.py
index 73b0d5b2..c63798ae 100644
--- a/cds_migrator_kit/rdm/runner.py
+++ b/cds_migrator_kit/runner/runner.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2022-2025 CERN.
 #
-# Invenio-RDM-Migrator is free software; you can redistribute it and/or modify
+# cds-migrator-kit is free software; you can redistribute it and/or modify
 # it under the terms of the MIT License; see LICENSE file for more details.
 
 """InvenioRDM migration streams runner."""
diff --git a/cds_migrator_kit/videos/weblecture_migration/cli.py b/cds_migrator_kit/videos/weblecture_migration/cli.py
index 64d2f450..981f8fcb 100644
--- a/cds_migrator_kit/videos/weblecture_migration/cli.py
+++ b/cds_migrator_kit/videos/weblecture_migration/cli.py
@@ -13,7 +13,7 @@
 from flask import current_app
 from flask.cli import with_appcontext
 
-from cds_migrator_kit.rdm.runner import Runner
+from cds_migrator_kit.runner.runner import Runner
 from cds_migrator_kit.videos.weblecture_migration.streams import RecordStreamDefinition
 
 cli_logger = logging.getLogger("migrator")
diff --git a/tests/data/all_fields.json b/tests/cds-rdm/data/all_fields.json
similarity index 100%
rename from tests/data/all_fields.json
rename to tests/cds-rdm/data/all_fields.json
diff --git a/tests/data/summer_note.json b/tests/cds-rdm/data/summer_note.json
similarity index 100%
rename from tests/data/summer_note.json
rename to tests/cds-rdm/data/summer_note.json
diff --git a/tests/cds-rdm/test_full_migration.py b/tests/cds-rdm/test_full_migration.py
index b924e8ed..9468818c 100644
--- a/tests/cds-rdm/test_full_migration.py
+++ b/tests/cds-rdm/test_full_migration.py
@@ -25,9 +25,9 @@
 from invenio_vocabularies.contrib.names.models import NamesMetadata
 
 from cds_migrator_kit.rdm.records.streams import RecordStreamDefinition
-from cds_migrator_kit.rdm.runner import Runner
 from cds_migrator_kit.rdm.users.runner import SubmitterRunner
 from cds_migrator_kit.rdm.users.streams import SubmitterStreamDefinition
+from cds_migrator_kit.runner.runner import Runner
 
 
 def suite_multi_field(record):
@@ -356,7 +356,7 @@ def test_full_migration_stream(
     Name.index.refresh()
 
     mocker.patch(
-        "cds_migrator_kit.rdm.runner.Runner._read_config",
+        "cds_migrator_kit.runner.runner.Runner._read_config",
         return_value={
             "db_uri": "postgresql://cds-rdm-migration:cds-rdm-migration@localhost:5432/cds-rdm-migration",
             "records": {
diff --git a/tests/test_json_translation_rules.py b/tests/cds-rdm/test_json_translation_rules.py
similarity index 100%
rename from tests/test_json_translation_rules.py
rename to tests/cds-rdm/test_json_translation_rules.py
diff --git a/tests/conftest.py b/tests/cds-videos/conftest.py
similarity index 100%
rename from tests/conftest.py
rename to tests/cds-videos/conftest.py