diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9f8d286d..2590f78d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -14,34 +14,34 @@ on:
branches: master
schedule:
# * is a special character in YAML so you have to quote this string
- - cron: '0 3 * * 6'
+ - cron: "0 3 * * 6"
workflow_dispatch:
inputs:
reason:
- description: 'Reason'
+ description: "Reason"
required: false
- default: 'Manual trigger'
+ default: "Manual trigger"
jobs:
- Tests:
+ RDMTests:
runs-on: ubuntu-20.04
strategy:
matrix:
- python-version: [3.9]
- requirements-level: [pypi]
- db-service: [postgresql14]
- include:
+ python-version: ["3.9"] # quoted: an unquoted 3.10 would be parsed as the float 3.1
+ requirements-level: [pypi]
+ db-service: [postgresql14]
+ include:
- db-service: postgresql14
DB_EXTRAS: "postgresql"
env:
DB: ${{ matrix.db-service }}
- EXTRAS: tests
+ EXTRAS: rdm,tests
steps:
- name: Install python-ldap dependencies
run: |
- sudo apt-get update
- sudo apt-get install libsasl2-dev python-dev libldap2-dev libssl-dev
+ sudo apt-get update
+ sudo apt-get install libsasl2-dev python-dev libldap2-dev libssl-dev
- name: Checkout
uses: actions/checkout@v4
@@ -61,4 +61,43 @@ jobs:
docker compose --version
- name: Run tests
- run: ./run-tests.sh
+ run: ./run-tests.sh rdm
+ VideosTests:
+ runs-on: ubuntu-20.04
+ strategy:
+ matrix:
+ python-version: ["3.9"] # quoted: an unquoted 3.10 would be parsed as the float 3.1
+ requirements-level: [pypi]
+ db-service: [postgresql14]
+ include:
+ - db-service: postgresql14
+ DB_EXTRAS: "postgresql"
+
+ env:
+ DB: ${{ matrix.db-service }}
+ EXTRAS: videos,tests
+ steps:
+ - name: Install python-ldap dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y libsasl2-dev python-dev libldap2-dev libssl-dev
+
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: pip
+ cache-dependency-path: setup.cfg
+
+ - name: Install dependencies
+ run: |
+ pip install ".[$EXTRAS]"
+ pip freeze
+ docker --version
+ docker compose --version
+
+ - name: Run tests
+ run: ./run-tests.sh videos
diff --git a/README.rst b/README.rst
index df521510..504b2479 100644
--- a/README.rst
+++ b/README.rst
@@ -7,7 +7,81 @@
cds-migrator-kit
==================
+Installation
+============
+
+Default Installation (without RDM or Videos)
+---------------------------------------------
+To install the package without RDM or videos, run:
+
+.. code-block:: bash
+
+ pip install .
+
+Installation for RDM
+----------------------
+To install the package with RDM, run:
+
+.. code-block:: bash
+
+ pip install ".[rdm]"
+
+To see available RDM commands, run:
+
+.. code-block:: bash
+
+ invenio migration --help
+
+Installation for Videos
+-----------------------
+To install the package with cds-videos, run:
+
+.. code-block:: bash
+
+ pip install ".[videos]"
+
+To see available videos commands, run:
+
+.. code-block:: bash
+
+ invenio migration videos --help
+
+Running Tests Locally
+=====================
+
+For RDM
+--------
+Install rdm and test dependencies:
+
+.. code-block:: bash
+
+ pip install ".[rdm,tests]"
+
+
+Run the RDM tests, ignoring the ``cds-videos`` tests:
+
+.. code-block:: bash
+
+ ./run-tests.sh rdm
+
+For Videos
+----------
+Install videos and test dependencies:
+
+.. code-block:: bash
+
+ pip install ".[videos,tests]"
+
+Run the video tests:
+
+.. code-block:: bash
+
+ ./run-tests.sh videos
+
+
To run the interface:
-```
-gunicorn -b :8080 --timeout 120 --graceful-timeout 60 cds_migrator_kit.app:app
-```
+=====================
+.. code-block:: bash
+
+ gunicorn -b :8080 --timeout 120 --graceful-timeout 60 cds_migrator_kit.app:app
+
diff --git a/cds_migrator_kit/base_config.py b/cds_migrator_kit/base_config.py
new file mode 100644
index 00000000..2e7f4e7e
--- /dev/null
+++ b/cds_migrator_kit/base_config.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+"""Migration configuration for CDS Migrator Kit."""
+
+from cds_migrator_kit.import_utils import import_module
+
+selected_config = None
+
+# Check for `rdm` dependencies
+if import_module("cds_rdm.__init__"):
+ from cds_migrator_kit.rdm import migration_config as selected_config
+
+# Otherwise check for `videos` dependencies (`rdm` wins when both are installed)
+elif import_module("cds.version"):
+ from cds_migrator_kit.videos import migration_config as selected_config
+
+# If no valid module is found, use default one
+if selected_config is None:
+ from cds_migrator_kit import config as selected_config
+
+# Re-export the selected settings; skip dunders so this module keeps its own __name__/__spec__
+globals().update({k: v for k, v in vars(selected_config).items() if k[:2] != "__"})
diff --git a/cds_migrator_kit/base_minter.py b/cds_migrator_kit/base_minter.py
new file mode 100644
index 00000000..49ee1c3f
--- /dev/null
+++ b/cds_migrator_kit/base_minter.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+"""Minter configuration for CDS Migrator Kit."""
+
+import warnings
+
+from cds_migrator_kit.import_utils import import_module
+
+# Default: No minter
+selected_minter = None
+
+# Use the RDM legacy recid minter when `rdm` is importable; otherwise warn at import time
+if import_module("cds_rdm.__init__"):
+ from cds_rdm.minters import legacy_recid_minter as selected_minter
+else:
+ warnings.warn(
+ "No valid PID minter found. Ensure `rdm` is installed.", RuntimeWarning
+ )
+
+# Expose the minter as `legacy`; it stays None when `rdm` is not installed
+legacy = selected_minter
diff --git a/cds_migrator_kit/cli.py b/cds_migrator_kit/cli.py
new file mode 100644
index 00000000..0be7dc8d
--- /dev/null
+++ b/cds_migrator_kit/cli.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+"""cds-migrator-kit command line module."""
+
+import click
+
+from cds_migrator_kit.import_utils import import_module
+
+
+@click.group()
+def cli():
+ """Base CLI command that loads the subcommands."""
+ pass
+
+
+# When `rdm` is importable, its `migration` command replaces the empty base `cli` group
+if import_module("cds_rdm.__init__"):
+ from cds_migrator_kit.rdm.cli import migration
+ cli = migration
+
+# When `videos` is importable, register its `videos` command on whichever `cli` is active
+if import_module("cds.version"):
+ from cds_migrator_kit.videos.weblecture_migration.cli import videos
+ cli.add_command(videos, "videos")
diff --git a/cds_migrator_kit/extract/__init__.py b/cds_migrator_kit/extract/__init__.py
index 20f79dd5..39a24dc5 100644
--- a/cds_migrator_kit/extract/__init__.py
+++ b/cds_migrator_kit/extract/__init__.py
@@ -2,7 +2,7 @@
#
# Copyright (C) 2025 CERN.
#
-# CDS-Videos is free software; you can redistribute it and/or modify it under
+# cds-migrator-kit is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.
"""Extract module."""
diff --git a/cds_migrator_kit/import_utils.py b/cds_migrator_kit/import_utils.py
new file mode 100644
index 00000000..1ec5e870
--- /dev/null
+++ b/cds_migrator_kit/import_utils.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+"""Utility function for dynamically checking module availability."""
+
+import importlib
+
+
+def import_module(module_name):
+ """Import ``module_name``; return True on success, False if it raises ImportError."""
+ try:
+ importlib.import_module(module_name)
+ return True
+ except ImportError:
+ return False
\ No newline at end of file
diff --git a/cds_migrator_kit/rdm/cli.py b/cds_migrator_kit/rdm/cli.py
index e7548e2c..b66f0416 100644
--- a/cds_migrator_kit/rdm/cli.py
+++ b/cds_migrator_kit/rdm/cli.py
@@ -19,7 +19,6 @@
from cds_migrator_kit.rdm.records.streams import ( # UserStreamDefinition,
RecordStreamDefinition,
)
-from cds_migrator_kit.rdm.runner import Runner
from cds_migrator_kit.rdm.stats.runner import RecordStatsRunner
from cds_migrator_kit.rdm.stats.streams import RecordStatsStreamDefinition
from cds_migrator_kit.rdm.users.runner import PeopleAuthorityRunner, SubmitterRunner
@@ -30,6 +29,7 @@
from cds_migrator_kit.rdm.users.transform.xml_processing.models.people import (
PeopleAuthority,
)
+from cds_migrator_kit.runner.runner import Runner
cli_logger = logging.getLogger("migrator")
diff --git a/cds_migrator_kit/rdm/migration_config.py b/cds_migrator_kit/rdm/migration_config.py
index 240d7bbb..e896235a 100644
--- a/cds_migrator_kit/rdm/migration_config.py
+++ b/cds_migrator_kit/rdm/migration_config.py
@@ -368,9 +368,6 @@ def _(x): # needed to avoid start time failure with lazy strings
CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir
CDS_MIGRATOR_KIT_STREAM_CONFIG = "cds_migrator_kit/rdm/streams.yaml"
-CDS_MIGRATOR_KIT_VIDEOS_STREAM_CONFIG = (
- "cds_migrator_kit/videos/weblecture_migration/streams.yaml"
-)
RDM_RECORDS_IDENTIFIERS_SCHEMES = {
**RDM_RECORDS_IDENTIFIERS_SCHEMES,
diff --git a/cds_migrator_kit/runner/__init__.py b/cds_migrator_kit/runner/__init__.py
new file mode 100644
index 00000000..e22f5897
--- /dev/null
+++ b/cds_migrator_kit/runner/__init__.py
@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+
+"""Runner module."""
diff --git a/cds_migrator_kit/rdm/runner.py b/cds_migrator_kit/runner/runner.py
similarity index 97%
rename from cds_migrator_kit/rdm/runner.py
rename to cds_migrator_kit/runner/runner.py
index 73b0d5b2..c63798ae 100644
--- a/cds_migrator_kit/rdm/runner.py
+++ b/cds_migrator_kit/runner/runner.py
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2022-2025 CERN.
#
-# Invenio-RDM-Migrator is free software; you can redistribute it and/or modify
+# cds-migrator-kit is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""InvenioRDM migration streams runner."""
diff --git a/cds_migrator_kit/videos/__init__.py b/cds_migrator_kit/videos/__init__.py
index d425caae..a9f8adc2 100644
--- a/cds_migrator_kit/videos/__init__.py
+++ b/cds_migrator_kit/videos/__init__.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
-# Copyright (C) 2024 CERN.
+# Copyright (C) 2025 CERN.
#
# cds-migrator-kit is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/migration_config.py b/cds_migrator_kit/videos/migration_config.py
new file mode 100644
index 00000000..b4e09447
--- /dev/null
+++ b/cds_migrator_kit/videos/migration_config.py
@@ -0,0 +1,49 @@
+"""CDS-Videos migration settings for cds-migrator-kit."""
+
+import json
+import os
+from datetime import datetime, timedelta
+
+
+def _(x): # needed to avoid start time failure with lazy strings
+ return x
+
+
+# Since HAProxy and Nginx route all requests no matter the host header
+# provided, the allowed hosts variable is set to localhost. In production it
+# should be set to the correct host and it is strongly recommended to only
+# route correct hosts to the application.
+APP_ALLOWED_HOSTS = ["0.0.0.0", "localhost", "127.0.0.1", "localhost.cern.ch"]
+
+SQLALCHEMY_DATABASE_URI = (
+ "postgresql+psycopg2://cds-videos:cds-videos@localhost/cds-videos"
+)
+
+# SECURITY WARNING: keep the secret key used in production secret!
+# Do not commit it to a source code repository.
+# TODO: Set a real secret key before deploying.
+SECRET_KEY = "CHANGE_ME"
+
+# TODO: Set with your own hostname when deploying to production
+SITE_UI_URL = "https://127.0.0.1"
+
+SITE_API_URL = "https://127.0.0.1/api"
+
+
+DATACITE_ENABLED = True
+DATACITE_USERNAME = ""
+DATACITE_PASSWORD = ""
+DATACITE_PREFIX = "10.17181"
+DATACITE_TEST_MODE = True
+DATACITE_DATACENTER_SYMBOL = ""
+
+import cds_migrator_kit
+
+base_path = os.path.dirname(os.path.realpath(cds_migrator_kit.__file__))
+logs_dir = os.path.join(base_path, "tmp/logs/")
+CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir
+CDS_MIGRATOR_KIT_VIDEOS_STREAM_CONFIG = (
+ "cds_migrator_kit/videos/weblecture_migration/streams.yaml"
+)
+
+### CDS MIGRATOR #################################
diff --git a/cds_migrator_kit/videos/weblecture_migration/__init__.py b/cds_migrator_kit/videos/weblecture_migration/__init__.py
index 61138b57..31cfe2d5 100644
--- a/cds_migrator_kit/videos/weblecture_migration/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2024 CERN.
+# Copyright (C) 2025 CERN.
#
# CDS-Videos is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/cli.py b/cds_migrator_kit/videos/weblecture_migration/cli.py
index 13d6e954..981f8fcb 100644
--- a/cds_migrator_kit/videos/weblecture_migration/cli.py
+++ b/cds_migrator_kit/videos/weblecture_migration/cli.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
#
# CDS-Videos is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.
@@ -13,7 +13,7 @@
from flask import current_app
from flask.cli import with_appcontext
-from cds_migrator_kit.rdm.runner import Runner
+from cds_migrator_kit.runner.runner import Runner
from cds_migrator_kit.videos.weblecture_migration.streams import RecordStreamDefinition
cli_logger = logging.getLogger("migrator")
@@ -44,5 +44,6 @@ def run(dry_run=False):
stream_definitions=[RecordStreamDefinition],
config_filepath=Path(stream_config).absolute(),
dry_run=dry_run,
+ collection="weblectures",
)
runner.run()
diff --git a/cds_migrator_kit/videos/weblecture_migration/load/__init__.py b/cds_migrator_kit/videos/weblecture_migration/load/__init__.py
index cad2a5c0..7030a9d9 100644
--- a/cds_migrator_kit/videos/weblecture_migration/load/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/load/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2024 CERN.
+# Copyright (C) 2025 CERN.
#
# CDS-Videos is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/load/load.py b/cds_migrator_kit/videos/weblecture_migration/load/load.py
index 3ad5d23a..13d6db5d 100644
--- a/cds_migrator_kit/videos/weblecture_migration/load/load.py
+++ b/cds_migrator_kit/videos/weblecture_migration/load/load.py
@@ -17,17 +17,10 @@ def __init__(
db_uri,
data_dir,
tmp_dir,
- existing_data=False,
entries=None,
dry_run=False,
):
"""Constructor."""
- self.db_uri = db_uri
-
- self.data_dir = data_dir
- self.tmp_dir = tmp_dir
- self.existing_data = existing_data
- self.entries = entries
self.dry_run = dry_run
def _prepare(self, entry):
diff --git a/cds_migrator_kit/videos/weblecture_migration/streams.py b/cds_migrator_kit/videos/weblecture_migration/streams.py
index 0fb53431..94e11af6 100644
--- a/cds_migrator_kit/videos/weblecture_migration/streams.py
+++ b/cds_migrator_kit/videos/weblecture_migration/streams.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
#
# CDS-Videos is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/streams.yaml b/cds_migrator_kit/videos/weblecture_migration/streams.yaml
index 055c93da..5d3eb7c1 100644
--- a/cds_migrator_kit/videos/weblecture_migration/streams.yaml
+++ b/cds_migrator_kit/videos/weblecture_migration/streams.yaml
@@ -1,12 +1,10 @@
-data_dir: cds_migrator_kit/videos/weblecture_migration/data/
-tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp
-state_dir: cds_migrator_kit/videos/weblecture_migration/cache
-log_dir: cds_migrator_kit/videos/weblecture_migration/log
-db_uri: postgresql://cds-rdm:cds-rdm@localhost:5432/cds-rdm # TODO CHANGE
-old_secret_key: CHANGE_ME # TODO CHANGE
-new_secret_key: CHANGE_ME # TODO CHANGE
+db_uri: postgresql://cds-videos:cds-videos@localhost:5432/cds-videos
records:
- extract:
- dirpath: cds_migrator_kit/videos/weblecture_migration/data/weblectures/dump/
- transform:
- files_dump_dir: cds_migrator_kit/videos/weblecture_migration/data/weblectures/files/
+ weblectures:
+ data_dir: cds_migrator_kit/videos/weblecture_migration/data/
+ tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp
+ log_dir: cds_migrator_kit/videos/weblecture_migration/log
+ extract:
+ dirpath: cds_migrator_kit/videos/weblecture_migration/data/weblectures/dump/
+ transform:
+ files_dump_dir: cds_migrator_kit/videos/weblecture_migration/data/weblectures/files/
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/__init__.py
index a8216c0e..faebe66e 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
#
# CDS-Videos is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/models/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/models/__init__.py
index 6f9f543f..d1a351f4 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/models/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/models/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
#
# CDS-Videos is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/models/base.py b/cds_migrator_kit/videos/weblecture_migration/transform/models/base.py
index 5885950c..1b01058c 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/models/base.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/models/base.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
#
# CDS-Videos is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py
index a2d63dbc..09af17b8 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# This file is part of CERN Document Server.
-# Copyright (C) 2024 CERN.
+# Copyright (C) 2025 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
@@ -30,130 +30,177 @@ class VideoLecture(CdsOverdo):
__query__ = '8567_.x:"Absolute master path" 8567_.d:/mnt/master_share* -980__.C:MIGRATED -980__.c:DELETED -5831_.a:digitized'
__ignore_keys__ = {
- "110__a", # corporate author
- "8567_y", # File description
- "111__c", # Video location (indico location)
- "518__r", # Video/meeting location
+ "003",
+ # 340: Drop streaming video, anything else we copy over to curation field.
"340__a", # Physical medium -> curation field
"340__d", # Physical medium/recording technique -> curation field
+ "340__9", # Physical medium/CD-ROM -> curation field
+ "340__k", # Physical medium/ -> curation field
+ "340__j", # Physical medium/ -> curation field
+ "340__8", # Physical medium/id? -> curation field https://cds.cern.ch/record/2234827
+ # check with JY
"961__x", # Creation Date TODO? check with JY
"961__c", # modification Date TODO? check with JY
"961__h", # Hour? TODO? check with JY
"961__l", # Library? TODO? check with JY
+ "961__a", # ? TODO? check with JY
+ "961__b", # ? TODO? check with JY
"964__a", # Item owner TODO? check with JY
"916__d", # Status week? TODO? check with JY
"901__u", # Affiliation at Conversion? TODO? check with JY
"583__a", # Action note / curation TODO? check with JY
"583__c", # Action note / curation TODO? check with JY
"583__z", # Action note / curation TODO? check with JY
+ "916__n", # Status week TODO? check with JY
+ "916__s", # Status week TODO? check with JY
+ "916__w", # Status week TODO? check with JY
+ "916__y", # Status week TODO? check with JY
+ "916__a", # Status week TODO? check with JY
+ "306__a", # ? TODO? check with JY
+ "336__a", # ? TODO? check with JY
+ "981__a", # duplicate record id TODO? check with JY
+ "916__a", # Status week TODO? check with JY
+ "916__d", # Status week TODO? check with JY
+ "916__e", # Status week TODO? check with JY
+ "916__s", # Status week TODO? check with JY
+ "916__w", # Status week TODO? check with JY
+ "916__y", # Status week TODO? check with JY
+ "960__a", # Base?
+ # Category, Collection, Series, Keywords
+ "980__a", # collection tag
+ "980__b", # Secondary collection indicator
"65027a", # TODO Subject category = Event?
+ "490__a", # TODO Series
+ "490__v", # Series: volume
+ "650172", # subject provenance
+ "65017a", # subject value
+ "6531_9", # keyword provenance
+ "6531_a", # keyword
+ "690C_a", # collection name
+ # Conference Information/Indico
"111__a", # Title (indico)
"111__9", # Start date (indico)
"111__g", # Event id (indico)
"111__z", # End date (indico)
+ "111__c", # Video location (indico location)
"084__a", # Indico?
"084__2", # Indico?
- "8567_2", # File system? 'MediaArchive'
- "980__b", # Secondary collection indicator
+ "518__r", # Video/meeting location
+ "518__g", # Lectures: conference identification
+ "970__a", # alternative identifier, indico id?
+ # Copyright/License
"542__d", # Copyright holder
"542__g", # Copyright date
- "490__a", # TODO Series
- "8567_u", # File url
+ "542__3", # Copyright materials
+ "540__a", # License
+ "540__b", # License person/organization
+ "540__u", # License URL
+ "540__3", # License material
+ # Alternative identifiers
"962__n", # `Presented at` note (conference/linked document)
"962__b", # `Presented at` record (conference/linked document)
- "518__g", # Lectures: conference identification
- "490__v", # Series: volume
- "269__b", # Name of publ.
"088__9", # Report number (make it alternative identifier with cds reference?)
"088__z", # Report number (make it alternative identifier with cds reference?)
- # Files
- "8564_q", # File type (digitized) # TODO this record has both lecturemedia and DM https://cds.cern.ch/record/589875
+ "035__9", # Inspire schema (Indico/AgendaMaker)
+ "035__a", # Inspire id value
+ "088__a", # Report Number --> alternative identifier with ds reference
+ # Additional Title, Volume, Note
+ "246__a", # Additional title
+ "246__i", # Additional title/display text
+ "246__b", # Additional title remaining
+ "246__n", # Volume
+ "246__p", # Volume
+ "500__a", # Note (-> internal note)
+ "500__b", # Note (-> internal note)
+ "500__9", # Note/type (-> internal note) https://cds.cern.ch/record/1561636
+ # Restricted
+ "5061_f",
+ "5061_d",
+ "5061_5",
+ "5061_a",
+ "5061_2",
+ # Location (Shelving/Library)
"852__c", # Location (Shelving/Library)
+ "852__b", # Location (Shelving/?)
+ "852__8", # Location (Shelving/id?) https://cds.cern.ch/record/2234827
"852__h", # Location (Shelving) example: https://cds.cern.ch/record/254588/
"852__a", # Location (Shelving) example: https://cds.cern.ch/record/558348
"852__x", # Location (Shelving/ type? DVD) example: https://cds.cern.ch/record/690000/
+ "852__9", # Location (Shelving/ note?) example: https://cds.cern.ch/record/2233722
+ # Date/Extra Redundant
+ "260__c", # Redundant (more detailed value is in 269__c imprint.pub_date)
+ "260__a",
+ "260__b",
+ # Contributor?
+ "110__a", # corporate author
+ "700__m", # author's email
+ "270__p", # document contact --> add as a contributor with a correct role
+ # Internal Note
+ "595__a", # Internal Note --> curation field
+ "595__z", # SOME RECORD HAVE UNCL as value, do we keep it? what does UNCL mean
+ # Collaboration --> add new role to contributors
+ "710__5", # department / organisation author
+ "710__a", # organisation author
+ "710__g", # organisation author
+ # Accelerator/Facility, Experiment, Project, Study
+ "693__a", # accelerator, create a custom field?
+ "693__e", # experiments
+ "693__p", # project
+ "693__s", # study
+ # Submitter
+ "859__f", # creator's email
+ "8560_f", # submitter email
+ # OAI
+ "0248_a", # oai identifier
+ "0248_p", # oai identifier
+ "0248_q",
# IGNORE
"518__h", # Lectures: Starting time
"300__2", # Imprint
"300__b", # Imprint
+ "300__8", # Imprint
"300__a", # Number of pages / duration
"250__a", # Edition
"700__0", # Author id (eg: AUTHOR|(CDS)2067852)
"518__l", # Lectures: length of speech
- # TODO copied from ssn
- "0248_a", # oai identifier, not needed to migrate, TBD
- "0248_p", # oai identifier, not needed to migrate, TBD
- "0248_q", # full text tag 2778897
- "100__m", # author's email <-- decided not to keep in RDM,
- "260__c", # Redundant (more detailed value is in 269__c imprint.pub_date)
- "269__a", # imprint place
- "270__m", # document contact email
- "595__a", # always value CERN EDS, not displayed, TODO: do we keep?
- "595__z", # SOME RECORD HAVE UNCL as value, do we keep it? what does UNCL mean
- "700__m", # author's email <-- decided not to keep in RDM,
- "710__5", # department / organisation author
- "710__a", # organisation author
- "8564_8", # Files system field
- "8564_s", # Files system field
- "8564_u", # Files
- "8564_x", # Files system field
- "8564_y", # Files
- "937__c", # modification date
- "937__s", # modification person
- "960__a", # collection id? usually value 12, to confirm if we ignore
- "980__a", # collection tag
- "981__a", # duplicate record id
- "003",
- "035__9", # Inspire schema
- "035__a", # Inspire id value
- "037__a", # (Report number) alternative identifiers -> scheme "CDS REFERENCE"
- "088__a", # RN (manual introduced?) second report number (so the identifiers schemas are not unique!)
- "246__a",
- "246__i", # abbreviation
- "246__i", # abbreviation tag, applies to value of 246__A
- "270__p", # document contact person name
- "500__a", # Note (-> description.type = other)
- "562__c", # note
- "650172", # subject provenance
- "65017a", # subject value
- "6531_9", # keyword provenance
- "6531_a", # keyword
- "690C_a", # collection name, not needed values(to drop: INTNOTE, CERN, otherwise parse PUBL to retrieve the department, if department not present in any other field)
- "6931_9", # keyword
- "6931_a", # keyword
- "693__a", # accelerator, do we create a custom field?
- "693__b", # beams recid: 2640381
- "693__e", # custom_fields.cern:experiments
- "693__f", # facility, do we create a custom field?
- "693__p", # project, do we create a custom field?
- "693__s", # study, do we create a custom field?
- "710__g", # Collaboration, OK to migrate as corporate contributor (not creator)?
- "859__f", # creator's email, to be used to determine the owner
- "916__n",
- "916__s",
- "916__w",
- "963__a",
- "970__a", # alternative identifier, scheme ALEPH
+ "100__0", # Author id (eg: AUTHOR|(CDS)2067852)
+ "240__a", # Decided to drop, (Streaming Video)
+ "337__a", # Decided to drop, (Video)
+ "963__a", # values: PUBLIC/RESTRICTED
+ "8564_8", # File: bibdoc id
+ "8564_s", # File: file size
# IMPLEMENTED
# "520__a", # Note (-> description.type = abstract
# "001",
# "041__a", # languages
- # "906__p", # names, is it supervisor?
+ # "906__p", # event speakers
# "100__9", # #BEARD# tag
# "100__a",
# "100__u", # Author affiliation
- # "700__e", # Contributor/Speaker role
- # "700__0", # Contributors (cds author id) - TBD if we keep, same with INSPIRE ID
+ # "700__e", # Contributor/Speaker role
+ # "700__0", # Contributors (cds author id)
# "700__9", # #BEARD# tag
# "700__a", # Contributors (full name)
# "700__u", # Contributors (affiliation)
- # "518__d", # Full date/time
- # "269__c", # Date (full date/year)
- # "518__a", # date?
+ # "518__d", # Full date/time
+ # "269__c", # Date (full date/year)
+ # "269__b", # CERN (checked for other values)
+ # "269__a", # Geneva (checked for other values)
+ # "518__a", # Date
+ # "906__u", # Contributor Affiliation
+ # "511__u", # Contributor Affiliation
+ # "8567_u", # File url
+ # "8567_y", # File description
+ # "8567_2", # File system? 'MediaArchive'
+ # "8564_q", # File type (digitized)
+ # "8564_8", # Files system field
+ # "8564_s", # Files system field
+ # "8564_u", # Files
+ # "8564_x", # Files system field
+ # "8564_y", # Files
}
model = VideoLecture(
- bases=(base_model,),
- entry_point_group="cds_migrator_kit.videos.rules.video_lecture",
+ bases=(base_model,), entry_point_group="cds_migrator_kit.videos.rules.video_lecture"
)
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py
index f17c2890..947aaae5 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py
@@ -96,18 +96,65 @@ def _media_files(self, entry):
def _metadata(self, entry):
"""Transform the metadata of a record."""
+ def guess_dates(json_data, key, subkey=None):
+ """Collect fallback `date` values from the list stored under `key`.
+
+ ### Examples:
+ 1. **8564 tag**: may include digitized file information, indico information (link, date) or any url file
+ json_data = {"url_files": [{"indico": {"url": "http://agenda.cern.ch/..", "date": "2002-03-18"}}], ...}
+ Calling the method with `key="url_files", subkey="indico"`
+ Returns every available:
+ json_data["url_files"][i]["indico"]["date"]
+
+ 2. **500__ tag**: internal notes that may contain date information
+ json_data = {"internal_notes": [{"note": "note, 1 Jun 2025", "date": "2025-06-01"}], ...}
+ Calling the method with `key="internal_notes"`
+ Returns every available:
+ json_data["internal_notes"][i]["date"]
+
+ ### Returns:
+ - `set[str]`: A set of date strings (empty when no item carries a date).
+ """
+ items = json_data.get(key, [])
+ if subkey:
+ return {
+ item[subkey]["date"]
+ for item in items
+ if subkey in item
+ and "date" in item[subkey]
+ }
+
+ return {
+ item["date"]
+ for item in items
+ if "date" in item
+ }
+
def reformat_date(json_data):
"""Reformat the date for the cds-videos data model."""
- dates = json_data.get("date", [])
- dates_set = {date for date in dates if date is not None}
+ # Primary source: the record's `date` field (falsy values are dropped)
+ dates_set = {date for date in json_data.get("date", []) if date}
- if len(dates_set) == 1: # Should be only one value
- return next(iter(dates_set)) # Get the single date from the set
- if len(dates_set) > 1:
- return next(iter(dates_set)) # return the first
+ # If no date found, fall back to the indico dates in `url_files` and to `internal_notes`
+ if not dates_set:
+ indico_dates = guess_dates(json_data, "url_files", subkey="indico")
+ note_dates = guess_dates(json_data, "internal_notes")
+ dates_set.update(indico_dates, note_dates)
+
+ # Return the valid date if only one is found
+ if len(dates_set) == 1:
+ return next(iter(dates_set))
- raise UnexpectedValue(
- "No valid date found in record: {}.".format(json_data.get("recid"))
+ # Several distinct dates are ambiguous (different indico event videos?), so fail
+ if len(dates_set) > 1:
+ raise UnexpectedValue(
+ f"More than one date found in record: {json_data.get('recid')} dates: {dates_set}.",
+ stage="transform",
+ )
+
+ raise MissingRequiredField(
+ f"No valid date found in record: {json_data.get('recid')}.",
+ stage="transform",
)
def description(json_data):
@@ -116,11 +163,41 @@ def description(json_data):
return json_data.get("title").get("title")
return json_data.get("description")
+ def format_contributors(json_data):
+ """Return the record's contributors without duplicates, keeping first-seen order.
+
+ The same contributor can appear in both tag 700 and tag 906; entries are duplicates when name, role and affiliations all match.
+ TODO: Should we keep them both? https://cds.cern.ch/record/2233152/export/xm?ln=en
+ Raises MissingRequiredField when the record has no contributors.
+ """
+ contributors = json_data.get("contributors")
+ if not contributors:
+ raise MissingRequiredField(
+ f"No valid contributor found in record: {json_data.get('recid')}.",
+ stage="transform",
+ )
+
+ unique_contributors = []
+ seen = set()
+
+ for contributor in contributors:
+ # Hashable identity key: contributors equal on all three fields are duplicates
+ identifier = (
+ contributor["name"],
+ contributor.get("role"),
+ tuple(contributor.get("affiliations", [])),
+ )
+ if identifier not in seen:
+ seen.add(identifier)
+ unique_contributors.append(contributor)
+
+ return unique_contributors
+
metadata = {
"title": entry["title"],
"description": description(entry),
- "contributors": entry.get("contributors"),
- "languages": entry.get("language"),
+ "contributors": format_contributors(entry),
+ "language": entry.get("language"),
"date": reformat_date(entry),
}
# filter empty keys
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/__init__.py
index ce5e1662..29b34a6c 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
#
# CDS-Videos is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/__init__.py
index d6e355a1..bd60b721 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
#
# CDS-Videos is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py
index 9e5a6577..cfcf7156 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
#
# CDS-Videos is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.
@@ -38,4 +38,23 @@ def get_contributor_role(subfield, role, raise_unexpected=False):
return translations[clean_role]
-# TODO contributor affiliation will be implemented
+def get_contributor(key, value, contributor_role="", name=""):
+ """Create the contributor json (name, affiliations, role) for contributor tags such as 700, 511 and 906."""
+ beard = value.get("9")
+ if beard is not None and beard != "#BEARD#":
+ # subfield 9 historically holds the "#BEARD#" marker left by some kind of
+ # automatic script tagging; that marker is ignored, but any other value
+ # stored in this subfield is unexpected and must be reported
+ raise UnexpectedValue(field=key, subfield="9", value=beard)
+ if not name:
+ name = value.get("a").strip()
+ affiliation = value.get("u", "")
+ contributor = {"name": name}
+ if affiliation:
+ contributor.update({"affiliations": [affiliation]})
+ if contributor_role:
+ contributor.update({"role": contributor_role})
+ elif value.get("e", ""):
+ role = get_contributor_role("e", value.get("e", ""))
+ contributor.update({"role": role})
+ return contributor
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/__init__.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/__init__.py
index aa43604f..72582f04 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/__init__.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/__init__.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
-# Copyright (C) 2024 CERN.
+# Copyright (C) 2025 CERN.
#
# cds-migrator-kit is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py
index fd751c6f..fc10c171 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
-# Copyright (C) 2022 CERN.
+# Copyright (C) 2025 CERN.
#
# CDS-Videos is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.
@@ -21,7 +21,7 @@
)
from ...models.base import model
-from ..quality.contributors import get_contributor_role
+from ..quality.contributors import get_contributor
@model.over("legacy_recid", "^001")
@@ -68,26 +68,12 @@ def language(self, key, value):
@require(["a"])
def creators(self, key, value):
"""Translates the creators field."""
- role = value.get("e")
- if role:
- role = get_contributor_role("e", role)
- beard = value.get("9")
- if beard is not None and beard != "#BEARD#":
- # checking if anything else stored in this field
- # historically it was some kind of automatic script tagging
- # and it should be ignored if value == #BEARD#
- raise UnexpectedValue(field=key, subfield="9", value=beard)
- name = value.get("a").strip()
- contributor = {"name": name}
- if role:
- contributor.update({"role": role})
- # TODO contributor affiliation will be implemented
-
- return contributor
+ return get_contributor(key, value)
@model.over("contributors", "^700__")
+@for_each_value
@require(["a"])
def contributors(self, key, value):
"""Translates contributors."""
- return creators(self, key, value)
+ return get_contributor(key, value)
diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py
index 205ac8aa..febf9dc8 100644
--- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py
+++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# This file is part of CERN Document Server.
-# Copyright (C) 2024 CERN.
+# Copyright (C) 2025 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
@@ -17,62 +17,90 @@
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Common Videos fields."""
+import re
+
from dateutil.parser import ParserError, parse
+from cds_migrator_kit.errors import UnexpectedValue
from cds_migrator_kit.transform.xml_processing.quality.decorators import (
for_each_value,
require,
)
+from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.contributors import (
+ get_contributor,
+)
# ATTENTION when COPYING! important which model you use as decorator
from ...models.video_lecture import model
+def parse_date(date_str):
+    """Parse a date string into 'YYYY-MM-DD' format.
+
+    Returns None if the string is missing, shorter than 10 or longer than
+    13 characters, or if dateutil cannot turn it into a valid date.
+
+    Examples of rejected values:
+        - Some values only contain the year (e.g., "1998")
+        - Some values have a date range (e.g., "23 - 27 Nov 1998")
+    """
+    if not date_str:
+        return None
+    if len(date_str) < 10 or len(date_str) > 13:
+        # Too short/long to have the full date info
+        return None
+    try:
+        parsed_date = parse(date_str)
+        return parsed_date.strftime("%Y-%m-%d")
+    except (ParserError, OverflowError):  # OverflowError: number too large for a date
+        return None
+
+
@model.over("date", "^518__")
@for_each_value
def date(self, key, value):
"""Translates date from tag 518."""
-
- def parse_date(date_str):
- """Parses a date string into 'YYYY-MM-DD' format."""
- try:
- if len(date_str) < 10: # Too short to have the full date info
- return None
- parsed_date = parse(date_str)
- return parsed_date.strftime("%Y-%m-%d")
- except ParserError:
- return
-
- # List of possible subfields containing dates
- possible_date_fields = [
- value.get("d"), # 518 'd' subfield (e.g., '2024-11-19T14:00:00')
- value.get("c"), # 269 'c' subfield (e.g., '1993-08-09')
- value.get("a"), # 518 'a' subfield (e.g., 'CERN, Geneva, 23 - 27 Nov 1998')
- ]
-
- for date_field in possible_date_fields:
- if date_field:
- parsed_date = parse_date(date_field)
- if parsed_date: # If parsing succeeds, return the formatted date
- return parsed_date
+ # 518 'd' subfield: keep only the first 10 characters (YYYY-MM-DD), the value may also carry a time part (e.g., 2008-03-11T14:00:00Z)
+ parsed_date = parse_date((value.get("d") or "")[:10])
+ if parsed_date:
+ return parsed_date
+ # 518 'a' subfield: only the text after the last comma is parsed (e.g., 'CERN, Geneva, 23 - 27 Nov 1998')
+ parsed_date = parse_date(value.get("a", "").split(",")[-1])
+ if parsed_date:
+ return parsed_date
@model.over("date", "^269__")
+@for_each_value
def imprint(self, key, value):
"""Translates date from tag 269."""
- return date(self, key, value)
+ name = value.get("b")
+ if name and name.strip().upper() != "CERN":
+ # subfield 'b' may only be empty or "CERN" (case-insensitive), which is ignored;
+ # any other value stored in this subfield is unexpected and is reported
+ raise UnexpectedValue(field=key, subfield="b", value=name)
+ place = value.get("a")
+ if place and place.strip().upper() != "GENEVA":
+ # subfield 'a' may only be empty or "Geneva" (case-insensitive), which is ignored;
+ # any other value stored in this subfield is unexpected and is reported
+ raise UnexpectedValue(field=key, subfield="a", value=place)
+
+ date_field = value.get("c") # 269 'c' subfield (e.g., '1993-08-09')
+ parsed_date = parse_date(date_field)
+ if parsed_date: # If parsing succeeds, return the formatted date
+ return parsed_date
@model.over("contributors", "^511__")
@for_each_value
@require(["a"])
def performer(self, key, value):
- """Translates performer."""
- name = value.get("a").strip()
+ """Translates performer/Participant."""
role = value.get("e")
- contributor = {"name": name, "role": "Performer"} # TODO or "Participant"
- # TODO contributor affiliation will be implemented
- return contributor
+ if role and role.strip().lower() != "speaker":
+ # checking if anything else stored in this field
+ raise UnexpectedValue("Different role found", field=key, subfield="e", value=role)
+ return get_contributor(key, value, contributor_role="Performer")
@model.over("contributors", "^906__")
@@ -80,7 +108,121 @@ def performer(self, key, value):
@require(["p"])
def event_speakers(self, key, value):
"""Translates event_speakers."""
- name = value.get("p").strip()
- contributor = {"name": name, "role": "Speaker"}
- # TODO contributor affiliation will be implemented
- return contributor
+ return get_contributor(key, value, contributor_role="Speaker", name=value.get("p").strip())
+
+
+@model.over("url_files", "^8564_")
+@for_each_value
+@require(["u"])
+def url_files(self, key, value):
+ """Classifies an 8564_ URL as a "digitized", "indico" or generic "url_file" entry."""
+ url = value.get("u")
+ if "digital-memory" in url:
+ return {
+ "digitized": {
+ "url": url,
+ "format": value.get("q"),
+ "link_text": value.get("y"),
+ "public_note": value.get("z"),
+ "nonpublic_note": value.get("x"),
+ "md5_checksum": value.get("w"),
+ "source": value.get("2"),
+ }
+ }
+ elif "indico" in url or "agenda" in url:
+ indico_link = {"url": url}
+
+ # Try to get the event id from the URL (the part after "ida=", "confId=" or "event/")
+ match_id = re.search(r"(?:ida=|confId=|event/)([\w\d]+)", url)
+ if match_id:
+ event_id = match_id.group(1)
+ if event_id:
+ indico_link["event_id"] = event_id
+
+ # Try to get a "D Mon YYYY" date (optionally preceded by "Talk") from the link text
+ text = value.get("y")
+ if text:
+ indico_link["text"] = text
+ match_date = re.search(r"(?:Talk\s*)?(\d{1,2}\s\w{3}\s\d{4})", text)
+ if match_date:
+ parsed_date = parse_date(match_date.group(1))
+ if parsed_date:
+ indico_link["date"] = parsed_date
+
+ return {"indico": indico_link}
+
+ url_file = {"url_file": {"url": url}}
+ text = value.get("y")
+ if text:
+ url_file["url_file"]["text"] = text
+
+ nonpublic_note = value.get("x")
+ if nonpublic_note:
+ url_file["url_file"]["nonpublic_note"] = nonpublic_note
+ return url_file
+
+
+@model.over("internal_notes", "^500__")
+@for_each_value
+@require(["a"])
+def internal_notes(self, key, value):
+ """Translates a 500__ internal note; a valid date after the note's last comma is stored as "date"."""
+ note = value.get("a").strip()
+ if value.get("9"):
+ note = value.get("9").strip() + " : " + value.get("a").strip()
+ internal_note = {"note": note}
+
+ parts = note.split(",")
+ match_date = parts[-1].strip() if len(parts) > 1 else ""
+ if match_date:
+ parsed_date = parse_date(match_date)
+ if parsed_date:
+ internal_note.update({"date": parsed_date})
+
+ return internal_note
+
+
+@model.over("files", "^8567_")
+@for_each_value
+def files(self, key, value):
+ """Translates an 8567_ file entry into master_path / path / url / type / description keys."""
+ source = value.get("2")
+ if source and source.strip() != "MediaArchive":
+ # "MediaArchive" is the only expected source; report anything else
+ raise UnexpectedValue(field=key, subfield="2", value=source)
+
+ file = {}
+
+ # Master path: only locations starting with /mnt/master_share are accepted
+ master_path = value.get("d", "").strip()
+ if master_path:
+ if master_path.startswith("/mnt/master_share"):
+ file["master_path"] = master_path
+ file_type = value.get("x", "").strip()
+ if file_type and file_type != "Absolute master path":
+ # "Absolute master path" is the only type expected next to a master path
+ raise UnexpectedValue(field=key, subfield="x", value=file_type)
+ else:
+ # Any other master location is unexpected
+ raise UnexpectedValue(field=key, subfield="d", value=master_path)
+
+ # File given either as a path or as a lecturemedia URL
+ url = value.get("u", "").strip()
+ if url:
+ if url.startswith("/"):
+ file["path"] = url # Already a host-less path
+ elif url.startswith("https://lecturemedia.cern.ch"):
+ file["url"] = url
+ file["path"] = url.replace("https://lecturemedia.cern.ch", "")
+ else:
+ # Neither a path nor a lecturemedia URL: report it
+ raise UnexpectedValue(field=key, subfield="u", value=url)
+ file_type = value.get("x")
+ if file_type:
+ file["type"] = file_type.strip()
+
+ description = value.get("y")
+ if description:
+ file["description"] = description.strip()
+
+ return file
diff --git a/run-tests.sh b/run-tests.sh
index 02b13eb7..fa5ae42f 100755
--- a/run-tests.sh
+++ b/run-tests.sh
@@ -39,6 +39,12 @@ for arg in $@; do
-K|--keep-services)
keep_services=1
;;
+ rdm)
+ pytest_args+=( "tests/cds-rdm" "tests/test_cds_migrator_kit.py" )
+ ;;
+ videos)
+ pytest_args+=( "tests/cds-videos" "tests/test_cds_migrator_kit.py" )
+ ;;
*)
pytest_args+=( ${arg} )
;;
diff --git a/setup.cfg b/setup.cfg
index dcc1562d..046a50c7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -15,9 +15,7 @@ packages = find:
python_requires = >=3.9
zip_safe = False
install_requires =
- invenio-app-rdm[opensearch2]>=13.0.0b1.dev4
sentry-sdk>=1.45,<2.0.0
- cds-rdm @ git+https://github.com/CERNDocumentServer/cds-rdm#egg=cds-rdm&subdirectory=site
cds-dojson>=0.12.0
invenio-rdm-migrator>=5.0.0
lxml>=4.6.5
@@ -28,14 +26,20 @@ install_requires =
flask-mail>=0.9.0,<0.10.0
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.25.1
- invenio-preservation-sync==0.1.0
- invenio-cern-sync @ git+https://github.com/cerndocumentserver/invenio-cern-sync@v0.1.2#egg=invenio-cern-sync
-
# needed to run the server
gunicorn
[options.extras_require]
+rdm =
+ invenio-app-rdm[opensearch2]>=13.0.0b1.dev4
+ cds-rdm @ git+https://github.com/CERNDocumentServer/cds-rdm#egg=cds-rdm&subdirectory=site
+ invenio-preservation-sync==0.1.0
+ invenio-cern-sync @ git+https://github.com/cerndocumentserver/invenio-cern-sync@v0.1.2#egg=invenio-cern-sync
+
+videos =
+ cds @ git+https://github.com/CERNDocumentServer/cds-videos#egg=cds
+
tests =
pytest-black>=0.3.0
pytest-invenio>=2.1.0,<3.0.0
@@ -46,12 +50,11 @@ tests =
console_scripts =
migrator = invenio_app.cli:cli
flask.commands =
- migration = cds_migrator_kit.rdm.cli:migration
- videos = cds_migrator_kit.videos.weblecture_migration.cli:videos
+ migration = cds_migrator_kit.cli:cli
invenio_base.apps =
cds_migrator_kit = cds_migrator_kit:CdsMigratorKit
invenio_config.module =
- invenio_app_rdm = cds_migrator_kit.rdm.migration_config
+ invenio_app_rdm = cds_migrator_kit.base_config
invenio_base.blueprints =
cds_migrator_kit_views = cds_migrator_kit.reports.views:blueprint
cds_migrator_kit.migrator.affiliations.model =
@@ -81,7 +84,7 @@ cds_migrator_kit.migrator.rules.thesis =
cds_migrator_kit.migrator.rules.people =
people = cds_migrator_kit.rdm.users.transform.xml_processing.rules.people
invenio_pidstore.minters =
- legacy = cds_rdm.minters:legacy_recid_minter
+ legacy = cds_migrator_kit.base_minter:legacy
# videos migration
cds_migrator_kit.videos.models =
video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.models.video_lecture:model
diff --git a/tests/data/all_fields.json b/tests/cds-rdm/data/all_fields.json
similarity index 100%
rename from tests/data/all_fields.json
rename to tests/cds-rdm/data/all_fields.json
diff --git a/tests/data/summer_note.json b/tests/cds-rdm/data/summer_note.json
similarity index 100%
rename from tests/data/summer_note.json
rename to tests/cds-rdm/data/summer_note.json
diff --git a/tests/cds-rdm/test_full_migration.py b/tests/cds-rdm/test_full_migration.py
index b924e8ed..9468818c 100644
--- a/tests/cds-rdm/test_full_migration.py
+++ b/tests/cds-rdm/test_full_migration.py
@@ -25,9 +25,9 @@
from invenio_vocabularies.contrib.names.models import NamesMetadata
from cds_migrator_kit.rdm.records.streams import RecordStreamDefinition
-from cds_migrator_kit.rdm.runner import Runner
from cds_migrator_kit.rdm.users.runner import SubmitterRunner
from cds_migrator_kit.rdm.users.streams import SubmitterStreamDefinition
+from cds_migrator_kit.runner.runner import Runner
def suite_multi_field(record):
@@ -356,7 +356,7 @@ def test_full_migration_stream(
Name.index.refresh()
mocker.patch(
- "cds_migrator_kit.rdm.runner.Runner._read_config",
+ "cds_migrator_kit.runner.runner.Runner._read_config",
return_value={
"db_uri": "postgresql://cds-rdm-migration:cds-rdm-migration@localhost:5432/cds-rdm-migration",
"records": {
diff --git a/tests/test_json_translation_rules.py b/tests/cds-rdm/test_json_translation_rules.py
similarity index 100%
rename from tests/test_json_translation_rules.py
rename to tests/cds-rdm/test_json_translation_rules.py
diff --git a/tests/cds-videos/__init__.py b/tests/cds-videos/__init__.py
new file mode 100644
index 00000000..c001bb54
--- /dev/null
+++ b/tests/cds-videos/__init__.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of CDS.
+# Copyright (C) 2025 CERN.
+#
+# cds-migrator-kit is free software; you can redistribute it and/or modify it
+# under the terms of the MIT License; see LICENSE file for more details.
+
+"""Migration tool kit from old CDS to new CDS-Videos - test suite."""
diff --git a/tests/conftest.py b/tests/cds-videos/conftest.py
similarity index 100%
rename from tests/conftest.py
rename to tests/cds-videos/conftest.py
diff --git a/tests/cds-videos/data/lecture.json b/tests/cds-videos/data/lecture.json
new file mode 100644
index 00000000..3b0c0e04
--- /dev/null
+++ b/tests/cds-videos/data/lecture.json
@@ -0,0 +1,26 @@
+[
+ {
+ "files": [],
+ "collections": null,
+ "recid": 2233152,
+ "record": [
+ {
+ "marcxml": "
In this presentation in english" + ) + + +def test_transform_required_metadata(datadir, base_app): + """Test migration transform.""" + with base_app.app_context(): + data = load_json(datadir, "lecture.json") + dump = CDSRecordDump(data=data[0], dojson_model=videos_migrator_marc21) + dump.prepare_revisions() + created_date, res = dump.latest_revision + + # Transform record + record_entry = CDSToVideosRecordEntry() + metadata = record_entry._metadata(res) + assert metadata["title"] == { + "title": "Glimos Instructions for CMS Underground Guiding - in english" + } + assert metadata["date"] == "2016-10-24" + # It should be same with the title + assert metadata["description"].startswith( + "
In this presentation in english"
+ )
+ assert metadata["contributors"] == [
+ {
+ "name": "Brodski, Michael",
+ "role": "Speaker",
+ "affiliations": ["Rheinisch-Westfaelische Tech. Hoch. (DE)"],
+ },
+ {"name": "Dupont, Niels", "role": "Speaker", "affiliations": ["CERN"]},
+ {"name": "Esposito, William", "role": "Speaker", "affiliations": ["CERN"]},
+ ]
+ assert metadata["language"] == "en"
+
+
+def test_transform_description(datadir, base_app):
+ """Test that the description field `520` is correctly transformed."""
+ with base_app.app_context():
+ # Load test data
+ data = load_json(datadir, "lecture.json")
+
+ # Remove the 520 tag (description) from MARCXML
+ modified_data = data[0]
+ modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml(
+ modified_data["record"][-1]["marcxml"], "520"
+ )
+
+ dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21)
+ dump.prepare_revisions()
+ _, res = dump.latest_revision
+
+ # Ensure the converted JSON record doesn't have the description
+ assert "description" not in res
+
+ # Transform record
+ record_entry = CDSToVideosRecordEntry()
+ metadata = record_entry._metadata(res)
+
+ # Ensure description exists and matches the title
+ assert metadata["description"] == metadata["title"]["title"]
+
+
+def test_transform_date(datadir, base_app):
+ """Test that the date field is correctly transformed."""
+ with base_app.app_context():
+ # Load test data
+ data = load_json(datadir, "lecture.json")
+
+ # Test case: Fail due to multiple dates
+ modified_data = data[0]
+ modified_data["record"][-1]["marcxml"] = add_tag_to_marcxml(
+ modified_data["record"][-1]["marcxml"], "518", {"d": "2025-02-06"}
+ )
+ dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21)
+ dump.prepare_revisions()
+ _, res = dump.latest_revision
+
+ # Transform record
+ record_entry = CDSToVideosRecordEntry()
+ with pytest.raises(UnexpectedValue):
+ record_entry._metadata(res)
+
+ # Test case: Fail due to missing dates
+ modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml(
+ modified_data["record"][-1]["marcxml"], "518"
+ )
+ modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml(
+ modified_data["record"][-1]["marcxml"], "269"
+ )
+
+ dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21)
+ dump.prepare_revisions()
+ _, res = dump.latest_revision
+
+ # Transform record
+ with pytest.raises(MissingRequiredField):
+ record_entry._metadata(res)
+
+
+def test_transform_contributor(datadir, base_app):
+ """Test that a record without contributors fails the transform."""
+ with base_app.app_context():
+ # Load test data
+ data = load_json(datadir, "lecture.json")
+
+ # Test case: Fail due to missing contributor
+ modified_data = data[0]
+ modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml(
+ modified_data["record"][-1]["marcxml"], "700"
+ )
+ modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml(
+ modified_data["record"][-1]["marcxml"], "906"
+ )
+
+ dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21)
+ dump.prepare_revisions()
+ _, res = dump.latest_revision
+
+ # Transform record it should fail (no contributor)
+ record_entry = CDSToVideosRecordEntry()
+ with pytest.raises(MissingRequiredField):
+ record_entry._metadata(res)
+
+
+def test_transform_digitized(datadir, base_app):
+ """Test digitized field is correctly transformed."""
+ with base_app.app_context():
+ # Load test data
+ data = load_json(datadir, "lecture.json")
+
+ # Get digitized record and apply rules
+ entry_data = data[1]
+ dump = CDSRecordDump(data=entry_data, dojson_model=videos_migrator_marc21)
+ dump.prepare_revisions()
+ _, res = dump.latest_revision
+
+ digitized = [
+ item["digitized"] for item in res["url_files"] if "digitized" in item
+ ]
+
+ # Check length
+ assert len(digitized) == 3, f"Expected 3 digitized items, got {len(digitized)}"
+
+ # Check all URLs contain "digital-memory"
+ for item in digitized:
+ assert (
+ "digital-memory" in item["url"]
+ ), f"URL {item['url']} does not contain 'digital-memory'"
+
+ # Transform record it should fail (no valid date, it has date range)
+ record_entry = CDSToVideosRecordEntry()
+ with pytest.raises(MissingRequiredField):
+ record_entry._metadata(res)
+
+
+def test_transform_files(datadir, base_app):
+ """Test files field is correctly transformed."""
+ with base_app.app_context():
+ # Load test data
+ data = load_json(datadir, "lecture.json")
+
+ # Get record and apply rules
+ entry_data = data[1]
+ dump = CDSRecordDump(data=entry_data, dojson_model=videos_migrator_marc21)
+ dump.prepare_revisions()
+ _, res = dump.latest_revision
+
+ # Test master paths
+ master_paths = [
+ item["master_path"] for item in res["files"] if "master_path" in item
+ ]
+ assert (
+ len(master_paths) == 3
+ ), f"Expected 3 master_path items, got {len(master_paths)}"
+ for path in master_paths:
+ assert (
+ "/mnt/master_share" in path
+ ), f"Path {path} does not contain '/mnt/master_share'"
+
+ # Test file paths (excluding URLs)
+ file_paths = [
+ item["path"]
+ for item in res["files"]
+ if "path" in item and "url" not in item
+ ]
+ assert (
+ len(file_paths) == 6
+ ), f"Expected 6 only path items, got {len(file_paths)}"
+ for path in file_paths:
+ assert path.startswith("/"), f"Path {path} does not start with '/'"
+
+ # Test URL files
+ url_files = [item for item in res["files"] if "url" in item]
+ assert len(url_files) == 6, f"Expected 6 URL file items, got {len(url_files)}"
+ for url_file in url_files:
+ assert "url" in url_file, f"Missing 'url' key in item: {url_file}"
+ assert "path" in url_file, f"Missing 'path' key in item: {url_file}"
+ assert (
+ "lecturemedia" in url_file["url"]
+ ), f"URL {url_file['url']} does not contain 'lecturemedia'"
+
+
+def test_transform_internal_note(datadir, base_app):
+ """Test internal notes field is correctly transformed."""
+ with base_app.app_context():
+ # Load test data
+ data = load_json(datadir, "lecture.json")
+
+ # Get record and apply rules
+ entry_data = data[1]
+ dump = CDSRecordDump(data=entry_data, dojson_model=videos_migrator_marc21)
+ dump.prepare_revisions()
+ _, res = dump.latest_revision
+
+ # Record has one internal note
+ assert "internal_notes" in res
+ notes = [item for item in res["internal_notes"]]
+ assert notes
+ assert "date" not in notes[0] # note includes date but it's not valid
+
+ # Transform record it should fail (no valid date, it has date range)
+ record_entry = CDSToVideosRecordEntry()
+ with pytest.raises(MissingRequiredField):
+ record_entry._metadata(res)
+
+ # Test case: Add internal note which has a valid date to record
+ modified_data = data[1]
+ # Remove the current internal note
+ modified_data["record"][-1]["marcxml"] = remove_tag_from_marcxml(
+ modified_data["record"][-1]["marcxml"], "500"
+ )
+ # Add new internal note with a valid date
+ modified_data["record"][-1]["marcxml"] = add_tag_to_marcxml(
+ modified_data["record"][-1]["marcxml"], "500", {"a": "Note, 16 Feb 2001"}
+ )
+ dump = CDSRecordDump(data=modified_data, dojson_model=videos_migrator_marc21)
+ dump.prepare_revisions()
+ _, res = dump.latest_revision
+
+ # Record has one internal note
+ assert "internal_notes" in res
+ notes = [item for item in res["internal_notes"]]
+ assert notes
+ assert "date" in notes[0] # note has a valid date
+
+ # Transform record without failure (it has a valid date)
+ record_entry = CDSToVideosRecordEntry()
+ metadata = record_entry._metadata(res)
+ assert "date" in metadata
+ assert "2001-02-16" == metadata["date"]
diff --git a/tests/helpers.py b/tests/helpers.py
index b24b67aa..f40f45f8 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -9,6 +9,7 @@
"""Helper functions for usage in tests."""
import json
+import xml.etree.ElementTree as ET
from os.path import join
@@ -19,3 +20,45 @@ def load_json(datadir, filename):
with open(filepath, "r") as file_:
data = json.load(file_)
return data
+
+
+def remove_tag_from_marcxml(marcxml, tag):
+ """
+ Removes a specific MARCXML datafield tag to manipulate the record.
+
+ :param marcxml: The MARCXML string.
+ :param tag: The MARC tag (e.g., "520") to remove.
+ :return: Modified MARCXML string with the specified tag removed.
+ """
+ root = ET.fromstring(marcxml)
+
+ # Find and remove all