From 83befebb24e30a7610e63b5b6b755036da104f37 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 13 Nov 2025 15:56:59 +0100 Subject: [PATCH 01/13] Colorize warn and error log levels --- scripts/dataset/logger/index.js | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/scripts/dataset/logger/index.js b/scripts/dataset/logger/index.js index 802eb7cb9..5de6d0b36 100644 --- a/scripts/dataset/logger/index.js +++ b/scripts/dataset/logger/index.js @@ -13,7 +13,19 @@ logger.format = combine( const timestampPrefix = config.get('@opentermsarchive/engine.logger.timestampPrefix') ? `${timestamp} ` : ''; - return `${timestampPrefix}${level.padEnd(15)} ${prefix.padEnd(50)} ${message}`; + const levelStr = level.padEnd(15); + let coloredLevel = levelStr; + let coloredMessage = message; + + if (level.includes('warn')) { + coloredLevel = `\x1b[33m${levelStr}\x1b[0m`; + coloredMessage = `\x1b[33m${message}\x1b[0m`; + } else if (level.includes('error')) { + coloredLevel = `\x1b[31m${levelStr}\x1b[0m`; + coloredMessage = `\x1b[31m${message}\x1b[0m`; + } + + return `${timestampPrefix} ${coloredLevel} ${prefix.padEnd(50)} ${coloredMessage}`; }), ); From 9c2f451c2fa58fa120a65886d33c3a9ac288ee3b Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 13 Nov 2025 15:58:21 +0100 Subject: [PATCH 02/13] Create datagouv publisher --- .env.example | 1 + scripts/dataset/publish/datagouv/dataset.js | 115 ++++++++++++++++++++ scripts/dataset/publish/datagouv/index.js | 47 ++++++++ 3 files changed, 163 insertions(+) create mode 100644 scripts/dataset/publish/datagouv/dataset.js create mode 100644 scripts/dataset/publish/datagouv/index.js diff --git a/.env.example b/.env.example index 6a2bad0ff..928c8bd79 100644 --- a/.env.example +++ b/.env.example @@ -4,6 +4,7 @@ OTA_ENGINE_GITHUB_TOKEN=your_github_token_here OTA_ENGINE_GITLAB_TOKEN=your_gitlab_token_here OTA_ENGINE_GITLAB_RELEASES_TOKEN=your_gitlab_releases_token_here 
+OTA_ENGINE_DATAGOUV_API_KEY=your_datagouv_api_key_here OTA_ENGINE_SENDINBLUE_API_KEY=your_sendinblue_api_key_here OTA_ENGINE_SMTP_PASSWORD=your_smtp_password_here diff --git a/scripts/dataset/publish/datagouv/dataset.js b/scripts/dataset/publish/datagouv/dataset.js new file mode 100644 index 000000000..9c059a29c --- /dev/null +++ b/scripts/dataset/publish/datagouv/dataset.js @@ -0,0 +1,115 @@ +import fsApi from 'fs'; +import path from 'path'; + +import FormData from 'form-data'; +import nodeFetch from 'node-fetch'; + +import * as readme from '../../assets/README.template.js'; +import logger from '../../logger/index.js'; + +const DATASET_LICENSE = 'odc-odbl'; +const DEFAULT_RESOURCE_DESCRIPTION = 'See README.md inside the archive for dataset structure and usage information.'; + +export async function updateDatasetMetadata({ apiBaseUrl, headers, datasetId, releaseDate, stats }) { + const updatePayload = { + title: readme.title({ releaseDate }), + description: readme.body(stats), + license: DATASET_LICENSE, + }; + + if (stats?.firstVersionDate && stats?.lastVersionDate) { + updatePayload.temporal_coverage = { + start: stats.firstVersionDate.toISOString(), + end: stats.lastVersionDate.toISOString(), + }; + } + + const updateResponse = await nodeFetch(`${apiBaseUrl}/datasets/${datasetId}/`, { + method: 'PUT', + headers: { + ...headers, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(updatePayload), + }); + + if (!updateResponse.ok) { + const errorText = await updateResponse.text(); + + throw new Error(`Failed to update dataset metadata: ${updateResponse.status} ${updateResponse.statusText} - ${errorText}`); + } +} + +export async function uploadResource({ apiBaseUrl, headers, datasetId, archivePath }) { + logger.info('Uploading dataset archive…'); + + const formData = new FormData(); + const fileName = path.basename(archivePath); + const fileStats = fsApi.statSync(archivePath); + + formData.append('file', fsApi.createReadStream(archivePath), { + 
filename: fileName, + contentType: 'application/zip', + knownLength: fileStats.size, + }); + + const uploadResponse = await nodeFetch(`${apiBaseUrl}/datasets/${datasetId}/upload/`, { + method: 'POST', + headers: { ...formData.getHeaders(), ...headers }, + body: formData, + }); + + if (!uploadResponse.ok) { + const errorText = await uploadResponse.text(); + + throw new Error(`Failed to upload dataset file: ${uploadResponse.status} ${uploadResponse.statusText} - ${errorText}`); + } + + const uploadResult = await uploadResponse.json(); + + logger.info(`Dataset file uploaded successfully with resource ID: ${uploadResult.id}`); + + return { resourceId: uploadResult.id, fileName }; +} + +export async function updateResourceMetadata({ apiBaseUrl, headers, datasetId, resourceId, fileName }) { + logger.info('Updating resource metadata…'); + + const resourceUpdateResponse = await nodeFetch(`${apiBaseUrl}/datasets/${datasetId}/resources/${resourceId}/`, { + method: 'PUT', + headers: { ...headers, 'Content-Type': 'application/json' }, + body: JSON.stringify({ + title: fileName, + description: DEFAULT_RESOURCE_DESCRIPTION, + filetype: 'file', + format: 'zip', + mime: 'application/zip', + }), + }); + + if (!resourceUpdateResponse.ok) { + const errorText = await resourceUpdateResponse.text(); + + throw new Error(`Failed to update resource metadata: ${resourceUpdateResponse.status} ${resourceUpdateResponse.statusText} - ${errorText}`); + } + + logger.info('Resource metadata updated successfully'); +} + +export async function getDatasetUrl({ apiBaseUrl, headers, datasetId }) { + const datasetResponse = await nodeFetch(`${apiBaseUrl}/datasets/${datasetId}/`, { + method: 'GET', + headers: { ...headers }, + }); + + if (!datasetResponse.ok) { + const errorText = await datasetResponse.text(); + + throw new Error(`Failed to retrieve dataset URL: ${datasetResponse.status} ${datasetResponse.statusText} - ${errorText}`); + } + + const datasetData = await datasetResponse.json(); + const 
datasetUrl = datasetData.page; + + return datasetUrl; +} diff --git a/scripts/dataset/publish/datagouv/index.js b/scripts/dataset/publish/datagouv/index.js new file mode 100644 index 000000000..1868086ee --- /dev/null +++ b/scripts/dataset/publish/datagouv/index.js @@ -0,0 +1,47 @@ +import config from 'config'; + +import logger from '../../logger/index.js'; + +import { updateDatasetMetadata, uploadResource, updateResourceMetadata, getDatasetUrl } from './dataset.js'; + +const PRODUCTION_API_BASE_URL = 'https://www.data.gouv.fr/api/1'; +const DEMO_API_BASE_URL = 'https://demo.data.gouv.fr/api/1'; + +function loadConfiguration() { + const apiKey = process.env.OTA_ENGINE_DATAGOUV_API_KEY; + + if (!apiKey) { + throw new Error('OTA_ENGINE_DATAGOUV_API_KEY environment variable is required for data.gouv.fr publishing'); + } + + const datasetId = config.get('@opentermsarchive/engine.dataset.datagouv.datasetId'); + + if (!datasetId) { + throw new Error('datasetId is required in config at @opentermsarchive/engine.dataset.datagouv.datasetId. Run "node scripts/dataset/publish/datagouv/create-dataset.js" to create a dataset first.'); + } + + const useDemo = config.get('@opentermsarchive/engine.dataset.datagouv.useDemo'); + const apiBaseUrl = useDemo ? 
DEMO_API_BASE_URL : PRODUCTION_API_BASE_URL; + + if (useDemo) { + logger.warn('Using demo.data.gouv.fr environment for testing'); + } + + const headers = { 'X-API-KEY': apiKey }; + + return { datasetId, apiBaseUrl, headers }; +} + +export default async function publish({ archivePath, releaseDate, stats }) { + const config = loadConfiguration(); + + await updateDatasetMetadata({ ...config, releaseDate, stats }); + + const { resourceId, fileName } = await uploadResource({ ...config, archivePath }); + + await updateResourceMetadata({ ...config, resourceId, fileName }); + + const datasetUrl = await getDatasetUrl({ ...config }); + + return datasetUrl; +} From df66f7ca02dc3fecbb4e871f00ef42aa771f512a Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 13 Nov 2025 16:49:37 +0100 Subject: [PATCH 03/13] Allow publishing both on GitHub/GitLab & datagouv --- bin/ota-dataset.js | 4 +-- scripts/dataset/index.js | 9 +++++-- scripts/dataset/publish/index.js | 44 ++++++++++++++++++++++++++++---- 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/bin/ota-dataset.js b/bin/ota-dataset.js index 060649187..43328f6d8 100755 --- a/bin/ota-dataset.js +++ b/bin/ota-dataset.js @@ -11,9 +11,9 @@ import logger from '../src/logger/index.js'; program .name('ota dataset') - .description('Export the versions dataset into a ZIP file and optionally publish it to GitHub releases') + .description('Export the versions dataset into a ZIP file and optionally publish it to GitHub releases, GitLab releases, or data.gouv.fr') .option('-f, --file ', 'file name of the generated dataset') - .option('-p, --publish', 'publish dataset to GitHub releases on versions repository. Mandatory authentication to GitHub is provided through the `OTA_ENGINE_GITHUB_TOKEN` environment variable') + .option('-p, --publish', 'publish dataset. 
Supports GitHub releases (OTA_ENGINE_GITHUB_TOKEN), GitLab releases (OTA_ENGINE_GITLAB_TOKEN), or data.gouv.fr (OTA_ENGINE_DATAGOUV_API_KEY + config)') .option('-r, --remove-local-copy', 'remove local copy of dataset after publishing. Works only in combination with --publish option') .option('--schedule', 'schedule automatic dataset generation'); diff --git a/scripts/dataset/index.js b/scripts/dataset/index.js index 4c739686d..8f1cd18fc 100644 --- a/scripts/dataset/index.js +++ b/scripts/dataset/index.js @@ -24,13 +24,18 @@ export async function release({ shouldPublish, shouldRemoveLocalCopy, fileName } logger.info('Start publishing dataset…'); - const releaseUrl = await publishRelease({ + const results = await publishRelease({ archivePath, releaseDate, stats, }); - logger.info(`Dataset published to ${releaseUrl}`); + if (results.length > 0) { + logger.info('Dataset published to following platforms:'); + results.forEach(result => { + logger.info(` - ${result.platform}: ${result.url}`); + }); + } if (!shouldRemoveLocalCopy) { return; diff --git a/scripts/dataset/publish/index.js b/scripts/dataset/publish/index.js index 6ed8ead0f..0386f461f 100644 --- a/scripts/dataset/publish/index.js +++ b/scripts/dataset/publish/index.js @@ -1,15 +1,49 @@ +import config from 'config'; + +import logger from '../logger/index.js'; + +import publishDataGouv from './datagouv/index.js'; import publishGitHub from './github/index.js'; import publishGitLab from './gitlab/index.js'; -export default function publishRelease({ archivePath, releaseDate, stats }) { +export default async function publishRelease({ archivePath, releaseDate, stats }) { + const platforms = []; + // If both GitHub and GitLab tokens are defined, GitHub takes precedence if (process.env.OTA_ENGINE_GITHUB_TOKEN) { - return publishGitHub({ archivePath, releaseDate, stats }); + platforms.push({ name: 'GitHub', publish: () => publishGitHub({ archivePath, releaseDate, stats }) }); + } else if 
(process.env.OTA_ENGINE_GITLAB_TOKEN) { + platforms.push({ name: 'GitLab', publish: () => publishGitLab({ archivePath, releaseDate, stats }) }); + } + + if (process.env.OTA_ENGINE_DATAGOUV_API_KEY && config.get('@opentermsarchive/engine.dataset.datagouv.datasetId')) { + platforms.push({ name: 'data.gouv.fr', publish: () => publishDataGouv({ archivePath, releaseDate, stats }) }); + } + + if (!platforms.length) { + throw new Error('No publishing platform configured. Please configure at least one of: GitHub (OTA_ENGINE_GITHUB_TOKEN), GitLab (OTA_ENGINE_GITLAB_TOKEN), or data.gouv.fr (OTA_ENGINE_DATAGOUV_API_KEY + datasetId in config).'); } - if (process.env.OTA_ENGINE_GITLAB_TOKEN) { - return publishGitLab({ archivePath, releaseDate, stats }); + const results = await Promise.allSettled(platforms.map(async platform => { + const url = await platform.publish(); + + return { platform: platform.name, url }; + })); + + const succeeded = results.filter(result => result.status === 'fulfilled'); + const failed = results.filter(result => result.status === 'rejected'); + + if (failed.length) { + let errorMessage = !succeeded.length ? 'All platforms failed to publish:' : 'Some platforms failed to publish:'; + + failed.forEach(rejectedResult => { + const index = results.indexOf(rejectedResult); + + errorMessage += `\n - ${platforms[index].name}: ${rejectedResult.reason.message}`; + }); + + logger.error(errorMessage); } - throw new Error('No GitHub nor GitLab token found in environment variables (OTA_ENGINE_GITHUB_TOKEN or OTA_ENGINE_GITLAB_TOKEN). 
Cannot publish the dataset without authentication.'); + return succeeded.map(result => result.value); } From e303ea31fa4c2730aea221691827c4b11d844258 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 13 Nov 2025 17:17:03 +0100 Subject: [PATCH 04/13] Add changelog entry --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6601e83c0..fce227424 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,15 @@ All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased [minor] + +> Development of this release was supported by the [French Ministry for Foreign Affairs](https://www.diplomatie.gouv.fr/fr/politique-etrangere-de-la-france/diplomatie-numerique/) through its ministerial [State Startups incubator](https://beta.gouv.fr/startups/open-terms-archive.html) under the aegis of the Ambassador for Digital Affairs. 
+ +### Added + +- Add support for publishing datasets to data.gouv.fr; configure `dataset.datagouv.datasetId` in configuration file and set `OTA_ENGINE_DATAGOUV_API_KEY` environment variable +- Add ability to publish datasets to multiple platforms simultaneously; datasets can now be published to GitHub (or GitLab) and data.gouv.fr in parallel + ## 10.0.1 - 2025-11-24 _Full changeset and discussions: [#1208](https://github.com/OpenTermsArchive/engine/pull/1208)._ From 30210a219ea1f24539c23158671804ca813ce2ad Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 20 Nov 2025 11:12:44 +0100 Subject: [PATCH 05/13] Add module prefix to dataset logs --- scripts/dataset/logger/index.js | 14 ++++++++++++-- scripts/dataset/publish/datagouv/dataset.js | 4 +++- scripts/dataset/publish/datagouv/index.js | 3 ++- scripts/dataset/publish/github/index.js | 3 +++ scripts/dataset/publish/gitlab/index.js | 4 +++- 5 files changed, 23 insertions(+), 5 deletions(-) diff --git a/scripts/dataset/logger/index.js b/scripts/dataset/logger/index.js index 5de6d0b36..dc14b7a4d 100644 --- a/scripts/dataset/logger/index.js +++ b/scripts/dataset/logger/index.js @@ -8,11 +8,13 @@ const { combine, timestamp, printf, colorize } = winston.format; logger.format = combine( colorize(), timestamp({ format: 'YYYY-MM-DDTHH:mm:ssZ' }), - printf(({ level, message, counter, hash, timestamp }) => { - const prefix = counter && hash ? `${counter.toString().padEnd(6)} ${hash.padEnd(40)}` : ''; + printf(({ level, message, counter, hash, timestamp, module }) => { + let prefix = counter && hash ? `${counter.toString().padEnd(6)} ${hash.padEnd(40)}` : ''; const timestampPrefix = config.get('@opentermsarchive/engine.logger.timestampPrefix') ? `${timestamp} ` : ''; + prefix = module ? 
`${module} ${prefix}` : prefix; + const levelStr = level.padEnd(15); let coloredLevel = levelStr; let coloredMessage = message; @@ -29,4 +31,12 @@ logger.format = combine( }), ); +export function createModuleLogger(moduleName) { + return { + info: message => logger.info(message, { module: moduleName }), + warn: message => logger.warn(message, { module: moduleName }), + error: message => logger.error(message, { module: moduleName }), + }; +} + export default logger; diff --git a/scripts/dataset/publish/datagouv/dataset.js b/scripts/dataset/publish/datagouv/dataset.js index 9c059a29c..a2f3f39f6 100644 --- a/scripts/dataset/publish/datagouv/dataset.js +++ b/scripts/dataset/publish/datagouv/dataset.js @@ -5,7 +5,9 @@ import FormData from 'form-data'; import nodeFetch from 'node-fetch'; import * as readme from '../../assets/README.template.js'; -import logger from '../../logger/index.js'; +import { createModuleLogger } from '../../logger/index.js'; + +const logger = createModuleLogger('datagouv'); const DATASET_LICENSE = 'odc-odbl'; const DEFAULT_RESOURCE_DESCRIPTION = 'See README.md inside the archive for dataset structure and usage information.'; diff --git a/scripts/dataset/publish/datagouv/index.js b/scripts/dataset/publish/datagouv/index.js index 1868086ee..220e2a229 100644 --- a/scripts/dataset/publish/datagouv/index.js +++ b/scripts/dataset/publish/datagouv/index.js @@ -1,8 +1,9 @@ import config from 'config'; -import logger from '../../logger/index.js'; +import { createModuleLogger } from '../../logger/index.js'; import { updateDatasetMetadata, uploadResource, updateResourceMetadata, getDatasetUrl } from './dataset.js'; +const logger = createModuleLogger('datagouv'); const PRODUCTION_API_BASE_URL = 'https://www.data.gouv.fr/api/1'; const DEMO_API_BASE_URL = 'https://demo.data.gouv.fr/api/1'; diff --git a/scripts/dataset/publish/github/index.js b/scripts/dataset/publish/github/index.js index 6d83ac47a..ac26a6979 100644 --- 
a/scripts/dataset/publish/github/index.js +++ b/scripts/dataset/publish/github/index.js @@ -6,6 +6,9 @@ import config from 'config'; import { Octokit } from 'octokit'; // eslint-disable-line import/no-unresolved import * as readme from '../../assets/README.template.js'; +import { createModuleLogger } from '../../logger/index.js'; + +const logger = createModuleLogger('github'); export default async function publish({ archivePath, releaseDate, stats }) { const octokit = new Octokit({ auth: process.env.OTA_ENGINE_GITHUB_TOKEN }); diff --git a/scripts/dataset/publish/gitlab/index.js b/scripts/dataset/publish/gitlab/index.js index ba8f2f3d9..7e45a907c 100644 --- a/scripts/dataset/publish/gitlab/index.js +++ b/scripts/dataset/publish/gitlab/index.js @@ -8,7 +8,9 @@ import nodeFetch from 'node-fetch'; import GitLab from '../../../../src/reporter/gitlab/index.js'; import * as readme from '../../assets/README.template.js'; -import logger from '../../logger/index.js'; +import { createModuleLogger } from '../../logger/index.js'; + +const logger = createModuleLogger('gitlab'); dotenv.config({ quiet: true }); From 3a3c73aea9847c6d5227b9805af49813d08f458d Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 20 Nov 2025 11:16:18 +0100 Subject: [PATCH 06/13] Unify logs between publish modules --- scripts/dataset/publish/github/index.js | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/dataset/publish/github/index.js b/scripts/dataset/publish/github/index.js index ac26a6979..25171eeb4 100644 --- a/scripts/dataset/publish/github/index.js +++ b/scripts/dataset/publish/github/index.js @@ -17,6 +17,8 @@ export default async function publish({ archivePath, releaseDate, stats }) { const tagName = `${path.basename(archivePath, path.extname(archivePath))}`; // use archive filename as Git tag + logger.info(`Creating release for ${owner}/${repo}…`); + const { data: { upload_url: uploadUrl, html_url: releaseUrl } } = await octokit.rest.repos.createRelease({ owner, repo, @@ 
-25,6 +27,9 @@ export default async function publish({ archivePath, releaseDate, stats }) { body: readme.body(stats), }); + logger.info(`Release created successfully with tag: ${tagName}`); + logger.info('Uploading release asset…'); + await octokit.rest.repos.uploadReleaseAsset({ data: fsApi.readFileSync(archivePath), headers: { @@ -35,5 +40,7 @@ export default async function publish({ archivePath, releaseDate, stats }) { url: uploadUrl, }); + logger.info(`Release asset uploaded successfully: ${path.basename(archivePath)}`); + return releaseUrl; } From d3b697ee5032efcb74cf4ee9d2432b784576ecbe Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 20 Nov 2025 11:17:20 +0100 Subject: [PATCH 07/13] Sanitize title for archive name --- scripts/dataset/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/dataset/index.js b/scripts/dataset/index.js index 8f1cd18fc..67390d95e 100644 --- a/scripts/dataset/index.js +++ b/scripts/dataset/index.js @@ -9,7 +9,7 @@ import publishRelease from './publish/index.js'; export async function release({ shouldPublish, shouldRemoveLocalCopy, fileName }) { const releaseDate = new Date(); - const archiveName = fileName || `dataset-${config.get('@opentermsarchive/engine.dataset.title')}-${releaseDate.toISOString().replace(/T.*/, '')}`; + const archiveName = fileName || `${config.get('@opentermsarchive/engine.dataset.title').toLowerCase().replace(/[^a-zA-Z0-9.\-_]/g, '-')}-${releaseDate.toISOString().replace(/T.*/, '')}`; const archivePath = `${path.basename(archiveName, '.zip')}.zip`; // allow to pass filename or filename.zip as the archive name and have filename.zip as the result name logger.info('Start exporting dataset…'); From 37ad51282edfb590e45869bee5b470d02a8965c4 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 20 Nov 2025 11:17:20 +0100 Subject: [PATCH 08/13] Update default dataset title --- config/default.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/config/default.json b/config/default.json index 637e9dafb..3cac8ea15 100644 --- a/config/default.json +++ b/config/default.json @@ -56,7 +56,7 @@ } }, "dataset": { - "title": "sandbox", + "title": "Sandbox collection dataset", "versionsRepositoryURL": "https://github.com/OpenTermsArchive/sandbox-declarations", "publishingSchedule": "30 8 * * MON" } From 3764459aca0647827ac9f337969144ffa7ae7577 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 20 Nov 2025 11:17:55 +0100 Subject: [PATCH 09/13] Do not extend dataset title --- scripts/dataset/assets/README.template.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/dataset/assets/README.template.js b/scripts/dataset/assets/README.template.js index 1c63d117c..abfc87e6b 100644 --- a/scripts/dataset/assets/README.template.js +++ b/scripts/dataset/assets/README.template.js @@ -14,7 +14,7 @@ export function title({ releaseDate }) { const title = config.get('@opentermsarchive/engine.dataset.title'); - return `${title} — ${releaseDate} dataset`; + return `${title} — ${releaseDate}`; } export function body({ servicesCount, firstVersionDate, lastVersionDate }) { From 86ff4fc70456c019c59cd79f9b6cf50cd1f854d4 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 20 Nov 2025 11:17:55 +0100 Subject: [PATCH 10/13] Improve datagouv module Allow passing either datasetId or organizationIdOrSlug --- scripts/dataset/publish/datagouv/dataset.js | 179 ++++++++++++++++---- scripts/dataset/publish/datagouv/index.js | 60 +++++-- scripts/dataset/publish/index.js | 4 +- 3 files changed, 197 insertions(+), 46 deletions(-) diff --git a/scripts/dataset/publish/datagouv/dataset.js b/scripts/dataset/publish/datagouv/dataset.js index a2f3f39f6..88c9652bf 100644 --- a/scripts/dataset/publish/datagouv/dataset.js +++ b/scripts/dataset/publish/datagouv/dataset.js @@ -4,7 +4,6 @@ import path from 'path'; import FormData from 'form-data'; import nodeFetch from 'node-fetch'; -import * as readme from 
'../../assets/README.template.js'; import { createModuleLogger } from '../../logger/index.js'; const logger = createModuleLogger('datagouv'); @@ -12,11 +11,113 @@ const logger = createModuleLogger('datagouv'); const DATASET_LICENSE = 'odc-odbl'; const DEFAULT_RESOURCE_DESCRIPTION = 'See README.md inside the archive for dataset structure and usage information.'; -export async function updateDatasetMetadata({ apiBaseUrl, headers, datasetId, releaseDate, stats }) { +const routes = { + dataset: (apiBaseUrl, datasetId) => `${apiBaseUrl}/datasets/${datasetId}/`, + datasets: apiBaseUrl => `${apiBaseUrl}/datasets/`, + datasetUpload: (apiBaseUrl, datasetId) => `${apiBaseUrl}/datasets/${datasetId}/upload/`, + resource: (apiBaseUrl, datasetId, resourceId) => `${apiBaseUrl}/datasets/${datasetId}/resources/${resourceId}/`, + resourceUpload: (apiBaseUrl, datasetId, resourceId) => `${apiBaseUrl}/datasets/${datasetId}/resources/${resourceId}/upload/`, + organization: (apiBaseUrl, organizationIdOrSlug) => `${apiBaseUrl}/organizations/${organizationIdOrSlug}/`, + organizationDatasets: (apiBaseUrl, organizationId) => `${apiBaseUrl}/organizations/${organizationId}/datasets/?page_size=100`, +}; + +export async function getOrganization({ apiBaseUrl, headers, organizationIdOrSlug }) { + logger.info(`Fetching organization: ${organizationIdOrSlug}…`); + + const orgResponse = await nodeFetch(routes.organization(apiBaseUrl, organizationIdOrSlug), { headers }); + + if (!orgResponse.ok) { + const errorText = await orgResponse.text(); + + throw new Error(`Failed to retrieve organization: ${orgResponse.status} ${orgResponse.statusText} - ${errorText}`); + } + + const orgData = await orgResponse.json(); + + logger.info(`Found organization: ${orgData.name} (ID: ${orgData.id})`); + + return orgData; +} + +export async function getDataset({ apiBaseUrl, headers, datasetId }) { + const datasetResponse = await nodeFetch(routes.dataset(apiBaseUrl, datasetId), { headers }); + + if (!datasetResponse.ok) { 
+ const errorText = await datasetResponse.text(); + const error = new Error(`Failed to retrieve dataset: ${datasetResponse.status} ${datasetResponse.statusText} - ${errorText}`); + + error.statusCode = datasetResponse.status; + throw error; + } + + const datasetData = await datasetResponse.json(); + + return datasetData; +} + +export async function findDatasetByTitle({ apiBaseUrl, headers, organizationId, title }) { + logger.info(`Searching for dataset with title "${title}" in organization…`); + + const searchResponse = await nodeFetch(routes.organizationDatasets(apiBaseUrl, organizationId), { headers }); + + if (!searchResponse.ok) { + const errorText = await searchResponse.text(); + + throw new Error(`Failed to search for datasets: ${searchResponse.status} ${searchResponse.statusText} - ${errorText}`); + } + + const searchData = await searchResponse.json(); + + const dataset = searchData.data.find(ds => ds.title === title); + + if (dataset) { + logger.info(`Found existing dataset: ${dataset.title} (ID: ${dataset.id})`); + + return dataset; + } + + logger.info('No existing dataset found with this title'); + + return null; +} + +export async function createDataset({ apiBaseUrl, headers, organizationId, title, description, license, frequency }) { + logger.info(`Creating new dataset: ${title}…`); + + const createResponse = await nodeFetch(routes.datasets(apiBaseUrl), { + method: 'POST', + headers: { + ...headers, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + title, + description, + organization: organizationId, + license, + frequency, + }), + }); + + if (!createResponse.ok) { + const errorText = await createResponse.text(); + + throw new Error(`Failed to create dataset: ${createResponse.status} ${createResponse.statusText} - ${errorText}`); + } + + const dataset = await createResponse.json(); + + logger.info(`Dataset created successfully: ${dataset.title} (ID: ${dataset.id})`); + + return dataset; +} + +export async function 
updateDatasetMetadata({ apiBaseUrl, headers, datasetId, title, description, stats, frequency }) { const updatePayload = { - title: readme.title({ releaseDate }), - description: readme.body(stats), + title, + description, license: DATASET_LICENSE, + frequency, }; if (stats?.firstVersionDate && stats?.lastVersionDate) { @@ -26,7 +127,7 @@ export async function updateDatasetMetadata({ apiBaseUrl, headers, datasetId, re }; } - const updateResponse = await nodeFetch(`${apiBaseUrl}/datasets/${datasetId}/`, { + const updateResponse = await nodeFetch(routes.dataset(apiBaseUrl, datasetId), { method: 'PUT', headers: { ...headers, @@ -37,25 +138,21 @@ export async function updateDatasetMetadata({ apiBaseUrl, headers, datasetId, re if (!updateResponse.ok) { const errorText = await updateResponse.text(); + const error = new Error(`Failed to update dataset metadata: ${updateResponse.status} ${updateResponse.statusText} - ${errorText}`); - throw new Error(`Failed to update dataset metadata: ${updateResponse.status} ${updateResponse.statusText} - ${errorText}`); + error.statusCode = updateResponse.status; + throw error; } + + logger.info('Dataset metadata updated successfully'); } export async function uploadResource({ apiBaseUrl, headers, datasetId, archivePath }) { logger.info('Uploading dataset archive…'); - const formData = new FormData(); - const fileName = path.basename(archivePath); - const fileStats = fsApi.statSync(archivePath); + const { formData, fileName } = createFormDataForFile(archivePath); - formData.append('file', fsApi.createReadStream(archivePath), { - filename: fileName, - contentType: 'application/zip', - knownLength: fileStats.size, - }); - - const uploadResponse = await nodeFetch(`${apiBaseUrl}/datasets/${datasetId}/upload/`, { + const uploadResponse = await nodeFetch(routes.datasetUpload(apiBaseUrl, datasetId), { method: 'POST', headers: { ...formData.getHeaders(), ...headers }, body: formData, @@ -74,10 +171,34 @@ export async function uploadResource({ 
apiBaseUrl, headers, datasetId, archivePa return { resourceId: uploadResult.id, fileName }; } +export async function replaceResourceFile({ apiBaseUrl, headers, datasetId, resourceId, archivePath }) { + logger.info(`Replacing file for existing resource ID: ${resourceId}…`); + + const { formData, fileName } = createFormDataForFile(archivePath); + + const uploadResponse = await nodeFetch(routes.resourceUpload(apiBaseUrl, datasetId, resourceId), { + method: 'POST', + headers: { ...formData.getHeaders(), ...headers }, + body: formData, + }); + + if (!uploadResponse.ok) { + const errorText = await uploadResponse.text(); + + throw new Error(`Failed to replace resource file: ${uploadResponse.status} ${uploadResponse.statusText} - ${errorText}`); + } + + const uploadResult = await uploadResponse.json(); + + logger.info('Resource file replaced successfully'); + + return { resourceId: uploadResult.id, fileName }; +} + export async function updateResourceMetadata({ apiBaseUrl, headers, datasetId, resourceId, fileName }) { logger.info('Updating resource metadata…'); - const resourceUpdateResponse = await nodeFetch(`${apiBaseUrl}/datasets/${datasetId}/resources/${resourceId}/`, { + const resourceUpdateResponse = await nodeFetch(routes.resource(apiBaseUrl, datasetId, resourceId), { method: 'PUT', headers: { ...headers, 'Content-Type': 'application/json' }, body: JSON.stringify({ @@ -98,20 +219,16 @@ export async function updateResourceMetadata({ apiBaseUrl, headers, datasetId, r logger.info('Resource metadata updated successfully'); } -export async function getDatasetUrl({ apiBaseUrl, headers, datasetId }) { - const datasetResponse = await nodeFetch(`${apiBaseUrl}/datasets/${datasetId}/`, { - method: 'GET', - headers: { ...headers }, - }); - - if (!datasetResponse.ok) { - const errorText = await datasetResponse.text(); - - throw new Error(`Failed to retrieve dataset URL: ${datasetResponse.status} ${datasetResponse.statusText} - ${errorText}`); - } +function 
createFormDataForFile(archivePath) { + const formData = new FormData(); + const fileName = path.basename(archivePath); + const fileStats = fsApi.statSync(archivePath); - const datasetData = await datasetResponse.json(); - const datasetUrl = datasetData.page; + formData.append('file', fsApi.createReadStream(archivePath), { + filename: fileName, + contentType: 'application/zip', + knownLength: fileStats.size, + }); - return datasetUrl; + return { formData, fileName }; } diff --git a/scripts/dataset/publish/datagouv/index.js b/scripts/dataset/publish/datagouv/index.js index 220e2a229..c41cf7f41 100644 --- a/scripts/dataset/publish/datagouv/index.js +++ b/scripts/dataset/publish/datagouv/index.js @@ -1,12 +1,34 @@ import config from 'config'; +import * as readme from '../../assets/README.template.js'; import { createModuleLogger } from '../../logger/index.js'; -import { updateDatasetMetadata, uploadResource, updateResourceMetadata, getDatasetUrl } from './dataset.js'; +import { updateDatasetMetadata, uploadResource, replaceResourceFile, updateResourceMetadata, getDataset, getOrganization, findDatasetByTitle, createDataset } from './dataset.js'; + const logger = createModuleLogger('datagouv'); const PRODUCTION_API_BASE_URL = 'https://www.data.gouv.fr/api/1'; const DEMO_API_BASE_URL = 'https://demo.data.gouv.fr/api/1'; +const DATASET_LICENSE = 'odc-odbl'; + +export default async function publish({ archivePath, stats }) { + const { datasetId, organizationIdOrSlug, apiBaseUrl, headers, datasetTitle, frequency } = loadConfiguration(); + const description = readme.body(stats); + + const dataset = datasetId + ? 
await getDataset({ apiBaseUrl, headers, datasetId }) + : await ensureDatasetExists({ apiBaseUrl, headers, organizationIdOrSlug, datasetTitle, description, frequency }); + + await updateDatasetMetadata({ apiBaseUrl, headers, datasetId: dataset.id, title: datasetTitle, description, stats, frequency }); + + const { resourceId, fileName } = await handleResourceUpload({ apiBaseUrl, headers, datasetId: dataset.id, dataset, archivePath }); + + await updateResourceMetadata({ apiBaseUrl, headers, datasetId: dataset.id, resourceId, fileName }); + + logger.info(`Dataset published successfully: ${dataset.page}`); + + return dataset.page; +} function loadConfiguration() { const apiKey = process.env.OTA_ENGINE_DATAGOUV_API_KEY; @@ -15,13 +37,16 @@ function loadConfiguration() { throw new Error('OTA_ENGINE_DATAGOUV_API_KEY environment variable is required for data.gouv.fr publishing'); } - const datasetId = config.get('@opentermsarchive/engine.dataset.datagouv.datasetId'); + const datasetId = config.has('@opentermsarchive/engine.dataset.datagouv.datasetId') && config.get('@opentermsarchive/engine.dataset.datagouv.datasetId'); + const organizationIdOrSlug = config.has('@opentermsarchive/engine.dataset.datagouv.organizationIdOrSlug') && config.get('@opentermsarchive/engine.dataset.datagouv.organizationIdOrSlug'); - if (!datasetId) { - throw new Error('datasetId is required in config at @opentermsarchive/engine.dataset.datagouv.datasetId. 
Run "node scripts/dataset/publish/datagouv/create-dataset.js" to create a dataset first.'); + if (!datasetId && !organizationIdOrSlug) { + throw new Error('Either datasetId or organizationIdOrSlug is required in config at @opentermsarchive/engine.dataset.datagouv'); } - const useDemo = config.get('@opentermsarchive/engine.dataset.datagouv.useDemo'); + const datasetTitle = config.get('@opentermsarchive/engine.dataset.title'); + const frequency = config.has('@opentermsarchive/engine.dataset.datagouv.frequency') && config.get('@opentermsarchive/engine.dataset.datagouv.frequency'); + const useDemo = config.has('@opentermsarchive/engine.dataset.datagouv.useDemo') && config.get('@opentermsarchive/engine.dataset.datagouv.useDemo'); const apiBaseUrl = useDemo ? DEMO_API_BASE_URL : PRODUCTION_API_BASE_URL; if (useDemo) { @@ -30,19 +55,28 @@ function loadConfiguration() { const headers = { 'X-API-KEY': apiKey }; - return { datasetId, apiBaseUrl, headers }; + return { datasetId, organizationIdOrSlug, apiBaseUrl, headers, datasetTitle, frequency }; } -export default async function publish({ archivePath, releaseDate, stats }) { - const config = loadConfiguration(); +async function ensureDatasetExists({ apiBaseUrl, headers, organizationIdOrSlug, datasetTitle, description, frequency }) { + const organization = await getOrganization({ apiBaseUrl, headers, organizationIdOrSlug }); + let dataset = await findDatasetByTitle({ apiBaseUrl, headers, organizationId: organization.id, title: datasetTitle }); + + if (!dataset) { + dataset = await createDataset({ apiBaseUrl, headers, organizationId: organization.id, title: datasetTitle, description, license: DATASET_LICENSE, frequency }); + } - await updateDatasetMetadata({ ...config, releaseDate, stats }); + return dataset; +} - const { resourceId, fileName } = await uploadResource({ ...config, archivePath }); +function handleResourceUpload({ apiBaseUrl, headers, datasetId, dataset, archivePath }) { + if (dataset?.resources?.length > 0) { + 
const existingResource = dataset.resources[0]; - await updateResourceMetadata({ ...config, resourceId, fileName }); + logger.info(`Found existing resource: ${existingResource.title} (ID: ${existingResource.id})`); - const datasetUrl = await getDatasetUrl({ ...config }); + return replaceResourceFile({ apiBaseUrl, headers, datasetId, resourceId: existingResource.id, archivePath }); + } - return datasetUrl; + return uploadResource({ apiBaseUrl, headers, datasetId, archivePath }); } diff --git a/scripts/dataset/publish/index.js b/scripts/dataset/publish/index.js index 0386f461f..79f752a12 100644 --- a/scripts/dataset/publish/index.js +++ b/scripts/dataset/publish/index.js @@ -16,12 +16,12 @@ export default async function publishRelease({ archivePath, releaseDate, stats } platforms.push({ name: 'GitLab', publish: () => publishGitLab({ archivePath, releaseDate, stats }) }); } - if (process.env.OTA_ENGINE_DATAGOUV_API_KEY && config.get('@opentermsarchive/engine.dataset.datagouv.datasetId')) { + if (process.env.OTA_ENGINE_DATAGOUV_API_KEY && (config.has('@opentermsarchive/engine.dataset.datagouv.datasetId') || config.has('@opentermsarchive/engine.dataset.datagouv.organizationIdOrSlug'))) { platforms.push({ name: 'data.gouv.fr', publish: () => publishDataGouv({ archivePath, releaseDate, stats }) }); } if (!platforms.length) { - throw new Error('No publishing platform configured. Please configure at least one of: GitHub (OTA_ENGINE_GITHUB_TOKEN), GitLab (OTA_ENGINE_GITLAB_TOKEN), or data.gouv.fr (OTA_ENGINE_DATAGOUV_API_KEY + datasetId in config).'); + throw new Error('No publishing platform configured. 
Please configure at least one of: GitHub (OTA_ENGINE_GITHUB_TOKEN), GitLab (OTA_ENGINE_GITLAB_TOKEN), or data.gouv.fr (OTA_ENGINE_DATAGOUV_API_KEY + datasetId or organizationIdOrSlug in config).'); } const results = await Promise.allSettled(platforms.map(async platform => { From dc4656d1581e8ff4e084b5f73585fb04814f76b1 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 20 Nov 2025 12:11:10 +0100 Subject: [PATCH 11/13] Fix deprecated url.parse() --- scripts/dataset/publish/github/index.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/dataset/publish/github/index.js b/scripts/dataset/publish/github/index.js index 25171eeb4..5aaec8ad5 100644 --- a/scripts/dataset/publish/github/index.js +++ b/scripts/dataset/publish/github/index.js @@ -1,6 +1,5 @@ import fsApi from 'fs'; import path from 'path'; -import url from 'url'; import config from 'config'; import { Octokit } from 'octokit'; // eslint-disable-line import/no-unresolved @@ -13,7 +12,7 @@ const logger = createModuleLogger('github'); export default async function publish({ archivePath, releaseDate, stats }) { const octokit = new Octokit({ auth: process.env.OTA_ENGINE_GITHUB_TOKEN }); - const [ owner, repo ] = url.parse(config.get('@opentermsarchive/engine.dataset.versionsRepositoryURL')).pathname.split('/').filter(component => component); + const [ owner, repo ] = new URL(config.get('@opentermsarchive/engine.dataset.versionsRepositoryURL')).pathname.split('/').filter(component => component); const tagName = `${path.basename(archivePath, path.extname(archivePath))}`; // use archive filename as Git tag From c109d21bee178e7893732302a2b6a0b67d88084c Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 20 Nov 2025 11:18:27 +0100 Subject: [PATCH 12/13] Update changelog entry --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fce227424..b74b24d68 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ All changes 
that impact users of this module are documented in this file, in the ### Added -- Add support for publishing datasets to data.gouv.fr; configure `dataset.datagouv.datasetId` in configuration file and set `OTA_ENGINE_DATAGOUV_API_KEY` environment variable +- Add support for publishing datasets to data.gouv.fr; configure `dataset.datagouv.datasetId` or `dataset.datagouv.organizationIdOrSlug` in configuration file and set `OTA_ENGINE_DATAGOUV_API_KEY` environment variable - Add ability to publish datasets to multiple platforms simultaneously; datasets can now be published to GitHub (or GitLab) and data.gouv.fr in parallel ## 10.0.1 - 2025-11-24 From d14c5860b4ad15aa75d0dfcaf9f08147d435be1f Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 20 Nov 2025 16:17:41 +0100 Subject: [PATCH 13/13] Fix test --- scripts/dataset/export/test/fixtures/dataset/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/dataset/export/test/fixtures/dataset/README.md b/scripts/dataset/export/test/fixtures/dataset/README.md index b186c3829..16cd5ec89 100644 --- a/scripts/dataset/export/test/fixtures/dataset/README.md +++ b/scripts/dataset/export/test/fixtures/dataset/README.md @@ -1,4 +1,4 @@ -# Open Terms Archive — sandbox — January 1, 2022 dataset +# Open Terms Archive — sandbox — January 1, 2022 This dataset consolidates the contractual documents of 2 service providers, in all their versions that were accessible online between January 1, 2021 and January 6, 2022.