Skip to content
103 changes: 74 additions & 29 deletions udata/harvest/backends/dcat.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from abc import abstractmethod
from datetime import date
from typing import ClassVar, Generator

Expand All @@ -15,6 +16,7 @@
from udata.rdf import (
DCAT,
DCT,
GEODCAT,
HYDRA,
SPDX,
guess_format,
Expand All @@ -23,6 +25,7 @@
url_from_rdf,
)
from udata.storage.s3 import store_as_json
from udata.utils import to_bool

from .base import BaseBackend, HarvestExtraConfig

Expand Down Expand Up @@ -125,7 +128,7 @@ def inner_harvest(self):
else:
self.job.data["graphs"] = serialized_graphs

def get_format(self):
def get_format(self) -> str:
fmt = guess_format(self.source.url)
# if format can't be guessed from the url
# we fallback on the declared Content-Type
Expand Down Expand Up @@ -251,14 +254,21 @@ def get_node_from_item(self, graph, item):
raise ValueError(f"Unable to find dataset with DCT.identifier:{item.remote_id}")


class CswDcatBackend(DcatBackend):
class BaseCswDcatBackend(DcatBackend):
"""
CSW harvester fetching records as DCAT.
The parsing of items is then the same as for the DcatBackend.
Abstract base CSW to DCAT harvester.

Once items are retrieved from CSW, the parsing of these items is the same as DcatBackend.
"""

name = "csw-dcat"
display_name = "CSW-DCAT"
extra_configs = (
HarvestExtraConfig(
_("Remote URL prefix"),
"remote_url_prefix",
str,
_("A prefix used to build the remote URL of the harvested items."),
),
)

# CSW_REQUEST is based on:
# - Request syntax from spec [1] and example requests [1] [2].
Expand Down Expand Up @@ -324,8 +334,6 @@ class CswDcatBackend(DcatBackend):
</csw:GetRecords>
"""

CSW_OUTPUT_SCHEMA = "http://www.w3.org/ns/dcat#"

SAXON_SECURITY_FEATURES = {
"http://saxon.sf.net/feature/allow-external-functions": "false",
"http://saxon.sf.net/feature/parserFeature?uri=http://apache.org/xml/features/nonvalidating/load-external-dtd": "false",
Expand All @@ -344,15 +352,36 @@ def __init__(self, *args, **kwargs):
self.xpath_proc = self.saxon_proc.new_xpath_processor()
self.xpath_proc.declare_namespace("csw", CSW_NAMESPACE)

@property
@abstractmethod
def output_schema(self) -> str:
"""
Return the CSW `outputSchema` property.

The value is substituted into the `GetRecords` request body built by `walk_graph`.
Concrete backends must override this property.
"""
pass

@abstractmethod
def as_dcat(self, tree: PyXdmNode) -> PyXdmNode:
"""
Return the input tree as a DCAT tree.

Backends whose records are already DCAT can return `tree` unchanged;
backends fetching records in another schema must convert them to DCAT here.
"""
pass

@override
def get_format(self) -> str:
"""
Return `"xml"` unconditionally as the format for this backend.
"""
return "xml"

@override
def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]:
"""
Yield all RDF pages as `Graph` from the source
Yield all RDF pages as `Graph` from the source.
"""
output_schema = self.output_schema
page_number = 0
start = 1

while True:
data = self.CSW_REQUEST.format(output_schema=self.CSW_OUTPUT_SCHEMA, start=start)
data = self.CSW_REQUEST.format(output_schema=output_schema, start=start)
response = self.post(url, data=data, headers={"Content-Type": "application/xml"})
response.raise_for_status()

Expand Down Expand Up @@ -386,19 +415,11 @@ def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, N
return

page_number += 1
start = self.next_position(start, search_results)
start = self._next_position(start, search_results)
if not start:
return

def as_dcat(self, tree: PyXdmNode) -> PyXdmNode:
"""
Return the input tree as a DCAT tree.
For CswDcatBackend, this method returns the incoming tree as-is, since it's already DCAT.
For subclasses of CswDcatBackend, this method should convert the incoming tree to DCAT.
"""
return tree

def next_position(self, start: int, search_results: PyXdmNode) -> int | None:
def _next_position(self, start: int, search_results: PyXdmNode) -> int | None:
next_record = int(search_results.get_attribute_value("nextRecord"))
matched_count = int(search_results.get_attribute_value("numberOfRecordsMatched"))
returned_count = int(search_results.get_attribute_value("numberOfRecordsReturned"))
Expand All @@ -420,25 +441,44 @@ def next_position(self, start: int, search_results: PyXdmNode) -> int | None:
return None if should_break else next_record


class CswIso19139DcatBackend(CswDcatBackend):
class CswDcatBackend(BaseCswDcatBackend):
"""
CSW harvester fetching records as ISO-19139 and using XSLT to convert them to DCAT.
The parsing of items is then the same as for the DcatBackend.
CSW harvester fetching records as DCAT.
"""

name = "csw-iso-19139"
display_name = "CSW-ISO-19139"
name = "csw-dcat"
display_name = "CSW-DCAT"

extra_configs = (
*BaseCswDcatBackend.extra_configs,
HarvestExtraConfig(
_("Remote URL prefix"),
"remote_url_prefix",
_("GeoDCAT-AP"),
"enable_geodcat",
str,
_("A prefix used to build the remote URL of the harvested items."),
_("Request GeoDCAT-AP to the CSW server (must be supported by the server)."),
),
)

CSW_OUTPUT_SCHEMA = "http://www.isotc211.org/2005/gmd"
@property
@override
def output_schema(self):
    """
    Return the CSW `outputSchema` to request from the server.

    The GeoDCAT namespace is used when the `enable_geodcat` extra config is
    truthy (as interpreted by `to_bool`); the DCAT namespace is used otherwise.
    """
    geodcat_enabled = to_bool(self.get_extra_config_value("enable_geodcat"))
    namespace = GEODCAT if geodcat_enabled else DCAT
    return str(namespace)

@override
def as_dcat(self, tree: PyXdmNode) -> PyXdmNode:
"""
Return `tree` unchanged: no conversion is applied by this backend.
"""
return tree


class CswIso19139DcatBackend(BaseCswDcatBackend):
"""
CSW harvester fetching records as ISO-19139 and using XSLT to convert them to DCAT.
"""

name = "csw-iso-19139"
display_name = "CSW-ISO-19139"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -450,6 +490,11 @@ def __init__(self, *args, **kwargs):
"CoupledResourceLookUp", self.saxon_proc.make_string_value("disabled")
)

@property
@override
def output_schema(self) -> str:
"""
Return the `gmd` namespace URI used as the CSW `outputSchema` for this backend.
"""
return "http://www.isotc211.org/2005/gmd"

@override
def as_dcat(self, tree: PyXdmNode) -> PyXdmNode:
"""
Convert the fetched tree to DCAT by running it through `self.xslt_exec`.

Only the `head` of the transformation result is returned.
"""
return self.xslt_exec.transform_to_value(xdm_node=tree).head
Loading