diff --git a/api/serializers.py b/api/serializers.py index 203b0d368..452aaf5ff 100644 --- a/api/serializers.py +++ b/api/serializers.py @@ -1,5 +1,4 @@ -# SPDX-FileCopyrightText: 2025 Jonas Huber © Reiner Lemoine Institut -# SPDX-FileCopyrightText: 2025 Jonas Huber © Reiner Lemoine Institut +# SPDX-FileCopyrightText: 2025 Jonas Huber © Reiner Lemoine Institut # noqa: E501 # # SPDX-License-Identifier: AGPL-3.0-or-later @@ -10,7 +9,7 @@ from rest_framework import serializers from dataedit.helper import get_readable_table_name -from dataedit.models import Table +from dataedit.models import Dataset, Table from modelview.models import Energyframework, Energymodel from oeplatform.settings import URL @@ -164,3 +163,38 @@ def validate_dataset(self, value): raise serializers.ValidationError("Dataset names must be unique.") return value + + +class DatasetReadSerializer(serializers.ModelSerializer): + class Meta: + model = Dataset + fields = ["uuid", "name", "metadata", "created_at"] + + +class DatasetCreateSerializer(serializers.Serializer): + name = serializers.CharField() + title = serializers.CharField() + description = serializers.CharField() + at_id = serializers.URLField(required=False) + + +class DatasetAssignTablesSerializer(serializers.Serializer): + tables = serializers.ListField( + child=serializers.DictField(child=serializers.CharField()), min_length=1 + ) + + def validate_tables(self, value): + for item in value: + if "schema" not in item or "name" not in item: + raise serializers.ValidationError( + "Each table must have 'schema' and 'name'." 
+ ) + return value + + +class DatasetResourceSerializer(serializers.ModelSerializer): + schema = serializers.StringRelatedField() + + class Meta: + model = Table + fields = ["id", "schema", "name", "oemetadata", "human_readable_name"] diff --git a/api/services/dataset_creation.py b/api/services/dataset_creation.py new file mode 100644 index 000000000..58e22f4e3 --- /dev/null +++ b/api/services/dataset_creation.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: 2025 Jonas Huber © Reiner Lemoine Institut # noqa: E501 +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +from copy import deepcopy +from typing import Any + +from oemetadata.v2.v20.example import OEMETADATA_V20_EXAMPLE +from oemetadata.v2.v20.template import OEMETADATA_V20_TEMPLATE + + +def assemble_dataset_metadata( + validated_data: dict[str, Any], oemetadata: dict = OEMETADATA_V20_TEMPLATE +) -> dict[str, Any]: + # set the context + oemetadata = deepcopy(oemetadata) + oemetadata["@context"] = OEMETADATA_V20_EXAMPLE["@context"] + oemetadata["resources"] = [] # Remove resources + + oemetadata["@id"] = validated_data.get("at_id") + oemetadata["name"] = validated_data["name"] + oemetadata["title"] = validated_data["title"] + oemetadata["description"] = validated_data["description"] + + return oemetadata diff --git a/api/tests/test_datasets_api.py b/api/tests/test_datasets_api.py new file mode 100644 index 000000000..4cc457de5 --- /dev/null +++ b/api/tests/test_datasets_api.py @@ -0,0 +1,176 @@ +# SPDX-FileCopyrightText: 2025 Jonas Huber © Reiner Lemoine Institut # noqa: E501 +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +from copy import deepcopy + +from oemetadata.latest.template import OEMETADATA_LATEST_TEMPLATE +from rest_framework import status +from rest_framework.test import APITestCase + +from dataedit.models import Dataset, Schema, Table + + +class DatasetAPITests(APITestCase): + def setUpDatasetMetadata(self, dataset_name: str): + metadata = deepcopy(OEMETADATA_LATEST_TEMPLATE) + + 
metadata["name"] = dataset_name + metadata["resources"] = [] + + return metadata + + def setUpResourceMetadata(self, table_name: str): + metadata = deepcopy(OEMETADATA_LATEST_TEMPLATE) + + metadata["resources"][0]["name"] = table_name + + return metadata + + def test_create_dataset(self): + payload = { + "name": "test_dataset", + "title": "Test Dataset", + "description": "This is a test dataset", + } + response = self.client.post( + "/api/v0/datasets/", payload, format="json" + ) # fixed + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + self.assertIn("metadata", response.data) + self.assertIn("resources", response.data["metadata"]) + self.assertEqual(response.data["metadata"]["name"], "test_dataset") + + def test_list_datasets(self): + Dataset.objects.create(name="ds1", metadata=self.setUpDatasetMetadata("ds1")) + Dataset.objects.create(name="ds2", metadata=self.setUpDatasetMetadata("ds2")) + response = self.client.get("/api/v0/datasets/") # fixed + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.data), 2) + + def test_assign_tables_to_dataset(self): + schema = Schema.objects.create(name="test_schema") + Table.objects.create( + name="t1", schema=schema, oemetadata=self.setUpResourceMetadata("t1") + ) + Table.objects.create( + name="t2", schema=schema, oemetadata=self.setUpResourceMetadata("t2") + ) + dataset = Dataset.objects.create( + name="test_dataset", metadata={"name": "test_dataset"} + ) + + payload = { + "dataset_name": "test_dataset", + "tables": [ + {"schema": "test_schema", "name": "t1"}, + {"schema": "test_schema", "name": "t2"}, + ], + } + + response = self.client.post( + "/api/v0/datasets/test_dataset/assign-tables/", payload, format="json" + ) + self.assertEqual(response.status_code, 200) + dataset.refresh_from_db() + self.assertEqual(len(dataset.tables.all()), 2) + self.assertEqual(len(dataset.metadata["resources"]), 2) + + def test_list_resources_for_dataset(self): + schema = 
Schema.objects.create(name="test_schema") + table = Table.objects.create( + name="t1", schema=schema, oemetadata=self.setUpResourceMetadata("t1") + ) + dataset = Dataset.objects.create( + name="test_dataset", metadata=self.setUpDatasetMetadata("test_dataset") + ) + dataset.tables.add(table) + dataset.update_resources_from_tables() + + response = self.client.get( + f"/api/v0/datasets/{dataset.name}/resources/" + ) # fixed + self.assertEqual(response.status_code, 200) + self.assertEqual(len(response.data), 1) + self.assertEqual(response.data[0]["name"], "t1") + + def test_assign_missing_table(self): + Dataset.objects.create( + name="ds_missing", metadata=self.setUpDatasetMetadata("ds_missing") + ) + + payload = { + "dataset_name": "ds_missing", + "tables": [{"schema": "nonexistent", "name": "missing"}], + } + + response = self.client.post( + "/api/v0/datasets/ds_missing/assign-tables/", payload, format="json" + ) + self.assertEqual(response.status_code, 200) + self.assertIn("missing", response.data) + self.assertEqual(len(response.data["missing"]), 1) + + def test_list_resources_dataset_not_found(self): + response = self.client.get("/api/v0/datasets/nonexistent/resources/") # fixed + self.assertEqual(response.status_code, 404) + + +class DatasetManagerAPITests(APITestCase): + def setUp(self): + self.dataset = Dataset.objects.create( + name="test_dataset", + metadata={ + "name": "test_dataset", + "title": "Test Title", + "description": "Test Description", + "resources": [], + }, + ) + self.detail_url = f"/api/v0/datasets/{self.dataset.name}/" + + def test_get_dataset(self): + response = self.client.get(self.detail_url) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["name"], "test_dataset") + + def test_update_dataset(self): + updated_data = { + "name": "test_dataset", # must match existing name + "title": "Updated Title", + "description": "Updated Description", + "at_id": "https://example.org/dataset/test_dataset", + } + 
+ response = self.client.put(self.detail_url, updated_data, format="json") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.dataset.refresh_from_db() + self.assertEqual(self.dataset.metadata["title"], "Updated Title") + self.assertEqual(self.dataset.metadata["description"], "Updated Description") + self.assertEqual( + self.dataset.metadata["@id"], "https://example.org/dataset/test_dataset" + ) + + def test_delete_dataset(self): + response = self.client.delete(self.detail_url) + self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) + self.assertFalse(Dataset.objects.filter(name="test_dataset").exists()) + + def test_get_nonexistent_dataset(self): + response = self.client.get("/api/v0/datasets/nonexistent_dataset/") + self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) + + def test_put_nonexistent_dataset(self): + payload = { + "name": "nonexistent_dataset", + "title": "Does Not Exist", + "description": "Should return 404", + } + response = self.client.put( + "/api/v0/datasets/nonexistent_dataset/", payload, format="json" + ) + self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) + + def test_delete_nonexistent_dataset(self): + response = self.client.delete("/api/v0/datasets/nonexistent_dataset/") + self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) diff --git a/api/urls.py b/api/urls.py index bb0982cc4..5f39ede0d 100644 --- a/api/urls.py +++ b/api/urls.py @@ -229,6 +229,26 @@ views.ManageOekgScenarioDatasets.as_view(), name="add-scenario-datasets", ), + path( + "v0/datasets/", + views.DatasetsListCreate.as_view(), + name="dataset-list-create", + ), + path( + "v0/datasets/<str:dataset_name>/assign-tables/", + views.AssignDatasetTables.as_view(), + name="dataset-assign-tables", + ), + path( + "v0/datasets/<str:dataset_name>/", + views.DatasetManager.as_view(), + name="dataset", + ), + path( + "v0/datasets/<str:dataset_name>/resources/", + views.DatasetsListResources.as_view(), + name="dataset-resources", + ), ] diff --git a/api/views.py
b/api/views.py index cbe6f8aff..56e284878 100644 --- a/api/views.py +++ b/api/views.py @@ -23,7 +23,7 @@ import json import logging import re -from datetime import datetime, timedelta # noqa +from copy import deepcopy from decimal import Decimal import geoalchemy2 # noqa: Although this import seems unused is has to be here @@ -34,9 +34,7 @@ from django.contrib.auth.mixins import LoginRequiredMixin from django.contrib.postgres.search import TrigramSimilarity from django.core.exceptions import ObjectDoesNotExist -from django.db import DatabaseError, transaction from django.db.models import Q -from django.db.utils import IntegrityError from django.http import ( Http404, HttpResponse, @@ -45,9 +43,11 @@ JsonResponse, StreamingHttpResponse, ) +from django.shortcuts import get_object_or_404 from django.utils import timezone from django.utils.decorators import method_decorator from django.views.decorators.cache import never_cache +from oemetadata.latest.example import OEMETADATA_LATEST_EXAMPLE from oemetadata.latest.template import OEMETADATA_LATEST_TEMPLATE from rest_framework import generics, status from rest_framework.authentication import TokenAuthentication @@ -65,11 +65,16 @@ from api.error import APIError from api.helpers.http import ModHttpResponse from api.serializers import ( + DatasetAssignTablesSerializer, + DatasetCreateSerializer, + DatasetReadSerializer, + DatasetResourceSerializer, EnergyframeworkSerializer, EnergymodelSerializer, ScenarioBundleScenarioDatasetSerializer, ScenarioDataTablesSerializer, ) +from api.services.dataset_creation import assemble_dataset_metadata from api.services.embargo import ( EmbargoValidationError, apply_embargo, @@ -80,8 +85,7 @@ from api.utils import get_dataset_configs from api.validators.column import validate_column_names from api.validators.identifier import assert_valid_identifier_name -from dataedit.models import Embargo -from dataedit.models import Schema as DBSchema +from dataedit.models import Dataset, Embargo from 
dataedit.models import Table as DBTable from dataedit.models import Topic from dataedit.views import get_tag_keywords_synchronized_metadata, schema_whitelist @@ -320,6 +324,14 @@ def __create_sequence(self, request, schema, sequence, jsn): class Metadata(APIView): + """ + Important note: + oemetadata v2 introduces datasets which are not relevant on a table level + always query for metadata["resources"][0]. Keeping the complete oemetadata v2 JSON + makes it easy to integrate as no further changes to validation are required for now. + Datasets are handled in the model.Datasets & api views. + """ + @api_exception @method_decorator(never_cache) def get(self, request, schema, table): @@ -343,7 +355,8 @@ def post(self, request, schema, table): cursor = actions.load_cursor_from_context(request.data) # update/sync keywords with tags before saving metadata - # TODO make this iter over all resources + # oemetadata v2 introduces datasets which are not relevant on a table level + # always query for metadata["resources"][0] keywords = metadata["resources"][0].get("keywords", []) or [] # get_tag_keywords_synchronized_metadata returns the OLD metadata @@ -353,18 +366,19 @@ def post(self, request, schema, table): _metadata = get_tag_keywords_synchronized_metadata( table=table, schema=schema, keywords_new=keywords ) - # TODO make this iter over all resources + # oemetadata v2 introduces datasets which are not relevant on a table level + # always query for metadata["resources"][0] metadata["resources"][0]["keywords"] = _metadata["resources"][0]["keywords"] # Write oemetadata json to dataedit.models.tables - # and to SQL comment on table actions.set_table_metadata( table=table, schema=schema, metadata=metadata, cursor=cursor ) _metadata = get_tag_keywords_synchronized_metadata( table=table, schema=schema, keywords_new=keywords ) - # TODO make this iter over all resources + # oemetadata v2 introduces datasets which are not relevant on a table level + # always query for 
metadata["resources"][0] metadata["resources"][0]["keywords"] = _metadata["resources"][0]["keywords"] # make sure extra metadata is removed @@ -379,6 +393,99 @@ def post(self, request, schema, table): raise APIError(error) +class DatasetsListCreate(generics.ListCreateAPIView): + queryset = Dataset.objects.all() + + def get_serializer_class(self): + if self.request.method == "POST": + return DatasetCreateSerializer + return DatasetReadSerializer + + def create(self, request, *args, **kwargs): + serializer = self.get_serializer(data=request.data) + serializer.is_valid(raise_exception=True) + + metadata = assemble_dataset_metadata(serializer.validated_data) + dataset = Dataset.objects.create(metadata=metadata, name=metadata["name"]) + + return Response( + {"id": dataset.pk, "metadata": dataset.metadata}, + status=status.HTTP_201_CREATED, + ) + + +class DatasetsListResources(generics.ListAPIView): + serializer_class = DatasetResourceSerializer + + def get_queryset(self): + dataset_name = self.kwargs["dataset_name"] + dataset = get_object_or_404(Dataset, name=dataset_name) + return dataset.tables.all() + + +class DatasetManager(APIView): + """ + View to retrieve, update, or delete a single dataset's metadata. 
+ URL: /v0/datasets/<dataset_name>/ + """ + + def get(self, request, dataset_name): + dataset = get_object_or_404(Dataset, name=dataset_name) + serializer = DatasetReadSerializer(dataset) + return Response(serializer.data, status=status.HTTP_200_OK) + + def put(self, request, dataset_name): + dataset = get_object_or_404(Dataset, name=dataset_name) + serializer = DatasetCreateSerializer(data=request.data) + serializer.is_valid(raise_exception=True) + + dataset.metadata = assemble_dataset_metadata(serializer.validated_data) + dataset.save() + return Response({"message": "Dataset updated"}, status=status.HTTP_200_OK) + + def delete(self, request, dataset_name): + dataset = get_object_or_404(Dataset, name=dataset_name) + dataset.delete() + return Response( + {"message": "Dataset deleted"}, status=status.HTTP_204_NO_CONTENT + ) + + +class AssignDatasetTables(APIView): + def post(self, request, dataset_name): + serializer = DatasetAssignTablesSerializer(data=request.data) + serializer.is_valid(raise_exception=True) + + table_refs = serializer.validated_data["tables"] + + try: + dataset = Dataset.objects.get(name=dataset_name) + except Dataset.DoesNotExist: + return Response({"error": "Dataset not found"}, status=404) + + missing = [] + added_tables = [] + + for table_ref in table_refs: + try: + table = DBTable.load(table_ref["schema"], table_ref["name"]) + dataset.tables.add(table) + added_tables.append(table.name) + except DBTable.DoesNotExist: + missing.append(table_ref) + + dataset.update_resources_from_tables() + + return Response( + { + "message": f"Added {len(added_tables)} tables.", + "added": added_tables, + "missing": missing, + }, + status=200, + ) + + class Table(APIView): """ Handles the creation of tables and serves information on existing tables @@ -550,6 +657,39 @@ def put(self, request, schema, table): metadata=metadata, cursor=cursor, ) + else: + # If no metadata is provided, we create a minimal metadata object + metadata = deepcopy(OEMETADATA_LATEST_TEMPLATE) + 
metadata["@context"] = OEMETADATA_LATEST_EXAMPLE["@context"] + metadata["metaMetadata"] = OEMETADATA_LATEST_EXAMPLE["metaMetadata"] + + # Set basic resource info + resource = { + "name": table, + "topics": [schema], + } + + # Update the first resource - there will only be one resource. + # The dataset section is managed by the database implementation ... + metadata["resources"][0].update(resource) + + # Build schema fields from columns + fields = [] + for col in columns: + field = { + "name": col["name"], + "type": col["data_type"], + "nullable": col.get("is_nullable", True), + # add more field metadata as needed + } + fields.append(field) + + # Replace the fields list entirely + metadata["resources"][0]["schema"]["fields"] = fields + + actions.set_table_metadata( + table=table, schema=schema, metadata=metadata, cursor=None + ) return JsonResponse({}, status=status.HTTP_201_CREATED) diff --git a/dataedit/migrations/0037_dataset.py b/dataedit/migrations/0037_dataset.py new file mode 100644 index 000000000..be2b20cf6 --- /dev/null +++ b/dataedit/migrations/0037_dataset.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: 2025 Jonas Huber © Reiner Lemoine Institut # noqa: E501 +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +# Generated by Django 5.1.4 on 2025-07-31 12:27 + +import uuid + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("dataedit", "0036_alter_peerreview_oemetadata"), + ] + + operations = [ + migrations.CreateModel( + name="Dataset", + fields=[ + ( + "uuid", + models.UUIDField( + default=uuid.uuid4, + editable=False, + primary_key=True, + serialize=False, + ), + ), + ("name", models.CharField(max_length=255, unique=True)), + ("metadata", models.JSONField(default=dict)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ( + "tables", + models.ManyToManyField( + blank=True, related_name="datasets", to="dataedit.table" + ), + ), + ], + ), + ] diff --git a/dataedit/models.py 
b/dataedit/models.py index a1b20e763..a334407c4 100644 --- a/dataedit/models.py +++ b/dataedit/models.py @@ -1,22 +1,23 @@ -# SPDX-FileCopyrightText: 2025 Pierre Francois © Reiner Lemoine Institut -# SPDX-FileCopyrightText: 2025 Pierre Francois © Reiner Lemoine Institut -# SPDX-FileCopyrightText: 2025 Christian Winger © Öko-Institut e.V. -# SPDX-FileCopyrightText: 2025 Daryna Barabanova © Reiner Lemoine Institut -# SPDX-FileCopyrightText: 2025 Jonas Huber © Reiner Lemoine Institut -# SPDX-FileCopyrightText: 2025 Jonas Huber © Reiner Lemoine Institut -# SPDX-FileCopyrightText: 2025 Martin Glauer © Otto-von-Guericke-Universität Magdeburg -# SPDX-FileCopyrightText: 2025 Martin Glauer © Otto-von-Guericke-Universität Magdeburg +# SPDX-FileCopyrightText: 2025 Pierre Francois © Reiner Lemoine Institut # noqa: E501 +# SPDX-FileCopyrightText: 2025 Pierre Francois © Reiner Lemoine Institut # noqa: E501 +# SPDX-FileCopyrightText: 2025 Christian Winger © Öko-Institut e.V. # noqa: E501 +# SPDX-FileCopyrightText: 2025 Daryna Barabanova © Reiner Lemoine Institut # noqa: E501 +# SPDX-FileCopyrightText: 2025 Jonas Huber © Reiner Lemoine Institut # noqa: E501 +# SPDX-FileCopyrightText: 2025 Jonas Huber © Reiner Lemoine Institut # noqa: E501 +# SPDX-FileCopyrightText: 2025 Martin Glauer © Otto-von-Guericke-Universität Magdeburg # noqa: E501 +# SPDX-FileCopyrightText: 2025 Martin Glauer © Otto-von-Guericke-Universität Magdeburg # noqa: E501 # SPDX-FileCopyrightText: 2025 Tom Heimbrodt -# SPDX-FileCopyrightText: 2025 Christian Winger © Öko-Institut e.V. -# SPDX-FileCopyrightText: 2025 Daryna Barabanova © Reiner Lemoine Institut -# SPDX-FileCopyrightText: 2025 Jonas Huber © Reiner Lemoine Institut -# SPDX-FileCopyrightText: 2025 Stephan Uller © Reiner Lemoine Institut -# SPDX-FileCopyrightText: 2025 user © Reiner Lemoine Institut +# SPDX-FileCopyrightText: 2025 Christian Winger © Öko-Institut e.V. 
# noqa: E501 +# SPDX-FileCopyrightText: 2025 Daryna Barabanova © Reiner Lemoine Institut # noqa: E501 +# SPDX-FileCopyrightText: 2025 Jonas Huber © Reiner Lemoine Institut # noqa: E501 +# SPDX-FileCopyrightText: 2025 Stephan Uller © Reiner Lemoine Institut # noqa: E501 +# SPDX-FileCopyrightText: 2025 user © Reiner Lemoine Institut # noqa: E501 # # SPDX-License-Identifier: AGPL-3.0-or-later import json import logging +import uuid from datetime import datetime, timedelta from enum import Enum @@ -178,6 +179,28 @@ class Meta: unique_together = (("name",),) +class Dataset(models.Model): + """Represents a dataset in the database. + + Datasets are implemented according to oemetadata specification. + """ + + uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) + name = models.CharField(max_length=255, unique=True) + metadata = models.JSONField(null=False, default=dict) + tables = models.ManyToManyField("Table", related_name="datasets", blank=True) + created_at = models.DateTimeField(auto_now_add=True) + + def update_resources_from_tables(self): + """ + Rebuild the `resources` field in OEMetadata based on linked tables. 
+ """ + self.metadata["resources"] = [ + table.oemetadata["resources"][0] for table in self.tables.all() + ] + self.save() + + class Embargo(models.Model): DURATION_CHOICES = [ ("6_months", "6 Months"), diff --git a/dataedit/views.py b/dataedit/views.py index 41f8b4940..a21c197c9 100644 --- a/dataedit/views.py +++ b/dataedit/views.py @@ -997,6 +997,8 @@ def iter_oem_key_order(metadata: dict): meta_widget = MetaDataWidget(ordered_oem_151) revisions = [] + # TODO Legacy code, not fully reviewed yet, currently not used + # seems to continue to track changes in the brackround api_changes = change_requests(schema, table) data = api_changes.get("data") display_message = api_changes.get("display_message") diff --git a/versions/changelogs/current.md b/versions/changelogs/current.md index 8ccab9eb1..038cff3c7 100644 --- a/versions/changelogs/current.md +++ b/versions/changelogs/current.md @@ -40,6 +40,13 @@ SPDX-License-Identifier: CC0-1.0 ## Features +- Add Dataset rest-api and metadata based concept as specified in oemetadata / frictionless ([#2071](https://github.com/OpenEnergyPlatform/oeplatform/pull/2071)) + + - Resource metadata is stored for each created table. + - Dataset objects can be listed, created, edited and existing tables can be assigned as resource + - Datasets and assigned Resources are stored in the django database using a m:n relation with tables to read the oemetadata. + - Rest api implementation + ## Bugs ## Removed