From e79099b19d148e8349f447a5fd318f4b6460a156 Mon Sep 17 00:00:00 2001 From: Hans-Christian van der Werf Date: Thu, 5 Jun 2025 21:13:59 +0200 Subject: [PATCH 01/36] feat(rdf): support serialization of DatasetSeries from RDF and link correctly via In_Series --- ckanext/dcat/harvesters/rdf.py | 112 +++++++++++++------ ckanext/dcat/processors.py | 49 +++++++- ckanext/dcat/profiles/euro_dcat_ap_3.py | 11 ++ ckanext/dcat/profiles/euro_health_dcat_ap.py | 18 +-- 4 files changed, 148 insertions(+), 42 deletions(-) diff --git a/ckanext/dcat/harvesters/rdf.py b/ckanext/dcat/harvesters/rdf.py index a22e0b97..00a7f91a 100644 --- a/ckanext/dcat/harvesters/rdf.py +++ b/ckanext/dcat/harvesters/rdf.py @@ -210,39 +210,18 @@ def gather_stage(self, harvest_job): return [] try: - - source_dataset = model.Package.get(harvest_job.source.id) - - for dataset in parser.datasets(): - if not dataset.get('name'): - dataset['name'] = self._gen_new_name(dataset['title']) - if dataset['name'] in self._names_taken: - suffix = len([i for i in self._names_taken if i.startswith(dataset['name'] + '-')]) + 1 - dataset['name'] = '{}-{}'.format(dataset['name'], suffix) - self._names_taken.append(dataset['name']) - - # Unless already set by the parser, get the owner organization (if any) - # from the harvest source dataset - if not dataset.get('owner_org'): - if source_dataset.owner_org: - dataset['owner_org'] = source_dataset.owner_org - - # Try to get a unique identifier for the harvested dataset - guid = self._get_guid(dataset, source_url=source_dataset.url) - - if not guid: - self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset), - harvest_job) - continue - - dataset['extras'].append({'key': 'guid', 'value': guid}) - guids_in_source.append(guid) - - obj = HarvestObject(guid=guid, job=harvest_job, - content=json.dumps(dataset)) - - obj.save() - object_ids.append(obj.id) + source_dataset = model.Package.get(harvest_job.source.id) + + series_ids, series_mapping = self._parse_and_collect( + parser.dataset_series(), + source_dataset, + harvest_job, + guids_in_source, + is_series=True, + collect_series_mapping=True + ) + object_ids += series_ids + object_ids += self._parse_and_collect(parser.datasets(series_mapping), source_dataset, harvest_job, guids_in_source, is_series=False) except Exception as e: self._save_gather_error('Error when processsing dataset: %r / %s' % (e, traceback.format_exc()), harvest_job) @@ -422,3 +401,70 @@ def import_stage(self, harvest_object): model.Session.commit() return True + + def _parse_and_collect( + self, + items, + source_dataset, + harvest_job, + guids_in_source, + is_series=False, + collect_series_mapping=False + ): + object_ids = [] + label = "dataset series" if is_series else "dataset" + series_mapping = {} if collect_series_mapping else None + + for item in items: + original_title = item.get("title", label) + if not item.get("name"): + item["name"] = self._gen_new_name(original_title) + + if item["name"] in self._names_taken: + suffix = len([i for i in self._names_taken if i.startswith(item["name"] + "-")]) + 1 + item["name"] = f"{item['name']}-{suffix}" + + self._names_taken.append(item["name"]) + + if not item.get("owner_org") and source_dataset.owner_org: + item["owner_org"] = source_dataset.owner_org + + guid = self._get_guid(item, source_url=source_dataset.url) + if not guid: + self._save_gather_error(f"Could not get a unique identifier for {label}: {item}", harvest_job) + continue + + item.setdefault("extras", []).append({"key": "guid", "value": guid}) + guids_in_source.append(guid) + + obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(item)) + obj.save() + object_ids.append(obj.id) + + # Store mapping of RDF URI to dataset name if requested + if collect_series_mapping: + series_uri = item.get("uri") or item.get("identifier") + if series_uri: + # Try to find an existing active dataset series by 'guid' match + existing = model.Session.query(model.Package).\ + join(model.PackageExtra).\ + filter(model.PackageExtra.key == 'guid').\ + filter(model.PackageExtra.value == series_uri).\ + filter(model.Package.type == 'dataset_series').\ + filter(model.Package.state == 'active').\ + first() + + if existing: + item["name"] = existing.name + + series_mapping[str(series_uri)] = { + "id": existing.id if existing else item.get("id"), + "name": item["name"] + } + + + if collect_series_mapping: + return object_ids, series_mapping + + return object_ids + diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index 79f35821..d97d7790 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -119,6 +119,16 @@ def _datasets(self): for dataset in self.g.subjects(RDF.type, DCAT.Dataset): yield dataset + def _dataset_series(self): + ''' + Generator that returns all DCAT dataset series on the graph + + Yields rdflib.term.URIRef objects that can be used on graph lookups + and queries + ''' + for dataset_series in self.g.subjects(RDF.type, DCAT.DatasetSeries): + yield dataset_series + def next_page(self): ''' Returns the URL of the next page or None if there is no next page @@ -173,7 +183,7 @@ def supported_formats(self): for plugin in rdflib.plugin.plugins(kind=rdflib.parser.Parser)]) - def datasets(self): + def datasets(self, series_mapping=None): ''' Generator that returns CKAN datasets parsed from the RDF graph @@ -193,6 +203,43 @@ def datasets(self): ) profile.parse_dataset(dataset_dict, dataset_ref) + # Add in_series if present in RDF and mapped + in_series = [] + for series_ref in self.g.objects(dataset_ref, DCAT.inSeries): + key = str(series_ref) + if series_mapping and key in series_mapping: + in_series.append(series_mapping[key]["id"]) + else: + # fallback to URI + in_series.append(key) + + if in_series: + dataset_dict["in_series"] = in_series + + yield dataset_dict + + + + def dataset_series(self): + ''' + Generator that returns CKAN dataset series parsed from the RDF graph + + Each dataset series is passed to all the loaded profiles before being + yielded, so it can be further modified by each one of them. + + Returns a dataset series dict that can be passed to eg `package_create` + or `package_update` + ''' + for dataset_ref in self._dataset_series(): + dataset_dict = {} + for profile_class in self._profiles: + profile = profile_class( + self.g, + dataset_type=self.dataset_type, + compatibility_mode=self.compatibility_mode + ) + profile.parse_dataset(dataset_dict, dataset_ref) + yield dataset_dict diff --git a/ckanext/dcat/profiles/euro_dcat_ap_3.py b/ckanext/dcat/profiles/euro_dcat_ap_3.py index 92206558..24d90c84 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_3.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_3.py @@ -29,6 +29,17 @@ def parse_dataset(self, dataset_dict, dataset_ref): # DCAT AP v2 scheming fields dataset_dict = self._parse_dataset_v2_scheming(dataset_dict, dataset_ref) + + # Check if it's a dataset series + if (dataset_ref, RDF.type, DCAT.DatasetSeries) in self.g: + dataset_dict["type"] = "dataset_series" + + # Example defaulting logic (adjust based on RDF vocab if you have it) + if "series_order_field" not in dataset_dict: + dataset_dict["series_order_field"] = "metadata_created" + if "series_order_type" not in dataset_dict: + dataset_dict["series_order_type"] = "date" + return dataset_dict def graph_from_dataset(self, dataset_dict, dataset_ref): diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 3da8628f..44fca9ac 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -87,6 +87,16 @@ def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): if values: dataset_dict[key] = values + def __parse_healthdcat_booleanvalues(self, dataset_dict, dataset_ref): + for key, predicate in ( + ("trusted_data_holder", HEALTHDCATAP.trustedDataHolder), + ): + value = self._object_value(dataset_ref, predicate) + if value is not None: + lowered = value.lower() + if lowered in ("true", "false"): + dataset_dict[key] = lowered == "true" + def graph_from_dataset(self, dataset_dict, dataset_ref): super().graph_from_dataset(dataset_dict, dataset_ref) for prefix, namespace in namespaces.items(): @@ -156,14 +166,6 @@ def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate): ) except (ValueError, TypeError): self.g.add((dataset_ref, predicate, Literal(value))) - - def __parse_healthdcat_booleanvalues(self, dataset_dict, dataset_ref): - for key, predicate in ( - ("trusted_data_holder", HEALTHDCATAP.trustedDataHolder), - ): - value = self._object_value(dataset_ref, predicate) - if value is not None: - dataset_dict[key] = value.lower() == "true" def graph_from_catalog(self, catalog_dict, catalog_ref): super().graph_from_catalog(catalog_dict, catalog_ref) From 22face72a20b1583f761d946c0282d2332b1d7e8 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Thu, 19 Jun 2025 13:49:03 +0200 Subject: [PATCH 02/36] remove fallback. not support by dataseries extension --- ckanext/dcat/processors.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index d97d7790..bb0d0c6c 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -209,9 +209,6 @@ def datasets(self, series_mapping=None): key = str(series_ref) if series_mapping and key in series_mapping: in_series.append(series_mapping[key]["id"]) - else: - # fallback to URI - in_series.append(key) if in_series: dataset_dict["in_series"] = in_series From 84ef9296a0a60dd286518539930b3ff73ddff924 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Thu, 19 Jun 2025 14:34:19 +0200 Subject: [PATCH 03/36] Remove enter --- ckanext/dcat/processors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index bb0d0c6c..d255d582 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -215,8 +215,7 @@ def datasets(self, series_mapping=None): yield dataset_dict - - + def dataset_series(self): ''' Generator that returns CKAN dataset series parsed from the RDF graph From 1f5110b5c21bd3f6aa88f72959b0c389a97b640f Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Tue, 1 Jul 2025 14:34:11 +0200 Subject: [PATCH 04/36] fix: skip fluent for 2.9 --- .../tests/profiles/dcat_ap_2/test_multilingual_support.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py index 9c4629ea..2ab47d51 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py @@ -20,7 +20,10 @@ ) from ckanext.dcat.tests.utils import BaseSerializeTest, BaseParseTest - +@pytest.mark.skipif( + ckan.__version__.startswith("2.9"), + reason="Fluent plugin is not compatible with CKAN 2.9" +) @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets fluent") @pytest.mark.ckan_config( From 2d70ee875776cbedb91764ec5e49a1e1a2ccd8fb Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Tue, 1 Jul 2025 14:36:27 +0200 Subject: [PATCH 05/36] fix import --- .../dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py index 2ab47d51..a58b8f21 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py @@ -1,5 +1,5 @@ import json - +import ckan import pytest from ckan.tests.helpers import call_action From b061455d1f2ef2299e5a94a94bd5ec5f81311bfb Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Tue, 1 Jul 2025 14:44:43 +0200 Subject: [PATCH 06/36] check if this works --- .../tests/profiles/dcat_ap_2/test_multilingual_support.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py index a58b8f21..ff7c873c 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py @@ -1,5 +1,5 @@ import json -import ckan +import pkg_resources import pytest from ckan.tests.helpers import call_action @@ -21,7 +21,7 @@ from ckanext.dcat.tests.utils import BaseSerializeTest, BaseParseTest @pytest.mark.skipif( - ckan.__version__.startswith("2.9"), + pkg_resources.get_distribution("ckan").version.startswith("2.9"), reason="Fluent plugin is not compatible with CKAN 2.9" ) @pytest.mark.usefixtures("with_plugins", "clean_db") From c9e46266ad38e342f7924b90434c8dee1feb53ad Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Tue, 1 Jul 2025 14:54:29 +0200 Subject: [PATCH 07/36] Disable support for 2.9 in tests --- .github/workflows/test.yml | 7 ------- .../tests/profiles/dcat_ap_2/test_multilingual_support.py | 6 +----- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c960cf0f..7db99384 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,9 +26,6 @@ jobs: - ckan-version: "2.10" ckan-image: "ckan/ckan-dev:2.10-py3.10" solr-version: "9" - - ckan-version: "2.9" - ckan-image: "ckan/ckan-dev:2.9-py3.9" - solr-version: "8" fail-fast: false name: CKAN ${{ matrix.ckan-version }} @@ -64,10 +61,6 @@ jobs: pip install -e . # Replace default path to CKAN core config file with the one on the container sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini - - name: Install requirements (2.9) - run: | - pip install -U pytest-rerunfailures - if: ${{ matrix.ckan-version == '2.9' }} - name: Setup other extensions run: | git clone https://github.com/ckan/ckanext-harvest diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py index ff7c873c..6135e3d8 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py @@ -1,5 +1,5 @@ import json -import pkg_resources + import pytest from ckan.tests.helpers import call_action @@ -20,10 +20,6 @@ ) from ckanext.dcat.tests.utils import BaseSerializeTest, BaseParseTest -@pytest.mark.skipif( - pkg_resources.get_distribution("ckan").version.startswith("2.9"), - reason="Fluent plugin is not compatible with CKAN 2.9" -) @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets fluent") @pytest.mark.ckan_config( From 01991b4490b22f5ebd41a1a8b71039b8c5d29bc8 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Tue, 1 Jul 2025 15:21:29 +0200 Subject: [PATCH 08/36] check if this works --- .github/workflows/test.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7db99384..5d3184b4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,6 +26,9 @@ jobs: - ckan-version: "2.10" ckan-image: "ckan/ckan-dev:2.10-py3.10" solr-version: "9" + - ckan-version: "2.9" + ckan-image: "ckan/ckan-dev:2.9-py3.9" + solr-version: "8" fail-fast: false name: CKAN ${{ matrix.ckan-version }} @@ -61,6 +64,10 @@ jobs: pip install -e . # Replace default path to CKAN core config file with the one on the container sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini + - name: Install requirements (2.9) + run: | + pip install -U pytest-rerunfailures + if: ${{ matrix.ckan-version == '2.9' }} - name: Setup other extensions run: | git clone https://github.com/ckan/ckanext-harvest @@ -68,8 +75,7 @@ jobs: pip install -r ckanext-harvest/requirements.txt git clone https://github.com/ckan/ckanext-scheming pip install -e ckanext-scheming - git clone https://github.com/ckan/ckanext-fluent - pip install -e ckanext-fluent + pip install git+https://github.com/ckan/ckanext-fluent.git@4e9340a#egg=ckanext-fluent git clone https://github.com/ckan/ckanext-dataset-series pip install -e ckanext-dataset-series - name: Setup extension From 806e2489c749039e24093746b2a417562d185c2c Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 2 Jul 2025 14:23:04 +0200 Subject: [PATCH 09/36] feat(missing field) add missing fields - provenance_activity - qualified_attribution - quality_annotation --- ckanext/dcat/profiles/base.py | 53 ++++++++++- ckanext/dcat/profiles/euro_dcat_ap_2.py | 87 ++++++++++++++++- ckanext/dcat/profiles/euro_dcat_ap_base.py | 2 +- ckanext/dcat/profiles/euro_health_dcat_ap.py | 84 +++++++++++++++- ckanext/dcat/schemas/dcat_ap_full.yaml | 79 +++++++++++++++ ckanext/dcat/schemas/health_dcat_ap.yaml | 95 +++++++++++++++++++ .../test_euro_dcatap_profile_serialize.py | 2 +- .../test_euro_health_dcat_ap_profile_parse.py | 34 +++++++ ...t_euro_health_dcat_ap_profile_serialize.py | 75 ++++++++++++++- examples/ckan/health_dcat_ap.json | 40 ++++++++ examples/dcat/dataset_health.ttl | 2 +- 11 files changed, 541 insertions(+), 12 deletions(-) diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 29802793..299f74bc 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -7,7 +7,7 @@ from ckantoolkit import ObjectNotFound, asbool, aslist, config, get_action, url_for from dateutil.parser import parse as parse_date from geomet import InvalidGeoJSONException, wkt -from rdflib import BNode, Literal, URIRef, term +from rdflib import BNode, Literal, URIRef, term, PROV from rdflib.namespace import ORG, RDF, RDFS, SKOS, XSD, Namespace from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS @@ -95,7 +95,6 @@ def __new__(cls, value, lang=None): # In case something goes wrong: use Literal return Literal(value, lang=lang) - class CleanedURIRef(object): """Performs some basic URL encoding on value before creating an URIRef object. @@ -547,9 +546,13 @@ def _agents_details(self, subject, predicate): ) agent_details["url"] = self._object_value(agent, FOAF.homepage) agent_details["type"] = self._object_value(agent, DCT.type) - agent_details['identifier'] = self._object_value(agent, DCT.identifier) - agents.append(agent_details) + agent_details["identifier"] = self._object_value(agent, DCT.identifier) + + acted_orgs = self._agents_details(agent, PROV.actedOnBehalfOf) + if acted_orgs: + agent_details["actedOnBehalfOf"] = acted_orgs + agents.append(agent_details) return agents def _contact_details(self, subject, predicate): @@ -819,6 +822,48 @@ def _read_list_value(self, value): return items + def _add_agent_to_graph(self, subject_ref, predicate, agent_dict): + """ + Serializes a foaf:Agent or foaf:Organization with optional subfields into the RDF graph. + + Parameters: + - subject_ref: The RDF subject (dataset, activity, etc.) + - predicate: The RDF predicate (e.g., dct:publisher, prov:wasAssociatedWith, dcat:agent) + - agent_dict: A dict with agent metadata (e.g., name, email, homepage, type, identifier, actedOnBehalfOf) + """ + uri = agent_dict.get("uri", "").strip() + + agent_ref = URIRefOrLiteral(uri) if uri else BNode() + + self.g.add((subject_ref, predicate, agent_ref)) + self.g.add((agent_ref, RDF.type, FOAF.Organization)) + self.g.add((agent_ref, RDF.type, FOAF.Agent)) + + if agent_dict.get("name"): + self.g.add((agent_ref, FOAF.name, Literal(agent_dict["name"]))) + if agent_dict.get("email"): + email = agent_dict["email"] + if not email.startswith("mailto:"): + email = f"mailto:{email}" + self.g.add((agent_ref, FOAF.mbox, URIRef(email))) + if agent_dict.get("url"): + self.g.add((agent_ref, FOAF.homepage, URIRef(agent_dict["url"]))) + if agent_dict.get("homepage"): + self.g.add((agent_ref, FOAF.homepage, URIRef(agent_dict["homepage"]))) + if agent_dict.get("type"): + self.g.add((agent_ref, DCT.type, URIRef(agent_dict["type"]))) + if agent_dict.get("identifier"): + self.g.add((agent_ref, DCT.identifier, Literal(agent_dict["identifier"]))) + + for sub_org in agent_dict.get("actedOnBehalfOf", []): + if sub_org.get("name"): + org_ref = BNode() + self.g.add((agent_ref, PROV.actedOnBehalfOf, org_ref)) + self.g.add((org_ref, RDF.type, PROV.Organization)) + self.g.add((org_ref, FOAF.name, Literal(sub_org["name"]))) + + return agent_ref + def _add_spatial_value_to_graph(self, spatial_ref, predicate, value): """ Adds spatial triples to the graph. Assumes that value is a GeoJSON string diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index e5204be1..fe7e0570 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -1,7 +1,7 @@ import json from decimal import Decimal, DecimalException -from rdflib import URIRef, BNode, Literal, Namespace +from rdflib import URIRef, BNode, Literal, Namespace, FOAF, PROV, RDF, RDFS from ckanext.dcat.utils import resource_uri from .base import URIRefOrLiteral, CleanedURIRef @@ -18,7 +18,6 @@ from .euro_dcat_ap_base import BaseEuropeanDCATAPProfile - ELI = Namespace("http://data.europa.eu/eli/ontology#") @@ -65,6 +64,32 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): # Call base super method for common properties super().parse_dataset(dataset_dict, dataset_ref) + # --- Provenance deserialization --- + was_generated_by = self.g.value(dataset_ref, PROV.wasGeneratedBy) + if was_generated_by: + activity_dict = {} + activity_dict["uri"] = str(was_generated_by) + activity_dict["type"] = [ + str(t) for t in self.g.objects(was_generated_by, RDF.type) + ] + activity_dict["label"] = self._object_value(was_generated_by, RDFS.label) + activity_dict["seeAlso"] = self._object_value(was_generated_by, RDFS.seeAlso) + activity_dict["dct_type"] = self._object_value(was_generated_by, DCT.type) + activity_dict["startedAtTime"] = self._object_value( + was_generated_by, PROV.startedAtTime + ) + + agents = self._agents_details(was_generated_by, PROV.wasAssociatedWith) + if agents: + activity_dict["wasAssociatedWith"] = [agents[0]] # Only take the first agent + + dataset_dict["provenance_activity"] = [activity_dict] + + # --- Qualified Attribution --- + qualified_attributions = self._parse_qualified_attributions(dataset_ref) + if qualified_attributions: + dataset_dict["qualified_attribution"] = qualified_attributions + # Standard values value = self._object_value(dataset_ref, DCAT.temporalResolution) if value: @@ -246,6 +271,44 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): _datatype=datatype, _class=_class, ) + + # --- Provenance serialization --- + activities = dataset_dict.get("provenance_activity", []) + + for activity in activities: + activity_uri = URIRef(activity.get("uri")) if activity.get("uri") else BNode() + self.g.add((dataset_ref, PROV.wasGeneratedBy, activity_uri)) + self.g.add((activity_uri, RDF.type, PROV.Activity)) + + if activity.get("label"): + self.g.add((activity_uri, RDFS.label, Literal(activity["label"]))) + if activity.get("seeAlso"): + self.g.add((activity_uri, RDFS.seeAlso, URIRef(activity["seeAlso"]))) + if activity.get("dct_type"): + self.g.add((activity_uri, DCT.type, URIRef(activity["dct_type"]))) + if activity.get("startedAtTime"): + self.g.add((activity_uri, PROV.startedAtTime, Literal(activity["startedAtTime"], datatype=XSD.dateTime))) + + for agent_dict in activity.get("wasAssociatedWith", []): + self._add_agent_to_graph(activity_uri, PROV.wasAssociatedWith, agent_dict) + + # Qualified Attribution + qualified_attributions = dataset_dict.get("qualified_attribution", []) + for attr in qualified_attributions: + attr_ref = BNode() + self.g.add((dataset_ref, DCAT.qualifiedAttribution, attr_ref)) + self.g.add((attr_ref, RDF.type, DCAT.Attribution)) + + agent_list = attr.get("agent", []) + for agent_dict in agent_list: + if isinstance(agent_dict, dict): + self._add_agent_to_graph(attr_ref, DCAT.agent, agent_dict) + elif isinstance(agent_dict, str): + self.g.add((attr_ref, DCAT.agent, URIRef(agent_dict))) + role = attr.get("role") + if role: + self.g.add((attr_ref, DCAT.hadRole, URIRef(role))) + # Temporal @@ -448,3 +511,23 @@ def _graph_from_dataset_v2_only(self, dataset_dict, dataset_ref): _type=URIRefOrLiteral, _class=ADMS.Identifier, ) + + def _parse_qualified_attributions(self, dataset_ref): + attributions = [] + for qual_attr_ref in self.g.objects(dataset_ref, PROV.qualifiedAttribution): + attr = {} + + # Get role + for role_ref in self.g.objects(qual_attr_ref, DCAT.hadRole): + attr["role"] = str(role_ref) + break + + # Get agent (using shared logic) + agent_details = self._agents_details(qual_attr_ref, PROV.agent) + if agent_details: + attr["agent"] = agent_details + + if attr: + attributions.append(attr) + + return attributions diff --git a/ckanext/dcat/profiles/euro_dcat_ap_base.py b/ckanext/dcat/profiles/euro_dcat_ap_base.py index e42c6afb..f26c5ce5 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_base.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_base.py @@ -778,4 +778,4 @@ def _graph_from_catalog_base(self, catalog_dict, catalog_ref): # Dates modified = self._last_catalog_modification() if modified: - self._add_date_triple(catalog_ref, DCT.modified, modified) + self._add_date_triple(catalog_ref, DCT.modified, modified) \ No newline at end of file diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 44fca9ac..49927e9f 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -1,4 +1,4 @@ -from rdflib import XSD, Literal, URIRef +from rdflib import XSD, Literal, URIRef, RDF, BNode from rdflib.namespace import Namespace from ckanext.dcat.profiles.base import URIRefOrLiteral @@ -7,9 +7,18 @@ # HealthDCAT-AP namespace. Note: not finalized yet HEALTHDCATAP = Namespace("http://healthdataportal.eu/ns/health#") +# HealthDCAT-AP namespace. Note: not finalized yet +HEALTHDCATAP = Namespace("http://healthdataportal.eu/ns/health#") + # Data Privacy Vocabulary namespace DPV = Namespace("https://w3id.org/dpv#") +# Data Quality Vocabulary namespace +DQV = Namespace("http://www.w3.org/ns/dqv#") + +# Open Annotation namespace +OA = Namespace("http://www.w3.org/ns/oa#") + namespaces = { "healthdcatap": HEALTHDCATAP, "dpv": DPV, @@ -54,6 +63,10 @@ def _parse_health_fields(self, dataset_dict, dataset_ref): retention_dict["end"] = retention_end if retention_dict: dataset_dict["retention_period"] = [retention_dict] + + quality_annotations = self._parse_quality_annotation(dataset_ref) + if quality_annotations: + dataset_dict["quality_annotation"] = quality_annotations return dataset_dict @@ -96,6 +109,43 @@ def __parse_healthdcat_booleanvalues(self, dataset_dict, dataset_ref): lowered = value.lower() if lowered in ("true", "false"): dataset_dict[key] = lowered == "true" + + def _parse_quality_annotation(self, dataset_ref): + """ + Parse DQV quality annotations from the RDF graph. + + Returns a list of quality annotation dictionaries. + Only includes annotations where body and target are valid URIs. + """ + quality_annotation = [] + + # Find all quality annotations for this dataset + for annotation_ref in self.g.objects(dataset_ref, DQV.hasQualityAnnotation): + annotation_dict = {} + + # Get the body (must be a URI) + body = self._object_value(annotation_ref, OA.hasBody) + if body and isinstance(body, str) and body.startswith(("http://", "https://")): + annotation_dict["body"] = body + + # Get the target (must be a URI) + target = self._object_value(annotation_ref, OA.hasTarget) + if target and isinstance(target, str) and target.startswith(("http://", "https://")): + annotation_dict["target"] = target + + # Only include the annotation if both body and target are valid URIs + if "body" not in annotation_dict or "target" not in annotation_dict: + continue + + # Get the motivation (URI or literal) + motivation = self._object_value(annotation_ref, OA.motivatedBy) + if motivation: + annotation_dict["motivated_by"] = motivation + + quality_annotation.append(annotation_dict) + + return quality_annotation + def graph_from_dataset(self, dataset_dict, dataset_ref): super().graph_from_dataset(dataset_dict, dataset_ref) @@ -142,6 +192,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): self._add_nonneg_integer_triple(dataset_dict, dataset_ref, key, predicate) self._add_agents(dataset_ref, dataset_dict, "hdab", HEALTHDCATAP.hdab) + self._add_quality_annotation(dataset_dict, dataset_ref) def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate): """ @@ -167,5 +218,36 @@ def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate): except (ValueError, TypeError): self.g.add((dataset_ref, predicate, Literal(value))) + def _add_quality_annotation(self, dataset_dict, dataset_ref): + """ + Serialize qualified_annotation entries into RDF as DQV.QualityAnnotations. + Only URI-based body, target, and motivation values are supported. + """ + quality_annotation = self._get_dict_value(dataset_dict, "quality_annotation") + + if not quality_annotation: + return + + for annotation in quality_annotation: + if not isinstance(annotation, dict): + continue + + annotation_ref = BNode() + + # Link from dataset + self.g.add((dataset_ref, DQV.hasQualityAnnotation, annotation_ref)) + self.g.add((annotation_ref, RDF.type, OA.Annotation)) + + # URI-based fields only + for field, predicate in [ + ("body", OA.hasBody), + ("target", OA.hasTarget), + ("motivated_by", OA.motivatedBy), + ]: + uri = annotation.get(field) + if isinstance(uri, str) and uri.startswith(("http://", "https://")): + self.g.add((annotation_ref, predicate, URIRef(uri))) + + def graph_from_catalog(self, catalog_dict, catalog_ref): super().graph_from_catalog(catalog_dict, catalog_ref) diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml index e6fb4a37..c25c3929 100644 --- a/ckanext/dcat/schemas/dcat_ap_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_full.yaml @@ -285,6 +285,85 @@ dataset_fields: help_text: The function of an entity or agent with respect to another entity or resource. help_text: A description of a relationship with another resource. +- field_name: provenance_activity + label: Provenance Activity + repeating_label: Provenance Activity + repeating_once: true + repeating_subfields: + - field_name: uri + label: Activity URI + help_text: URI of the provenance activity (if available). + - field_name: label + label: Label + help_text: Human-readable label for the activity. + - field_name: seeAlso + label: See Also + help_text: Related link for the activity. + - field_name: dct_type + label: Type + help_text: Type of the activity (URI). + - field_name: startedAtTime + label: Started At Time + help_text: When the activity started (ISO 8601). + - field_name: wasAssociatedWith + label: Associated Agent + repeating_label: Agent + repeating_once: true + repeating_subfields: + - field_name: uri + label: URI + - field_name: name + label: Name + - field_name: email + label: Email + - field_name: homepage + label: Homepage + - field_name: type + label: Type + - field_name: identifier + label: Identifier + - field_name: url + label: URL + - field_name: actedOnBehalfOf + label: Acted On Behalf Of + repeating_label: Organization + repeating_once: true + repeating_subfields: + - field_name: name + label: Organization Name + help_text: Structured provenance activity information, including agent and organization. + +# Add qualified_attribution field here, just before the commented-out hvd_category field +- field_name: qualified_attribution + label: Qualified Attribution + repeating_label: Attribution + repeating_once: true + repeating_subfields: + - field_name: agent + label: Agent + repeating_label: Agent + repeating_once: true + repeating_subfields: + - field_name: uri + label: URI + - field_name: name + label: Name + - field_name: email + label: Email + - field_name: homepage + label: Homepage + - field_name: type + label: Type + - field_name: identifier + label: Identifier + - field_name: url + label: URL + - field_name: role + label: Role + help_text: Role of the agent (e.g., data processor, contributor). + help_text: Structured qualified attribution information including agent and role. + + #- field_name: hvd_category # label: HVD Category # preset: multiple_text diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index c8624396..6475404b 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -444,11 +444,106 @@ dataset_fields: help_text: The function of an entity or agent with respect to another entity or resource. help_text: A description of a relationship with another resource. +- field_name: provenance_activity + label: Provenance Activity + repeating_label: Provenance Activity + repeating_once: true + repeating_subfields: + - field_name: uri + label: Activity URI + help_text: URI of the provenance activity (if available). + - field_name: label + label: Label + help_text: Human-readable label for the activity. + - field_name: seeAlso + label: See Also + help_text: Related link for the activity. + - field_name: dct_type + label: Type + help_text: Type of the activity (URI). + - field_name: startedAtTime + label: Started At Time + help_text: When the activity started (ISO 8601). + - field_name: wasAssociatedWith + label: Associated Agent + repeating_label: Agent + repeating_once: true + repeating_subfields: + - field_name: uri + label: URI + - field_name: name + label: Name + - field_name: email + label: Email + - field_name: homepage + label: Homepage + - field_name: type + label: Type + - field_name: identifier + label: Identifier + - field_name: url + label: URL + - field_name: actedOnBehalfOf + label: Acted On Behalf Of + repeating_label: Organization + repeating_once: true + repeating_subfields: + - field_name: name + label: Organization Name + help_text: Structured provenance activity information, including agent and organization. + # Note: if not provided, this will be autogenerated - field_name: uri label: URI help_text: An URI for this dataset (if not provided it will be autogenerated). +- field_name: qualified_attribution + label: Qualified Attribution + repeating_label: Attribution + repeating_once: true + repeating_subfields: + - field_name: agent + label: Agent + repeating_label: Agent + repeating_once: true + repeating_subfields: + - field_name: uri + label: URI + - field_name: name + label: Name + - field_name: email + label: Email + - field_name: homepage + label: Homepage + - field_name: type + label: Type + - field_name: identifier + label: Identifier + - field_name: url + label: URL + - field_name: role + label: Role + help_text: Role of the agent (e.g., data processor, contributor). + help_text: Structured qualified attribution information including agent and role. + +- field_name: quality_annotation + label: Quality annotations + repeating_label: Quality annotation + repeating_subfields: + - field_name: body + label: Body + help_text: The content of the quality annotation (e.g., URL to certificate, measurement value, assessment result). + - field_name: target + label: Target + help_text: The specific aspect of the dataset being annotated (e.g., URI or description of what is being assessed). + - field_name: motivated_by + label: Motivated by + help_text: The motivation or reason for the quality annotation. + help_text: > + Quality annotations provide information about the quality of the dataset, including certifications, + measurements, and assessments. These annotations follow the Data Quality Vocabulary (DQV) + and Web Annotation standards. + # TODO: relation-based properties are not yet included (e.g. is_version_of, source, sample, etc) # resource_fields: diff --git a/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_serialize.py index 826aef47..88826149 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_serialize.py @@ -1514,4 +1514,4 @@ def test_dont_set_missing_license_for_resource_config_param_value_false(self): assert str(distribution) == utils.resource_uri(resource) # Verify that the license of the dataset is not in the distribution - assert not self._triple(g, distribution, DCT.license, URIRef(dataset['license_id'])) + assert not self._triple(g, distribution, DCT.license, URIRef(dataset['license_id'])) \ No newline at end of file diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 1d7b53c5..1caf10dc 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -182,3 +182,37 @@ def test_e2e_dcat_to_ckan(self): "end": "2034-12-31", } ] + + assert dataset["provenance_activity"] == [{ + "uri": "internalURI:wasGeneratedBy0", + "label": "http://dbpedia.org/resource/Record_linkage", + "seeAlso": "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp", + "dct_type": "http://dbpedia.org/resource/Record_linkage", + "startedAtTime": "2021-01-01T00:00:00+00:00", + "wasAssociatedWith": [{ + "name": "Dr. Joris van Loenhout", + "url": "https://www.sciensano.be/fr/people/joris-van-loenhout", + "email": "Joris.VanLoenhout@sciensano.be", + "type": "", + "uri": "", + "identifier": "", + "actedOnBehalfOf": [{ + "name": "Contact Point" + }] + }] + }] + + assert dataset["qualified_attribution"][0]["role"] == "https://inspire.ec.europa.eu/metadata-codelist/ResponsiblePartyRole/processor" + + agent = dataset["qualified_attribution"][0]["agent"][0] + assert agent["name"] == "Contact Point" + assert agent["email"] == "healthdata@sciensano.be" + assert agent["url"] == "https://healthdata.be" + assert agent["type"] == "" + assert agent["identifier"] == "" + + # DQV Quality Annotation + assert len(dataset["quality_annotation"]) == 1 + assert dataset["quality_annotation"][0]["body"] == "https://certificates.theodi.org/en/datasets/393/certificate" + assert dataset["quality_annotation"][0]["target"] == "https://certificates.theodi.org/en/datasets/393" + assert dataset["quality_annotation"][0]["motivated_by"] == "http://www.w3.org/ns/dqv#qualityAssessment" diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py index b6e5c5c6..3a7b794f 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -3,9 +3,9 @@ import pytest from ckan.tests.helpers import call_action from geomet import wkt -from rdflib import Graph -from rdflib.namespace import RDF +from rdflib import Graph, PROV, Literal from rdflib.term import URIRef +from rdflib.namespace import Namespace from ckanext.dcat import utils from ckanext.dcat.processors import RDFSerializer @@ -30,6 +30,8 @@ DCAT_AP_PROFILES = ["euro_dcat_ap_3"] +# Open Annotation namespace +OA = Namespace("http://www.w3.org/ns/oa#") @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @@ -103,3 +105,72 @@ def test_e2e_ckan_to_dcat(self): assert self._triple( g, relation[0][2], predicate, value ), f"relation Predicate {predicate} does not have value {value}" + + # Test provenance activity + provenance = [t for t in g.triples((dataset_ref, PROV.wasGeneratedBy, None))] + assert len(provenance) == 1 + activity_node = provenance[0][2] + activity_items = [ + (RDF.type, PROV.Activity), + (RDFS.label, Literal(dataset_dict["provenance_activity"][0]["label"])), + (RDFS.seeAlso, URIRef(dataset_dict["provenance_activity"][0]["seeAlso"])), + (DCT.type, URIRef(dataset_dict["provenance_activity"][0]["dct_type"])), + (PROV.startedAtTime, Literal(dataset_dict["provenance_activity"][0]["startedAtTime"], datatype=XSD.dateTime)), + ] + for predicate, value in activity_items: + assert self._triple(g, activity_node, predicate, value), f"Provenance {predicate} mismatch" + + agent_triple = list(g.objects(activity_node, PROV.wasAssociatedWith)) + assert len(agent_triple) == 1 + agent_node = agent_triple[0] + agent_items = [ + (RDF.type, PROV.Agent), + (FOAF.name, Literal(dataset_dict["provenance_activity"][0]["wasAssociatedWith"][0]["name"])), + (FOAF.homepage, URIRef(dataset_dict["provenance_activity"][0]["wasAssociatedWith"][0]["homepage"])), + (FOAF.mbox, URIRef(dataset_dict["provenance_activity"][0]["wasAssociatedWith"][0]["email"])), + ] + + acted_on = list(g.objects(agent_node, PROV.actedOnBehalfOf)) + assert len(acted_on) == 1 + org_node = acted_on[0] + assert self._triple(g, org_node, FOAF.name, Literal(dataset_dict["provenance_activity"][0]["wasAssociatedWith"][0]["actedOnBehalfOf"][0]["name"])) + + # Test qualified attribution + attributions = [t for t in g.triples((dataset_ref, DCAT.qualifiedAttribution, None))] + assert len(attributions) == 1 + attr_node = attributions[0][2] + assert self._triple(g, attr_node, RDF.type, DCAT.Attribution) + assert self._triple(g, attr_node, DCAT.hadRole, URIRef(dataset_dict["qualified_attribution"][0]["role"])) + + agent_node = list(g.objects(attr_node, DCAT.agent))[0] + agent_details = dataset_dict["qualified_attribution"][0]["agent"][0] + agent_items = [ + (RDF.type, FOAF.Organization), + (FOAF.name, Literal(agent_details["name"])), + (FOAF.mbox, URIRef("mailto:" + agent_details["email"])), + (FOAF.homepage, URIRef(agent_details["homepage"])), + ] + for predicate, value in agent_items: + assert self._triple(g, agent_node, predicate, value), f"QualifiedAttribution Agent {predicate} mismatch" + + # Test qualified annotation + annotations = [t for t in + g.triples((dataset_ref, URIRef("http://www.w3.org/ns/dqv#hasQualityAnnotation"), None))] + assert len(annotations) == 1, "Expected one dqv:hasQualityAnnotation triple" + + annotation_node = annotations[0][2] + assert self._triple(g, annotation_node, RDF.type, URIRef("http://www.w3.org/ns/oa#Annotation")) + + annotation_details = dataset_dict["quality_annotation"][0] + + # Assert URI-based fields + for field, predicate_uri in [ + ("motivated_by", OA.motivatedBy), + ("body", OA.hasBody), + ("target", OA.hasTarget), + ]: + value = annotation_details.get(field) + assert value is not None, f"Missing {field} in annotation" + assert self._triple(g, annotation_node, URIRef(predicate_uri), + URIRef(value)), f"QualityAnnotation {field} mismatch" + diff --git a/examples/ckan/health_dcat_ap.json b/examples/ckan/health_dcat_ap.json index 26450ee8..28980740 100644 --- a/examples/ckan/health_dcat_ap.json +++ b/examples/ckan/health_dcat_ap.json @@ -1,5 +1,24 @@ [ { + "qualified_attribution": [ + { + "agent": [{ + "name": "Contact Point", + "email": "healthdata@sciensano.be", + "homepage": "https://healthdata.be", + "type": "", + "identifier": "" + }], + "role": "https://inspire.ec.europa.eu/metadata-codelist/ResponsiblePartyRole/processor" + } + ], + "quality_annotation": [ + { + "motivated_by": "http://www.w3.org/ns/dqv#qualityAssessment", + "body": "https://acertificateserver.eu/mycertificate", + "target": "https://fair.healthdata.be/dataset/d43a158e-7d13-4660-bbc3-9d3f8d5501e5" + } + ], "access_rights": "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC", "analytics": [ "http://example.com/analytics" @@ -147,6 +166,27 @@ "url": "https://www.example.com/hdab" } ], + "provenance_activity": [ + { + "dct_type": "http://dbpedia.org/resource/Record_linkage", + "label": "http://dbpedia.org/resource/Record_linkage", + "seeAlso": "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp", + "startedAtTime": "2021-01-01T00:00:00+00:00", + "uri": "internalURI:wasGeneratedBy0", + "wasAssociatedWith": [ + { + "homepage": "https://www.sciensano.be/fr/people/joris-van-loenhout", + "email": "mailto:Joris.VanLoenhout@sciensano.be", + "name": "Dr. Joris van Loenhout", + "actedOnBehalfOf": [ + { + "name": "Contact Point" + } + ] + } + ] + } + ], "publisher": [ { "email": "info@example.com", diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index 982a728e..35fffe15 100644 --- a/examples/dcat/dataset_health.ttl +++ b/examples/dcat/dataset_health.ttl @@ -223,7 +223,7 @@ a prov:Attribution; dcat:hadRole ; prov:agent [ a foaf:Organization; - foaf:homepage ; + foaf:homepage ; foaf:mbox ; foaf:name "Contact Point" ] . From b1c819366a7d353058fa7661a6ff66de7b96ca25 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Mon, 7 Jul 2025 16:54:10 +0200 Subject: [PATCH 10/36] Add homepage --- ckanext/dcat/profiles/euro_dcat_ap_base.py | 7 +++++++ ckanext/dcat/schemas/dcat_ap_full.yaml | 5 +++++ ckanext/dcat/schemas/health_dcat_ap.yaml | 5 +++++ 3 files changed, 17 insertions(+) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_base.py b/ckanext/dcat/profiles/euro_dcat_ap_base.py index f26c5ce5..56c9540c 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_base.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_base.py @@ -206,6 +206,13 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): src_data = self._extract_catalog_dict(catalog_src) dataset_dict["extras"].extend(src_data) + + homepage = self._object_value(dataset_ref, FOAF.homepage) + if homepage: + dataset_dict["homepage"] = homepage + elif config.get("ckan.site_url"): + dataset_dict["homepage"] = config.get("ckan.site_url") + # Resources for distribution in self._distributions(dataset_ref): diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml index c25c3929..8cebe622 100644 --- a/ckanext/dcat/schemas/dcat_ap_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_full.yaml @@ -112,6 +112,11 @@ dataset_fields: label: Identifier help_text: Unique identifier for the creator, such as an ORCID or ROR ID. +- field_name: homepage + label: Homepage + display_snippet: link.html + help_text: A web page that acts as the homepage for the dataset. + - field_name: license_id label: License form_snippet: license.html diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index 6475404b..5f24d4cc 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -109,6 +109,11 @@ dataset_fields: label: Identifier help_text: Unique identifier for the creator, such as an ORCID or ROR ID. +- field_name: homepage + label: Homepage + display_snippet: link.html + help_text: A web page that acts as the homepage for the dataset. + - field_name: license_id label: License form_snippet: license.html From a6e1e4b9990f9145d6330ebfa90b1dae8343b05b Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Mon, 7 Jul 2025 17:05:10 +0200 Subject: [PATCH 11/36] Also serrilaize homepage when available --- ckanext/dcat/profiles/euro_dcat_ap_base.py | 8 ++++++-- .../profiles/dcat_ap/test_euro_dcatap_profile_parse.py | 1 + examples/dcat/dataset.rdf | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_base.py b/ckanext/dcat/profiles/euro_dcat_ap_base.py index 56c9540c..ba9f1f2a 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_base.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_base.py @@ -209,9 +209,13 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): homepage = self._object_value(dataset_ref, FOAF.homepage) if homepage: - dataset_dict["homepage"] = homepage + dataset_dict["extras"].append( + {"key": "homepage", "value": homepage} + ) elif config.get("ckan.site_url"): - dataset_dict["homepage"] = config.get("ckan.site_url") + dataset_dict["extras"].append( + {"key": "homepage", "value": config.get("ckan.site_url")} + ) # Resources for distribution in self._distributions(dataset_ref): diff --git a/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_parse.py index 8da1c634..d0252a11 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_parse.py @@ -124,6 +124,7 @@ def _get_extra_value_as_list(key): assert _get_extra_value('access_rights') == 'public' assert _get_extra_value('provenance') == 'Some statement about provenance' assert _get_extra_value('dcat_type') == 'test-type' + assert _get_extra_value('homepage') == 'http://dataset.info.org/home' # Lists assert sorted(_get_extra_value_as_list('language')) == [u'ca', u'en', u'es'] diff --git a/examples/dcat/dataset.rdf b/examples/dcat/dataset.rdf index 5ce71e1c..1235888f 100644 --- a/examples/dcat/dataset.rdf +++ b/examples/dcat/dataset.rdf @@ -19,6 +19,7 @@ Zimbabwe Regional Geochemical Survey. During the period 1982-86 a team of geologists from the British Geological Survey ... http://dataset.info.org + exploration geochemistry geology From 217da9a1cb484d57ca5ba327788cc098b229e01f Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Tue, 8 Jul 2025 16:18:30 +0200 Subject: [PATCH 12/36] Added retention period to healthDCAT --- ckanext/dcat/profiles/euro_health_dcat_ap.py | 70 ++++++++++++++----- ckanext/dcat/schemas/health_dcat_ap.yaml | 12 ++++ .../test_euro_health_dcat_ap_profile_parse.py | 4 ++ ...t_euro_health_dcat_ap_profile_serialize.py | 23 ++++++ examples/ckan/health_dcat_ap.json | 11 ++- examples/dcat/dataset_health.ttl | 6 ++ 6 files changed, 109 insertions(+), 17 deletions(-) diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 49927e9f..cb2748aa 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -1,5 +1,7 @@ from rdflib import XSD, Literal, URIRef, RDF, BNode from rdflib.namespace import Namespace +from rdflib.namespace import DCTERMS as DCT +from .base import CleanedURIRef, resource_uri, SCHEMA from ckanext.dcat.profiles.base import URIRefOrLiteral from ckanext.dcat.profiles.euro_dcat_ap_3 import EuropeanDCATAP3Profile @@ -7,9 +9,6 @@ # HealthDCAT-AP namespace. Note: not finalized yet HEALTHDCATAP = Namespace("http://healthdataportal.eu/ns/health#") -# HealthDCAT-AP namespace. Note: not finalized yet -HEALTHDCATAP = Namespace("http://healthdataportal.eu/ns/health#") - # Data Privacy Vocabulary namespace DPV = Namespace("https://w3id.org/dpv#") @@ -51,22 +50,19 @@ def _parse_health_fields(self, dataset_dict, dataset_ref): agents = self._agents_details(dataset_ref, HEALTHDCATAP.hdab) if agents: dataset_dict["hdab"] = agents - - # Retention period - retention_start, retention_end = self._time_interval( - dataset_ref, HEALTHDCATAP.retentionPeriod, dcat_ap_version=2 - ) - retention_dict = {} - if retention_start is not None: - retention_dict["start"] = retention_start - if retention_end is not None: - retention_dict["end"] = retention_end - if retention_dict: - dataset_dict["retention_period"] = [retention_dict] - + # Add the quality annotations quality_annotations = self._parse_quality_annotation(dataset_ref) if quality_annotations: dataset_dict["quality_annotation"] = quality_annotations + + # Dataset-level retention + dataset_dict["retention_period"] = self._parse_retention_period(dataset_ref) + + # Distribution-level retention + for distribution_ref in self._distributions(dataset_ref): + for resource_dict in dataset_dict.get("resources", []): + if resource_dict["distribution_ref"] == str(distribution_ref): + resource_dict["retention_period"] = self._parse_retention_period(distribution_ref) return dataset_dict @@ -146,6 +142,25 @@ def _parse_quality_annotation(self, dataset_ref): return quality_annotation + def _parse_retention_period(self, subject_ref): + """ + Parses the HEALTHDCATAP.retentionPeriod from the RDF graph for a given subject + (e.g., dataset or distribution). + + Returns a list with a single dict, e.g., + [{"start": "2023-01-01", "end": "2025-01-01"}] + or an empty list if no values are found. + """ + retention_start, retention_end = self._time_interval( + subject_ref, HEALTHDCATAP.retentionPeriod, dcat_ap_version=2 + ) + retention_dict = {} + if retention_start is not None: + retention_dict["start"] = retention_start + if retention_end is not None: + retention_dict["end"] = retention_end + + return [retention_dict] if retention_dict else [] def graph_from_dataset(self, dataset_dict, dataset_ref): super().graph_from_dataset(dataset_dict, dataset_ref) @@ -194,6 +209,29 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): self._add_agents(dataset_ref, dataset_dict, "hdab", HEALTHDCATAP.hdab) self._add_quality_annotation(dataset_dict, dataset_ref) + # Dataset-level retention period + self._add_retention_period(dataset_ref, dataset_dict.get("retention_period", [])) + + for resource_dict in dataset_dict.get("resources", []): + distribution_ref = CleanedURIRef(resource_uri(resource_dict)) + self._add_retention_period(distribution_ref, resource_dict.get("retention_period", [])) + + + def _add_retention_period(self, subject_ref, retention_list): + for retention in retention_list: + start = retention.get("start") + end = retention.get("end") + + if start or end: + period_node = BNode() + self.g.add((subject_ref, HEALTHDCATAP.retentionPeriod, period_node)) + self.g.add((period_node, RDF.type, DCT.PeriodOfTime)) + + if start: + self.g.add((period_node, SCHEMA.startDate, Literal(start, datatype=XSD.date))) + if end: + self.g.add((period_node, SCHEMA.endDate, Literal(end, datatype=XSD.date))) + def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate): """ Adds non-negative integers to the Dataset graph (xsd:nonNegativeInteger) diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index 5f24d4cc..142cde4c 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -715,3 +715,15 @@ resource_fields: - field_name: uri label: URI help_text: An URI for this resource (if not provided it will be autogenerated). + +- field_name: retention_period + label: Retention period + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 1caf10dc..179250d9 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -182,6 +182,10 @@ def test_e2e_dcat_to_ckan(self): "end": "2034-12-31", } ] + assert dataset["resources"][0]["retention_period"] == { + "start": "2020-03-01", + "end": "2034-12-31", + } assert dataset["provenance_activity"] == [{ "uri": "internalURI:wasGeneratedBy0", diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py index 3a7b794f..0336a057 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -173,4 +173,27 @@ def test_e2e_ckan_to_dcat(self): assert value is not None, f"Missing {field} in annotation" assert self._triple(g, annotation_node, URIRef(predicate_uri), URIRef(value)), f"QualityAnnotation {field} mismatch" + + # Extract the distribution node + distributions = list(g.objects(dataset_ref, DCAT.distribution)) + assert len(distributions) > 0, "No distributions found" + distribution_node = distributions[0] + + distribution_details = dataset_dict["resources"][0] + + assert self._triple(g, distribution_node, RDF.type, DCAT.Distribution) + assert self._triple(g, distribution_node, DCT.format, URIRef(distribution_details["format"])) + assert self._triple(g, distribution_node, DCT.identifier, Literal(distribution_details["uri"])) + assert self._triple(g, distribution_node, DCT.license, URIRef(distribution_details["license"])) + assert self._triple(g, distribution_node, DCT.title, Literal(distribution_details["title"])) + assert self._triple(g, distribution_node, DCAT.accessURL, URIRef(distribution_details["access_url"])) + assert self._triple(g, distribution_node, DCAT.downloadURL, URIRef(distribution_details["download_url"])) + + # Check retention period + retention_nodes = list(g.objects(distribution_node, HEALTHDCATAP.retentionPeriod)) + assert len(retention_nodes) == 1, "Expected one retentionPeriod node on distribution" + retention_node = retention_nodes[0] + assert self._triple(g, retention_node, DCT.type, DCT.PeriodOfTime) + assert self._triple(g, retention_node, DCT.startDate, Literal(distribution_details["retention_period"]["start"], datatype=XSD.date)) + assert self._triple(g, retention_node, DCT.endDate, Literal(distribution_details["retention_period"]["end"], datatype=XSD.date)) diff --git a/examples/ckan/health_dcat_ap.json b/examples/ckan/health_dcat_ap.json index 28980740..4a7c67c0 100644 --- a/examples/ckan/health_dcat_ap.json +++ b/examples/ckan/health_dcat_ap.json @@ -232,7 +232,16 @@ "start": "2020-03-01" } ], - "resources": [], + "resources": [ + { + "retention_period": [ + { + "end": "2034-12-31", + "start": "2020-03-01" + } + ] + } + ], "groups": [], "relationships_as_subject": [], "relationships_as_object": [] diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index 35fffe15..3f846385 100644 --- a/examples/dcat/dataset_health.ttl +++ b/examples/dcat/dataset_health.ttl @@ -136,6 +136,12 @@ ]; dct:title "Technical report number of unique study subjects available by environment for project HDBP0250"; dcat:accessURL ; + + [ a dct:PeriodOfTime; + rdfs:comment "As stated in the CSI deliberation"; + dcat:endDate "2034-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; dcat:downloadURL ; dcat:mediaType . From cd8661b65fd011eadad35092f32a96c87566325b Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Thu, 10 Jul 2025 16:55:54 +0200 Subject: [PATCH 13/36] Fix retention period UT --- ckanext/dcat/profiles/euro_health_dcat_ap.py | 4 ++-- .../test_euro_health_dcat_ap_profile_parse.py | 10 ++++++---- ...t_euro_health_dcat_ap_profile_serialize.py | 6 ------ examples/dcat/dataset_health.ttl | 19 +------------------ 4 files changed, 9 insertions(+), 30 deletions(-) diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index cb2748aa..b6ecb2b8 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -1,8 +1,8 @@ from rdflib import XSD, Literal, URIRef, RDF, BNode from rdflib.namespace import Namespace from rdflib.namespace import DCTERMS as DCT -from .base import CleanedURIRef, resource_uri, SCHEMA - +from .base import CleanedURIRef, SCHEMA +from ckanext.dcat.utils import resource_uri from ckanext.dcat.profiles.base import URIRefOrLiteral from ckanext.dcat.profiles.euro_dcat_ap_3 import EuropeanDCATAP3Profile diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 179250d9..2d907f0f 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -182,10 +182,12 @@ def test_e2e_dcat_to_ckan(self): "end": "2034-12-31", } ] - assert dataset["resources"][0]["retention_period"] == { - "start": "2020-03-01", - "end": "2034-12-31", - } + assert dataset["resources"][0]["retention_period"] == [ + { + "start": "2020-03-01", + "end": "2034-12-31", + } + ] assert dataset["provenance_activity"] == [{ "uri": "internalURI:wasGeneratedBy0", diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py index 0336a057..9eec2bc0 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -182,12 +182,6 @@ def test_e2e_ckan_to_dcat(self): distribution_details = dataset_dict["resources"][0] assert self._triple(g, distribution_node, RDF.type, DCAT.Distribution) - assert self._triple(g, distribution_node, DCT.format, URIRef(distribution_details["format"])) - assert self._triple(g, distribution_node, DCT.identifier, Literal(distribution_details["uri"])) - assert self._triple(g, distribution_node, DCT.license, URIRef(distribution_details["license"])) - assert self._triple(g, distribution_node, DCT.title, Literal(distribution_details["title"])) - assert self._triple(g, distribution_node, DCAT.accessURL, URIRef(distribution_details["access_url"])) - assert self._triple(g, distribution_node, DCAT.downloadURL, URIRef(distribution_details["download_url"])) # Check retention period retention_nodes = list(g.objects(distribution_node, HEALTHDCATAP.retentionPeriod)) diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index 3f846385..f8d72b89 100644 --- a/examples/dcat/dataset_health.ttl +++ b/examples/dcat/dataset_health.ttl @@ -94,7 +94,7 @@ adms:sample ; adms:versionNotes "Dataset continuously updated"; dcat:contactPoint ; - # dcat:distribution ; + dcat:distribution ; dcat:hasVersion ; dcat:keyword "Test 1" , "Test 2" , "Test 3"; dcat:spatialResolutionInMeters "10"^^; @@ -248,23 +248,6 @@ a dct:MediaTypeOrExtent . -# -# a dcat:Distribution; -# dcatap:applicableLegislation ; -# dct:description "EU Health Data Access Body For better Healthcare, Research & Policy Making"; -# dct:format ; -# dct:identifier "http://ehelse.healthdataportal.eu/distribution/13a3851d-6cdf-4570-a7f0-7f03015d1925"; -# dct:isPartOf ; -# dct:issued "2024-06-03T08:51:00Z"^^; -# dct:license ; -# dct:modified "2024-06-04T18:00:00Z"^^; -# dct:rights [ a dct:RightsStatement; -# rdfs:label "Access to data is conditional on the issuance of a permit by the HDAB after submission of a data request application (English)" -# ]; -# dct:title "EU Health Data Access Body"; -# dcat:accessURL ; -# dcat:byteSize "80000"^^ . - a prov:Activity; rdfs:label "http://dbpedia.org/resource/Record_linkage"; From 90dac79d02bfcb8322d29f25506cc67b8de7124b Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Thu, 10 Jul 2025 21:08:48 +0200 Subject: [PATCH 14/36] fix test --- ckanext/dcat/profiles/euro_health_dcat_ap.py | 14 +- ...t_euro_health_dcat_ap_profile_serialize.py | 16 +- examples/ckan/health_dcat_ap.json | 528 ++++++++++-------- 3 files changed, 310 insertions(+), 248 deletions(-) diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index b6ecb2b8..253f4ee9 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -1,7 +1,7 @@ -from rdflib import XSD, Literal, URIRef, RDF, BNode +from rdflib import XSD, Literal, URIRef, RDF, BNode, DCAT, RDFS from rdflib.namespace import Namespace from rdflib.namespace import DCTERMS as DCT -from .base import CleanedURIRef, SCHEMA +from .base import CleanedURIRef from ckanext.dcat.utils import resource_uri from ckanext.dcat.profiles.base import URIRefOrLiteral from ckanext.dcat.profiles.euro_dcat_ap_3 import EuropeanDCATAP3Profile @@ -216,21 +216,23 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): distribution_ref = CleanedURIRef(resource_uri(resource_dict)) self._add_retention_period(distribution_ref, resource_dict.get("retention_period", [])) - def _add_retention_period(self, subject_ref, retention_list): for retention in retention_list: start = retention.get("start") end = retention.get("end") + comment = retention.get("comment") - if start or end: + if start or end or comment: period_node = BNode() self.g.add((subject_ref, HEALTHDCATAP.retentionPeriod, period_node)) self.g.add((period_node, RDF.type, DCT.PeriodOfTime)) if start: - self.g.add((period_node, SCHEMA.startDate, Literal(start, datatype=XSD.date))) + self.g.add((period_node, DCAT.startDate, Literal(start, datatype=XSD.date))) if end: - self.g.add((period_node, SCHEMA.endDate, Literal(end, datatype=XSD.date))) + self.g.add((period_node, DCAT.endDate, Literal(end, datatype=XSD.date))) + if comment: + self.g.add((period_node, RDFS.comment, Literal(comment, lang="en"))) def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate): """ diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py index 9eec2bc0..0c523189 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -187,7 +187,17 @@ def test_e2e_ckan_to_dcat(self): retention_nodes = list(g.objects(distribution_node, HEALTHDCATAP.retentionPeriod)) assert len(retention_nodes) == 1, "Expected one retentionPeriod node on distribution" retention_node = retention_nodes[0] - assert self._triple(g, retention_node, DCT.type, DCT.PeriodOfTime) - assert self._triple(g, retention_node, DCT.startDate, Literal(distribution_details["retention_period"]["start"], datatype=XSD.date)) - assert self._triple(g, retention_node, DCT.endDate, Literal(distribution_details["retention_period"]["end"], datatype=XSD.date)) + assert self._triple(g, retention_node, RDF.type, DCT.PeriodOfTime) + assert self._triple( + g, + retention_node, + DCAT.startDate, + Literal(distribution_details["retention_period"][0]["start"], datatype=XSD.date) + ) + assert self._triple( + g, + retention_node, + DCAT.endDate, + Literal(distribution_details["retention_period"][0]["end"], datatype=XSD.date) + ) diff --git a/examples/ckan/health_dcat_ap.json b/examples/ckan/health_dcat_ap.json index 4a7c67c0..88398a3c 100644 --- a/examples/ckan/health_dcat_ap.json +++ b/examples/ckan/health_dcat_ap.json @@ -1,249 +1,299 @@ [ - { - "qualified_attribution": [ + { + "qualified_attribution": [ + { + "agent": [ { - "agent": [{ - "name": "Contact Point", - "email": "healthdata@sciensano.be", - "homepage": "https://healthdata.be", - "type": "", - "identifier": "" - }], - "role": "https://inspire.ec.europa.eu/metadata-codelist/ResponsiblePartyRole/processor" + "name": "Contact Point", + "email": "healthdata@sciensano.be", + "homepage": "https://healthdata.be", + "type": "", + "identifier": "" } ], - "quality_annotation": [ + "role": "https://inspire.ec.europa.eu/metadata-codelist/ResponsiblePartyRole/processor" + } + ], + "quality_annotation": [ + { + "motivated_by": "http://www.w3.org/ns/dqv#qualityAssessment", + "body": "https://acertificateserver.eu/mycertificate", + "target": "https://fair.healthdata.be/dataset/d43a158e-7d13-4660-bbc3-9d3f8d5501e5" + } + ], + "access_rights": "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC", + "analytics": [ + "http://example.com/analytics" + ], + "alternate_identifier": [ + "internalURI:admsIdentifier0" + ], + "applicable_legislation": [ + "http://data.europa.eu/eli/reg/2022/868/oj" + ], + "author": null, + "author_email": null, + "code_values": [ + "http://example.com/code1", + "http://example.com/code2" + ], + "coding_system": [ + "http://www.wikidata.org/entity/P1690", + "http://www.wikidata.org/entity/P4229" + ], + "conforms_to": [ + "http://www.wikidata.org/entity/Q19597236" + ], + "creator_user_id": null, + "dcat_type": "n1049372e768c4429a6b2200c22f5f1a4b7", + "documentation": [ + "n1049372e768c4429a6b2200c22f5f1a4b9" + ], + "frequency": "http://publications.europa.eu/resource/authority/frequency/DAILY", + "health_category": [ + "http://example.com/ontology/resource/authority/healthcategories/PHDR", + "http://example.com/ontology/resource/authority/healthcategories/IDHP", + "http://example.com/ontology/resource/authority/healthcategories/DIOH", + "http://example.com/ontology/resource/authority/healthcategories/EHRS" + ], + "health_theme": [ + "http://www.wikidata.org/entity/Q7907952", + "http://www.wikidata.org/entity/Q58624061" + ], + "id": "e7ccf79d-705c-427f-8e96-f87bcd6e5318", + "identifier": "http://example.com/dataset/1234567890", + "is_referenced_by": [ + "https://doi.org/10.1038/sdata.2016.18", + "https://dx.doi.org/10.1002/jmri.28679" + ], + "isopen": false, + "issued": "2024-01-01T00:00:00+00:00", + "language": [ + "http://publications.europa.eu/resource/authority/language/ENG", + "http://publications.europa.eu/resource/authority/language/NLD", + "http://publications.europa.eu/resource/authority/language/FRA" + ], + "legal_basis": [ + "https://w3id.org/dpv#Consent" + ], + "license_id": "", + "license_title": "", + "maintainer": null, + "maintainer_email": null, + "max_typical_age": "110", + "metadata_created": "2024-12-02T19:00:30.897399", + "metadata_modified": "2024-12-02T19:00:30.897406", + "min_typical_age": "0", + "modified": "2024-12-31T23:59:59+00:00", + "name": "test-dcat-1", + "notes": "This dataset is an example of using HealthDCAT-AP in CKAN", + "num_resources": 0, + "num_tags": 3, + "number_of_records": "123456789", + "number_of_unique_individuals": "7654321", + "organization": null, + "personal_data": [ + "https://w3id.org/dpv/dpv-pd#Age", + "https://w3id.org/dpv/dpv-pd#Gender", + "https://w3id.org/dpv/dpv-pd#HealthRecord" + ], + "population_coverage": [ + "This example includes a very non-descript population" + ], + "private": false, + "provenance": "This example dataset is partly sourced from TEHDAS2", + "publisher_note": [ + "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation." + ], + "publisher_type": [ + "http://example.com/publisherType/undefined" + ], + "trusted_data_holder": true, + "purpose": [ + "https://w3id.org/dpv#AcademicResearch" + ], + "qualified_relation": [ + { + "uri": "", + "relation": "http://example.com/dataset/3.141592", + "role": "http://www.iana.org/assignments/relation/related" + } + ], + "state": "active", + "temporal_resolution": "P1D", + "theme": [ + "http://publications.europa.eu/resource/authority/data-theme/HEAL" + ], + "title": "HealthDCAT-AP test dataset", + "type": "dataset", + "uri": "http://example.healthdata.nl/set/dataset", + "version_notes": "Dataset continuously updated", + "contact": [ + { + "email": "covacsurv@sciensano.be", + "identifier": "", + "name": "Contact Point" + } + ], + "creator": [ + { + "email": "info@example.com", + "identifier": "", + "name": "Contact Point", + "type": "", + "url": "https:/example.com/homepage" + } + ], + "extras": [ + { + "key": "related_resource", + "value": "[\"http://example.com/dataset/9876543210\"]" + }, + { + "key": "sample", + "value": "[\"http://example.com/sample\"]" + }, + { + "key": "spatial_uri", + "value": "http://publications.europa.eu/resource/authority/country/BEL" + } + ], + "hdab": [ + { + "email": "hdab@example.com", + "identifier": "", + "name": "EU Health Data Access Body", + "type": "", + "uri": "", + "url": "https://www.example.com/hdab" + } + ], + "provenance_activity": [ + { + "dct_type": "http://dbpedia.org/resource/Record_linkage", + "label": "http://dbpedia.org/resource/Record_linkage", + "seeAlso": "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp", + "startedAtTime": "2021-01-01T00:00:00+00:00", + "uri": "internalURI:wasGeneratedBy0", + "wasAssociatedWith": [ { - "motivated_by": "http://www.w3.org/ns/dqv#qualityAssessment", - "body": "https://acertificateserver.eu/mycertificate", - "target": "https://fair.healthdata.be/dataset/d43a158e-7d13-4660-bbc3-9d3f8d5501e5" + "homepage": "https://www.sciensano.be/fr/people/joris-van-loenhout", + "email": "mailto:Joris.VanLoenhout@sciensano.be", + "name": "Dr. Joris van Loenhout", + "actedOnBehalfOf": [ + { + "name": "Contact Point" + } + ] } - ], - "access_rights": "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC", - "analytics": [ - "http://example.com/analytics" - ], - "alternate_identifier": [ - "internalURI:admsIdentifier0" - ], - "applicable_legislation": [ - "http://data.europa.eu/eli/reg/2022/868/oj" - ], - "author": null, - "author_email": null, - "code_values": [ - "http://example.com/code1", - "http://example.com/code2" - ], - "coding_system": [ - "http://www.wikidata.org/entity/P1690", - "http://www.wikidata.org/entity/P4229" - ], - "conforms_to": [ - "http://www.wikidata.org/entity/Q19597236" - ], - "creator_user_id": null, - "dcat_type": "n1049372e768c4429a6b2200c22f5f1a4b7", - "documentation": [ - "n1049372e768c4429a6b2200c22f5f1a4b9" - ], - "frequency": "http://publications.europa.eu/resource/authority/frequency/DAILY", - "health_category": [ - "http://example.com/ontology/resource/authority/healthcategories/PHDR", - "http://example.com/ontology/resource/authority/healthcategories/IDHP", - "http://example.com/ontology/resource/authority/healthcategories/DIOH", - "http://example.com/ontology/resource/authority/healthcategories/EHRS" - ], - "health_theme": [ - "http://www.wikidata.org/entity/Q7907952", - "http://www.wikidata.org/entity/Q58624061" - ], - "id": "e7ccf79d-705c-427f-8e96-f87bcd6e5318", - "identifier": "http://example.com/dataset/1234567890", - "is_referenced_by": [ - "https://doi.org/10.1038/sdata.2016.18", - "https://dx.doi.org/10.1002/jmri.28679" - ], - "isopen": false, - "issued": "2024-01-01T00:00:00+00:00", - "language": [ - "http://publications.europa.eu/resource/authority/language/ENG", - "http://publications.europa.eu/resource/authority/language/NLD", - "http://publications.europa.eu/resource/authority/language/FRA" - ], - "legal_basis": [ - "https://w3id.org/dpv#Consent" - ], - "license_id": "", - "license_title": "", - "maintainer": null, - "maintainer_email": null, - "max_typical_age": "110", - "metadata_created": "2024-12-02T19:00:30.897399", - "metadata_modified": "2024-12-02T19:00:30.897406", - "min_typical_age": "0", - "modified": "2024-12-31T23:59:59+00:00", - "name": "test-dcat-1", - "notes": "This dataset is an example of using HealthDCAT-AP in CKAN", - "num_resources": 0, - "num_tags": 3, - "number_of_records": "123456789", - "number_of_unique_individuals": "7654321", - "organization": null, - "personal_data": [ - "https://w3id.org/dpv/dpv-pd#Age", - "https://w3id.org/dpv/dpv-pd#Gender", - "https://w3id.org/dpv/dpv-pd#HealthRecord" - ], - "population_coverage": [ - "This example includes a very non-descript population" - ], - "private": false, - "provenance": "This example dataset is partly sourced from TEHDAS2", - "publisher_note": [ - "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation." - ], - "publisher_type": [ - "http://example.com/publisherType/undefined" - ], - "trusted_data_holder": true, - "purpose": [ - "https://w3id.org/dpv#AcademicResearch" - ], - "qualified_relation": [ - { - "uri": "", - "relation": "http://example.com/dataset/3.141592", - "role": "http://www.iana.org/assignments/relation/related" - } - ], + ] + } + ], + "publisher": [ + { + "email": "info@example.com", + "identifier": "", + "name": "Contact Point", + "type": "", + "uri": "", + "url": "https://healthdata.nl" + } + ], + "retention_period": [ + { + "end": "2034-12-31", + "start": "2020-03-01" + } + ], + "tags": [ + { + "display_name": "Test 1", + "id": "5c418ec2-cb41-4c42-9b9c-f5d1e3a831e5", + "name": "Test 1", "state": "active", - "temporal_resolution": "P1D", - "theme": [ - "http://publications.europa.eu/resource/authority/data-theme/HEAL" - ], - "title": "HealthDCAT-AP test dataset", - "type": "dataset", - "uri": "http://example.healthdata.nl/set/dataset", - "version_notes": "Dataset continuously updated", - "contact": [ - { - "email": "covacsurv@sciensano.be", - "identifier": "", - "name": "Contact Point" - } - ], - "creator": [ - { - "email": "info@example.com", - "identifier": "", - "name": "Contact Point", - "type": "", - "url": "https:/example.com/homepage" - } - ], - "extras": [ - { - "key": "related_resource", - "value": "[\"http://example.com/dataset/9876543210\"]" - }, - { - "key": "sample", - "value": "[\"http://example.com/sample\"]" - }, - { - "key": "spatial_uri", - "value": "http://publications.europa.eu/resource/authority/country/BEL" - } - ], - "hdab": [ - { - "email": "hdab@example.com", - "identifier": "", - "name": "EU Health Data Access Body", - "type": "", - "uri": "", - "url": "https://www.example.com/hdab" - } - ], - "provenance_activity": [ - { - "dct_type": "http://dbpedia.org/resource/Record_linkage", - "label": "http://dbpedia.org/resource/Record_linkage", - "seeAlso": "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp", - "startedAtTime": "2021-01-01T00:00:00+00:00", - "uri": "internalURI:wasGeneratedBy0", - "wasAssociatedWith": [ - { - "homepage": "https://www.sciensano.be/fr/people/joris-van-loenhout", - "email": "mailto:Joris.VanLoenhout@sciensano.be", - "name": "Dr. Joris van Loenhout", - "actedOnBehalfOf": [ - { - "name": "Contact Point" - } - ] - } - ] - } - ], - "publisher": [ - { - "email": "info@example.com", - "identifier": "", - "name": "Contact Point", - "type": "", - "uri": "", - "url": "https://healthdata.nl" - } + "vocabulary_id": null + }, + { + "display_name": "Test 2", + "id": "c4117ace-2114-470d-b6e9-0df7580a12d8", + "name": "Test 2", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "Test 3", + "id": "d5a5288d-3bff-431e-be94-12c71d25d75b", + "name": "Test 3", + "state": "active", + "vocabulary_id": null + } + ], + "temporal_coverage": [ + { + "end": "2024-12-31", + "start": "2020-03-01" + } + ], + "resources": [ + { + "access_url": "", + "applicable_legislation": [], + "availability": "", + "cache_last_updated": null, + "cache_url": null, + "compress_format": "", + "conforms_to": [], + "created": "2025-07-10T18:19:32.840953", + "description": "", + "documentation": [], + "download_url": "", + "format": null, + "hash": "", + "hash_algorithm": "", + "id": "7e65aeba-136d-48d6-a824-782176c63104", + "issued_date": "", + "issued_time": "", + "language": null, + "last_modified": null, + "license": "", + "metadata_modified": "2025-07-10T18:19:32.838228", + "mimetype": "", + "mimetype_inner": null, + "modified_date": "", + "modified_time": "", + "name": "", + "package_format": "", + "package_id": "16a60ea2-965a-4b5a-9a65-1284354c256e", + "position": 0, + "resource_type": null, + "rights": "", + "spatial_resolution_in_meters": "", + "state": "active", + "status": "", + "temporal_resolution": "", + "uri": "", + "url": "", + "url_type": "", + "access_services": [ + { + "access_rights": "", + "endpoint_description": "", + "endpoint_url": [], + "serves_dataset": [], + "title": "", + "uri": "" + } ], "retention_period": [ - { - "end": "2034-12-31", - "start": "2020-03-01" - } - ], - "tags": [ - { - "display_name": "Test 1", - "id": "5c418ec2-cb41-4c42-9b9c-f5d1e3a831e5", - "name": "Test 1", - "state": "active", - "vocabulary_id": null - }, - { - "display_name": "Test 2", - "id": "c4117ace-2114-470d-b6e9-0df7580a12d8", - "name": "Test 2", - "state": "active", - "vocabulary_id": null - }, - { - "display_name": "Test 3", - "id": "d5a5288d-3bff-431e-be94-12c71d25d75b", - "name": "Test 3", - "state": "active", - "vocabulary_id": null - } - ], - "temporal_coverage": [ - { - "end": "2024-12-31", - "start": "2020-03-01" - } - ], - "resources": [ - { - "retention_period": [ - { - "end": "2034-12-31", - "start": "2020-03-01" - } - ] - } - ], - "groups": [], - "relationships_as_subject": [], - "relationships_as_object": [] - } + { + "end": "2025-07-18", + "start": "2025-07-10" + } + ] + } + ], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } ] \ No newline at end of file From 1294f4aab27d0f5b074f0abb49aee7d8c80b0254 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 2 Jul 2025 14:23:04 +0200 Subject: [PATCH 15/36] feat(missing field) add missing fields - provenance_activity - qualified_attribution - quality_annotation - retention period (distribution) - homepage (catalog) --- ckanext/dcat/profiles/base.py | 53 +- ckanext/dcat/profiles/euro_dcat_ap_2.py | 87 +++- ckanext/dcat/profiles/euro_dcat_ap_base.py | 13 +- ckanext/dcat/profiles/euro_health_dcat_ap.py | 148 +++++- ckanext/dcat/schemas/dcat_ap_full.yaml | 84 +++ ckanext/dcat/schemas/health_dcat_ap.yaml | 112 ++++ .../dcat_ap/test_euro_dcatap_profile_parse.py | 1 + .../test_euro_dcatap_profile_serialize.py | 2 +- .../test_euro_health_dcat_ap_profile_parse.py | 40 ++ ...t_euro_health_dcat_ap_profile_serialize.py | 102 +++- examples/ckan/health_dcat_ap.json | 487 +++++++++++------- examples/dcat/dataset.rdf | 1 + examples/dcat/dataset_health.ttl | 27 +- 13 files changed, 921 insertions(+), 236 deletions(-) diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 29802793..299f74bc 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -7,7 +7,7 @@ from ckantoolkit import ObjectNotFound, asbool, aslist, config, get_action, url_for from dateutil.parser import parse as parse_date from geomet import InvalidGeoJSONException, wkt -from rdflib import BNode, Literal, URIRef, term +from rdflib import BNode, Literal, URIRef, term, PROV from rdflib.namespace import ORG, RDF, RDFS, SKOS, XSD, Namespace from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS @@ -95,7 +95,6 @@ def __new__(cls, value, lang=None): # In case something goes wrong: use Literal return Literal(value, lang=lang) - class CleanedURIRef(object): """Performs some basic URL encoding on value before creating an URIRef object. @@ -547,9 +546,13 @@ def _agents_details(self, subject, predicate): ) agent_details["url"] = self._object_value(agent, FOAF.homepage) agent_details["type"] = self._object_value(agent, DCT.type) - agent_details['identifier'] = self._object_value(agent, DCT.identifier) - agents.append(agent_details) + agent_details["identifier"] = self._object_value(agent, DCT.identifier) + + acted_orgs = self._agents_details(agent, PROV.actedOnBehalfOf) + if acted_orgs: + agent_details["actedOnBehalfOf"] = acted_orgs + agents.append(agent_details) return agents def _contact_details(self, subject, predicate): @@ -819,6 +822,48 @@ def _read_list_value(self, value): return items + def _add_agent_to_graph(self, subject_ref, predicate, agent_dict): + """ + Serializes a foaf:Agent or foaf:Organization with optional subfields into the RDF graph. + + Parameters: + - subject_ref: The RDF subject (dataset, activity, etc.) + - predicate: The RDF predicate (e.g., dct:publisher, prov:wasAssociatedWith, dcat:agent) + - agent_dict: A dict with agent metadata (e.g., name, email, homepage, type, identifier, actedOnBehalfOf) + """ + uri = agent_dict.get("uri", "").strip() + + agent_ref = URIRefOrLiteral(uri) if uri else BNode() + + self.g.add((subject_ref, predicate, agent_ref)) + self.g.add((agent_ref, RDF.type, FOAF.Organization)) + self.g.add((agent_ref, RDF.type, FOAF.Agent)) + + if agent_dict.get("name"): + self.g.add((agent_ref, FOAF.name, Literal(agent_dict["name"]))) + if agent_dict.get("email"): + email = agent_dict["email"] + if not email.startswith("mailto:"): + email = f"mailto:{email}" + self.g.add((agent_ref, FOAF.mbox, URIRef(email))) + if agent_dict.get("url"): + self.g.add((agent_ref, FOAF.homepage, URIRef(agent_dict["url"]))) + if agent_dict.get("homepage"): + self.g.add((agent_ref, FOAF.homepage, URIRef(agent_dict["homepage"]))) + if agent_dict.get("type"): + self.g.add((agent_ref, DCT.type, URIRef(agent_dict["type"]))) + if agent_dict.get("identifier"): + self.g.add((agent_ref, DCT.identifier, Literal(agent_dict["identifier"]))) + + for sub_org in agent_dict.get("actedOnBehalfOf", []): + if sub_org.get("name"): + org_ref = BNode() + self.g.add((agent_ref, PROV.actedOnBehalfOf, org_ref)) + self.g.add((org_ref, RDF.type, PROV.Organization)) + self.g.add((org_ref, FOAF.name, Literal(sub_org["name"]))) + + return agent_ref + def _add_spatial_value_to_graph(self, spatial_ref, predicate, value): """ Adds spatial triples to the graph. Assumes that value is a GeoJSON string diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index e5204be1..fe7e0570 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -1,7 +1,7 @@ import json from decimal import Decimal, DecimalException -from rdflib import URIRef, BNode, Literal, Namespace +from rdflib import URIRef, BNode, Literal, Namespace, FOAF, PROV, RDF, RDFS from ckanext.dcat.utils import resource_uri from .base import URIRefOrLiteral, CleanedURIRef @@ -18,7 +18,6 @@ from .euro_dcat_ap_base import BaseEuropeanDCATAPProfile - ELI = Namespace("http://data.europa.eu/eli/ontology#") @@ -65,6 +64,32 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): # Call base super method for common properties super().parse_dataset(dataset_dict, dataset_ref) + # --- Provenance deserialization --- + was_generated_by = self.g.value(dataset_ref, PROV.wasGeneratedBy) + if was_generated_by: + activity_dict = {} + activity_dict["uri"] = str(was_generated_by) + activity_dict["type"] = [ + str(t) for t in self.g.objects(was_generated_by, RDF.type) + ] + activity_dict["label"] = self._object_value(was_generated_by, RDFS.label) + activity_dict["seeAlso"] = self._object_value(was_generated_by, RDFS.seeAlso) + activity_dict["dct_type"] = self._object_value(was_generated_by, DCT.type) + activity_dict["startedAtTime"] = self._object_value( + was_generated_by, PROV.startedAtTime + ) + + agents = self._agents_details(was_generated_by, PROV.wasAssociatedWith) + if agents: + activity_dict["wasAssociatedWith"] = [agents[0]] # Only take the first agent + + dataset_dict["provenance_activity"] = [activity_dict] + + # --- Qualified Attribution --- + qualified_attributions = self._parse_qualified_attributions(dataset_ref) + if qualified_attributions: + dataset_dict["qualified_attribution"] = qualified_attributions + # Standard values value = self._object_value(dataset_ref, DCAT.temporalResolution) if value: @@ -246,6 +271,44 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): _datatype=datatype, _class=_class, ) + + # --- Provenance serialization --- + activities = dataset_dict.get("provenance_activity", []) + + for activity in activities: + activity_uri = URIRef(activity.get("uri")) if activity.get("uri") else BNode() + self.g.add((dataset_ref, PROV.wasGeneratedBy, activity_uri)) + self.g.add((activity_uri, RDF.type, PROV.Activity)) + + if activity.get("label"): + self.g.add((activity_uri, RDFS.label, Literal(activity["label"]))) + if activity.get("seeAlso"): + self.g.add((activity_uri, RDFS.seeAlso, URIRef(activity["seeAlso"]))) + if activity.get("dct_type"): + self.g.add((activity_uri, DCT.type, URIRef(activity["dct_type"]))) + if activity.get("startedAtTime"): + self.g.add((activity_uri, PROV.startedAtTime, Literal(activity["startedAtTime"], datatype=XSD.dateTime))) + + for agent_dict in activity.get("wasAssociatedWith", []): + self._add_agent_to_graph(activity_uri, PROV.wasAssociatedWith, agent_dict) + + # Qualified Attribution + qualified_attributions = dataset_dict.get("qualified_attribution", []) + for attr in qualified_attributions: + attr_ref = BNode() + self.g.add((dataset_ref, DCAT.qualifiedAttribution, attr_ref)) + self.g.add((attr_ref, RDF.type, DCAT.Attribution)) + + agent_list = attr.get("agent", []) + for agent_dict in agent_list: + if isinstance(agent_dict, dict): + self._add_agent_to_graph(attr_ref, DCAT.agent, agent_dict) + elif isinstance(agent_dict, str): + self.g.add((attr_ref, DCAT.agent, URIRef(agent_dict))) + role = attr.get("role") + if role: + self.g.add((attr_ref, DCAT.hadRole, URIRef(role))) + # Temporal @@ -448,3 +511,23 @@ def _graph_from_dataset_v2_only(self, dataset_dict, dataset_ref): _type=URIRefOrLiteral, _class=ADMS.Identifier, ) + + def _parse_qualified_attributions(self, dataset_ref): + attributions = [] + for qual_attr_ref in self.g.objects(dataset_ref, PROV.qualifiedAttribution): + attr = {} + + # Get role + for role_ref in self.g.objects(qual_attr_ref, DCAT.hadRole): + attr["role"] = str(role_ref) + break + + # Get agent (using shared logic) + agent_details = self._agents_details(qual_attr_ref, PROV.agent) + if agent_details: + attr["agent"] = agent_details + + if attr: + attributions.append(attr) + + return attributions diff --git a/ckanext/dcat/profiles/euro_dcat_ap_base.py b/ckanext/dcat/profiles/euro_dcat_ap_base.py index e42c6afb..ba9f1f2a 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_base.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_base.py @@ -206,6 +206,17 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): src_data = self._extract_catalog_dict(catalog_src) dataset_dict["extras"].extend(src_data) + + homepage = self._object_value(dataset_ref, FOAF.homepage) + if homepage: + dataset_dict["extras"].append( + {"key": "homepage", "value": homepage} + ) + elif config.get("ckan.site_url"): + dataset_dict["extras"].append( + {"key": "homepage", "value": config.get("ckan.site_url")} + ) + # Resources for distribution in self._distributions(dataset_ref): @@ -778,4 +789,4 @@ def _graph_from_catalog_base(self, catalog_dict, catalog_ref): # Dates modified = self._last_catalog_modification() if modified: - self._add_date_triple(catalog_ref, DCT.modified, modified) + self._add_date_triple(catalog_ref, DCT.modified, modified) \ No newline at end of file diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 44fca9ac..253f4ee9 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -1,6 +1,8 @@ -from rdflib import XSD, Literal, URIRef +from rdflib import XSD, Literal, URIRef, RDF, BNode, DCAT, RDFS from rdflib.namespace import Namespace - +from rdflib.namespace import DCTERMS as DCT +from .base import CleanedURIRef +from ckanext.dcat.utils import resource_uri from ckanext.dcat.profiles.base import URIRefOrLiteral from ckanext.dcat.profiles.euro_dcat_ap_3 import EuropeanDCATAP3Profile @@ -10,6 +12,12 @@ # Data Privacy Vocabulary namespace DPV = Namespace("https://w3id.org/dpv#") +# Data Quality Vocabulary namespace +DQV = Namespace("http://www.w3.org/ns/dqv#") + +# Open Annotation namespace +OA = Namespace("http://www.w3.org/ns/oa#") + namespaces = { "healthdcatap": HEALTHDCATAP, "dpv": DPV, @@ -42,18 +50,19 @@ def _parse_health_fields(self, dataset_dict, dataset_ref): agents = self._agents_details(dataset_ref, HEALTHDCATAP.hdab) if agents: dataset_dict["hdab"] = agents + # Add the quality annotations + quality_annotations = self._parse_quality_annotation(dataset_ref) + if quality_annotations: + dataset_dict["quality_annotation"] = quality_annotations + + # Dataset-level retention + dataset_dict["retention_period"] = self._parse_retention_period(dataset_ref) - # Retention period - retention_start, retention_end = self._time_interval( - dataset_ref, HEALTHDCATAP.retentionPeriod, dcat_ap_version=2 - ) - retention_dict = {} - if retention_start is not None: - retention_dict["start"] = retention_start - if retention_end is not None: - retention_dict["end"] = retention_end - if retention_dict: - dataset_dict["retention_period"] = [retention_dict] + # Distribution-level retention + for distribution_ref in self._distributions(dataset_ref): + for resource_dict in dataset_dict.get("resources", []): + if resource_dict["distribution_ref"] == str(distribution_ref): + resource_dict["retention_period"] = self._parse_retention_period(distribution_ref) return dataset_dict @@ -96,6 +105,62 @@ def __parse_healthdcat_booleanvalues(self, dataset_dict, dataset_ref): lowered = value.lower() if lowered in ("true", "false"): dataset_dict[key] = lowered == "true" + + def _parse_quality_annotation(self, dataset_ref): + """ + Parse DQV quality annotations from the RDF graph. + + Returns a list of quality annotation dictionaries. + Only includes annotations where body and target are valid URIs. + """ + quality_annotation = [] + + # Find all quality annotations for this dataset + for annotation_ref in self.g.objects(dataset_ref, DQV.hasQualityAnnotation): + annotation_dict = {} + + # Get the body (must be a URI) + body = self._object_value(annotation_ref, OA.hasBody) + if body and isinstance(body, str) and body.startswith(("http://", "https://")): + annotation_dict["body"] = body + + # Get the target (must be a URI) + target = self._object_value(annotation_ref, OA.hasTarget) + if target and isinstance(target, str) and target.startswith(("http://", "https://")): + annotation_dict["target"] = target + + # Only include the annotation if both body and target are valid URIs + if "body" not in annotation_dict or "target" not in annotation_dict: + continue + + # Get the motivation (URI or literal) + motivation = self._object_value(annotation_ref, OA.motivatedBy) + if motivation: + annotation_dict["motivated_by"] = motivation + + quality_annotation.append(annotation_dict) + + return quality_annotation + + def _parse_retention_period(self, subject_ref): + """ + Parses the HEALTHDCATAP.retentionPeriod from the RDF graph for a given subject + (e.g., dataset or distribution). + + Returns a list with a single dict, e.g., + [{"start": "2023-01-01", "end": "2025-01-01"}] + or an empty list if no values are found. + """ + retention_start, retention_end = self._time_interval( + subject_ref, HEALTHDCATAP.retentionPeriod, dcat_ap_version=2 + ) + retention_dict = {} + if retention_start is not None: + retention_dict["start"] = retention_start + if retention_end is not None: + retention_dict["end"] = retention_end + + return [retention_dict] if retention_dict else [] def graph_from_dataset(self, dataset_dict, dataset_ref): super().graph_from_dataset(dataset_dict, dataset_ref) @@ -142,6 +207,32 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): self._add_nonneg_integer_triple(dataset_dict, dataset_ref, key, predicate) self._add_agents(dataset_ref, dataset_dict, "hdab", HEALTHDCATAP.hdab) + self._add_quality_annotation(dataset_dict, dataset_ref) + + # Dataset-level retention period + self._add_retention_period(dataset_ref, dataset_dict.get("retention_period", [])) + + for resource_dict in dataset_dict.get("resources", []): + distribution_ref = CleanedURIRef(resource_uri(resource_dict)) + self._add_retention_period(distribution_ref, resource_dict.get("retention_period", [])) + + def _add_retention_period(self, subject_ref, retention_list): + for retention in retention_list: + start = retention.get("start") + end = retention.get("end") + comment = retention.get("comment") + + if start or end or comment: + period_node = BNode() + self.g.add((subject_ref, HEALTHDCATAP.retentionPeriod, period_node)) + self.g.add((period_node, RDF.type, DCT.PeriodOfTime)) + + if start: + self.g.add((period_node, DCAT.startDate, Literal(start, datatype=XSD.date))) + if end: + self.g.add((period_node, DCAT.endDate, Literal(end, datatype=XSD.date))) + if comment: + self.g.add((period_node, RDFS.comment, Literal(comment, lang="en"))) def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate): """ @@ -167,5 +258,36 @@ def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate): except (ValueError, TypeError): self.g.add((dataset_ref, predicate, Literal(value))) + def _add_quality_annotation(self, dataset_dict, dataset_ref): + """ + Serialize qualified_annotation entries into RDF as DQV.QualityAnnotations. + Only URI-based body, target, and motivation values are supported. + """ + quality_annotation = self._get_dict_value(dataset_dict, "quality_annotation") + + if not quality_annotation: + return + + for annotation in quality_annotation: + if not isinstance(annotation, dict): + continue + + annotation_ref = BNode() + + # Link from dataset + self.g.add((dataset_ref, DQV.hasQualityAnnotation, annotation_ref)) + self.g.add((annotation_ref, RDF.type, OA.Annotation)) + + # URI-based fields only + for field, predicate in [ + ("body", OA.hasBody), + ("target", OA.hasTarget), + ("motivated_by", OA.motivatedBy), + ]: + uri = annotation.get(field) + if isinstance(uri, str) and uri.startswith(("http://", "https://")): + self.g.add((annotation_ref, predicate, URIRef(uri))) + + def graph_from_catalog(self, catalog_dict, catalog_ref): super().graph_from_catalog(catalog_dict, catalog_ref) diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml index e6fb4a37..8cebe622 100644 --- a/ckanext/dcat/schemas/dcat_ap_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_full.yaml @@ -112,6 +112,11 @@ dataset_fields: label: Identifier help_text: Unique identifier for the creator, such as an ORCID or ROR ID. +- field_name: homepage + label: Homepage + display_snippet: link.html + help_text: A web page that acts as the homepage for the dataset. + - field_name: license_id label: License form_snippet: license.html @@ -285,6 +290,85 @@ dataset_fields: help_text: The function of an entity or agent with respect to another entity or resource. help_text: A description of a relationship with another resource. +- field_name: provenance_activity + label: Provenance Activity + repeating_label: Provenance Activity + repeating_once: true + repeating_subfields: + - field_name: uri + label: Activity URI + help_text: URI of the provenance activity (if available). + - field_name: label + label: Label + help_text: Human-readable label for the activity. + - field_name: seeAlso + label: See Also + help_text: Related link for the activity. + - field_name: dct_type + label: Type + help_text: Type of the activity (URI). + - field_name: startedAtTime + label: Started At Time + help_text: When the activity started (ISO 8601). + - field_name: wasAssociatedWith + label: Associated Agent + repeating_label: Agent + repeating_once: true + repeating_subfields: + - field_name: uri + label: URI + - field_name: name + label: Name + - field_name: email + label: Email + - field_name: homepage + label: Homepage + - field_name: type + label: Type + - field_name: identifier + label: Identifier + - field_name: url + label: URL + - field_name: actedOnBehalfOf + label: Acted On Behalf Of + repeating_label: Organization + repeating_once: true + repeating_subfields: + - field_name: name + label: Organization Name + help_text: Structured provenance activity information, including agent and organization. + +# Add qualified_attribution field here, just before the commented-out hvd_category field +- field_name: qualified_attribution + label: Qualified Attribution + repeating_label: Attribution + repeating_once: true + repeating_subfields: + - field_name: agent + label: Agent + repeating_label: Agent + repeating_once: true + repeating_subfields: + - field_name: uri + label: URI + - field_name: name + label: Name + - field_name: email + label: Email + - field_name: homepage + label: Homepage + - field_name: type + label: Type + - field_name: identifier + label: Identifier + - field_name: url + label: URL + - field_name: role + label: Role + help_text: Role of the agent (e.g., data processor, contributor). + help_text: Structured qualified attribution information including agent and role. + + #- field_name: hvd_category # label: HVD Category # preset: multiple_text diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index c8624396..142cde4c 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -109,6 +109,11 @@ dataset_fields: label: Identifier help_text: Unique identifier for the creator, such as an ORCID or ROR ID. +- field_name: homepage + label: Homepage + display_snippet: link.html + help_text: A web page that acts as the homepage for the dataset. + - field_name: license_id label: License form_snippet: license.html @@ -444,11 +449,106 @@ dataset_fields: help_text: The function of an entity or agent with respect to another entity or resource. help_text: A description of a relationship with another resource. +- field_name: provenance_activity + label: Provenance Activity + repeating_label: Provenance Activity + repeating_once: true + repeating_subfields: + - field_name: uri + label: Activity URI + help_text: URI of the provenance activity (if available). + - field_name: label + label: Label + help_text: Human-readable label for the activity. + - field_name: seeAlso + label: See Also + help_text: Related link for the activity. + - field_name: dct_type + label: Type + help_text: Type of the activity (URI). + - field_name: startedAtTime + label: Started At Time + help_text: When the activity started (ISO 8601). + - field_name: wasAssociatedWith + label: Associated Agent + repeating_label: Agent + repeating_once: true + repeating_subfields: + - field_name: uri + label: URI + - field_name: name + label: Name + - field_name: email + label: Email + - field_name: homepage + label: Homepage + - field_name: type + label: Type + - field_name: identifier + label: Identifier + - field_name: url + label: URL + - field_name: actedOnBehalfOf + label: Acted On Behalf Of + repeating_label: Organization + repeating_once: true + repeating_subfields: + - field_name: name + label: Organization Name + help_text: Structured provenance activity information, including agent and organization. + # Note: if not provided, this will be autogenerated - field_name: uri label: URI help_text: An URI for this dataset (if not provided it will be autogenerated). +- field_name: qualified_attribution + label: Qualified Attribution + repeating_label: Attribution + repeating_once: true + repeating_subfields: + - field_name: agent + label: Agent + repeating_label: Agent + repeating_once: true + repeating_subfields: + - field_name: uri + label: URI + - field_name: name + label: Name + - field_name: email + label: Email + - field_name: homepage + label: Homepage + - field_name: type + label: Type + - field_name: identifier + label: Identifier + - field_name: url + label: URL + - field_name: role + label: Role + help_text: Role of the agent (e.g., data processor, contributor). + help_text: Structured qualified attribution information including agent and role. + +- field_name: quality_annotation + label: Quality annotations + repeating_label: Quality annotation + repeating_subfields: + - field_name: body + label: Body + help_text: The content of the quality annotation (e.g., URL to certificate, measurement value, assessment result). + - field_name: target + label: Target + help_text: The specific aspect of the dataset being annotated (e.g., URI or description of what is being assessed). + - field_name: motivated_by + label: Motivated by + help_text: The motivation or reason for the quality annotation. + help_text: > + Quality annotations provide information about the quality of the dataset, including certifications, + measurements, and assessments. These annotations follow the Data Quality Vocabulary (DQV) + and Web Annotation standards. + # TODO: relation-based properties are not yet included (e.g. is_version_of, source, sample, etc) # resource_fields: @@ -615,3 +715,15 @@ resource_fields: - field_name: uri label: URI help_text: An URI for this resource (if not provided it will be autogenerated). + +- field_name: retention_period + label: Retention period + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date diff --git a/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_parse.py index 8da1c634..d0252a11 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_parse.py @@ -124,6 +124,7 @@ def _get_extra_value_as_list(key): assert _get_extra_value('access_rights') == 'public' assert _get_extra_value('provenance') == 'Some statement about provenance' assert _get_extra_value('dcat_type') == 'test-type' + assert _get_extra_value('homepage') == 'http://dataset.info.org/home' # Lists assert sorted(_get_extra_value_as_list('language')) == [u'ca', u'en', u'es'] diff --git a/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_serialize.py index 826aef47..88826149 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_serialize.py @@ -1514,4 +1514,4 @@ def test_dont_set_missing_license_for_resource_config_param_value_false(self): assert str(distribution) == utils.resource_uri(resource) # Verify that the license of the dataset is not in the distribution - assert not self._triple(g, distribution, DCT.license, URIRef(dataset['license_id'])) + assert not self._triple(g, distribution, DCT.license, URIRef(dataset['license_id'])) \ No newline at end of file diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 1d7b53c5..2d907f0f 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -182,3 +182,43 @@ def test_e2e_dcat_to_ckan(self): "end": "2034-12-31", } ] + assert dataset["resources"][0]["retention_period"] == [ + { + "start": "2020-03-01", + "end": "2034-12-31", + } + ] + + assert dataset["provenance_activity"] == [{ + "uri": "internalURI:wasGeneratedBy0", + "label": "http://dbpedia.org/resource/Record_linkage", + "seeAlso": "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp", + "dct_type": "http://dbpedia.org/resource/Record_linkage", + "startedAtTime": "2021-01-01T00:00:00+00:00", + "wasAssociatedWith": [{ + "name": "Dr. Joris van Loenhout", + "url": "https://www.sciensano.be/fr/people/joris-van-loenhout", + "email": "Joris.VanLoenhout@sciensano.be", + "type": "", + "uri": "", + "identifier": "", + "actedOnBehalfOf": [{ + "name": "Contact Point" + }] + }] + }] + + assert dataset["qualified_attribution"][0]["role"] == "https://inspire.ec.europa.eu/metadata-codelist/ResponsiblePartyRole/processor" + + agent = dataset["qualified_attribution"][0]["agent"][0] + assert agent["name"] == "Contact Point" + assert agent["email"] == "healthdata@sciensano.be" + assert agent["url"] == "https://healthdata.be" + assert agent["type"] == "" + assert agent["identifier"] == "" + + # DQV Quality Annotation + assert len(dataset["quality_annotation"]) == 1 + assert dataset["quality_annotation"][0]["body"] == "https://certificates.theodi.org/en/datasets/393/certificate" + assert dataset["quality_annotation"][0]["target"] == "https://certificates.theodi.org/en/datasets/393" + assert dataset["quality_annotation"][0]["motivated_by"] == "http://www.w3.org/ns/dqv#qualityAssessment" diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py index b6e5c5c6..0c523189 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -3,9 +3,9 @@ import pytest from ckan.tests.helpers import call_action from geomet import wkt -from rdflib import Graph -from rdflib.namespace import RDF +from rdflib import Graph, PROV, Literal from rdflib.term import URIRef +from rdflib.namespace import Namespace from ckanext.dcat import utils from ckanext.dcat.processors import RDFSerializer @@ -30,6 +30,8 @@ DCAT_AP_PROFILES = ["euro_dcat_ap_3"] +# Open Annotation namespace +OA = Namespace("http://www.w3.org/ns/oa#") @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @@ -103,3 +105,99 @@ def test_e2e_ckan_to_dcat(self): assert self._triple( g, relation[0][2], predicate, value ), f"relation Predicate {predicate} does not have value {value}" + + # Test provenance activity + provenance = [t for t in g.triples((dataset_ref, PROV.wasGeneratedBy, None))] + assert len(provenance) == 1 + activity_node = provenance[0][2] + activity_items = [ + (RDF.type, PROV.Activity), + (RDFS.label, Literal(dataset_dict["provenance_activity"][0]["label"])), + (RDFS.seeAlso, URIRef(dataset_dict["provenance_activity"][0]["seeAlso"])), + (DCT.type, URIRef(dataset_dict["provenance_activity"][0]["dct_type"])), + (PROV.startedAtTime, Literal(dataset_dict["provenance_activity"][0]["startedAtTime"], datatype=XSD.dateTime)), + ] + for predicate, value in activity_items: + assert self._triple(g, activity_node, predicate, value), f"Provenance {predicate} mismatch" + + agent_triple = list(g.objects(activity_node, PROV.wasAssociatedWith)) + assert len(agent_triple) == 1 + agent_node = agent_triple[0] + agent_items = [ + (RDF.type, PROV.Agent), + (FOAF.name, Literal(dataset_dict["provenance_activity"][0]["wasAssociatedWith"][0]["name"])), + (FOAF.homepage, URIRef(dataset_dict["provenance_activity"][0]["wasAssociatedWith"][0]["homepage"])), + (FOAF.mbox, URIRef(dataset_dict["provenance_activity"][0]["wasAssociatedWith"][0]["email"])), + ] + + acted_on = list(g.objects(agent_node, PROV.actedOnBehalfOf)) + assert len(acted_on) == 1 + org_node = acted_on[0] + assert self._triple(g, org_node, FOAF.name, Literal(dataset_dict["provenance_activity"][0]["wasAssociatedWith"][0]["actedOnBehalfOf"][0]["name"])) + + # Test qualified attribution + attributions = [t for t in g.triples((dataset_ref, DCAT.qualifiedAttribution, None))] + assert len(attributions) == 1 + attr_node = attributions[0][2] + assert self._triple(g, attr_node, RDF.type, DCAT.Attribution) + assert self._triple(g, attr_node, DCAT.hadRole, URIRef(dataset_dict["qualified_attribution"][0]["role"])) + + agent_node = list(g.objects(attr_node, DCAT.agent))[0] + agent_details = dataset_dict["qualified_attribution"][0]["agent"][0] + agent_items = [ + (RDF.type, FOAF.Organization), + (FOAF.name, Literal(agent_details["name"])), + (FOAF.mbox, URIRef("mailto:" + agent_details["email"])), + (FOAF.homepage, URIRef(agent_details["homepage"])), + ] + for predicate, value in agent_items: + assert self._triple(g, agent_node, predicate, value), f"QualifiedAttribution Agent {predicate} mismatch" + + # Test qualified annotation + annotations = [t for t in + g.triples((dataset_ref, URIRef("http://www.w3.org/ns/dqv#hasQualityAnnotation"), None))] + assert len(annotations) == 1, "Expected one dqv:hasQualityAnnotation triple" + + annotation_node = annotations[0][2] + assert self._triple(g, annotation_node, RDF.type, URIRef("http://www.w3.org/ns/oa#Annotation")) + + annotation_details = dataset_dict["quality_annotation"][0] + + # Assert URI-based fields + for field, predicate_uri in [ + ("motivated_by", OA.motivatedBy), + ("body", OA.hasBody), + ("target", OA.hasTarget), + ]: + value = annotation_details.get(field) + assert value is not None, f"Missing {field} in annotation" + assert self._triple(g, annotation_node, URIRef(predicate_uri), + URIRef(value)), f"QualityAnnotation {field} mismatch" + + # Extract the distribution node + distributions = list(g.objects(dataset_ref, DCAT.distribution)) + assert len(distributions) > 0, "No distributions found" + distribution_node = distributions[0] + + distribution_details = dataset_dict["resources"][0] + + assert self._triple(g, distribution_node, RDF.type, DCAT.Distribution) + + # Check retention period + retention_nodes = list(g.objects(distribution_node, HEALTHDCATAP.retentionPeriod)) + assert len(retention_nodes) == 1, "Expected one retentionPeriod node on distribution" + retention_node = retention_nodes[0] + assert self._triple(g, retention_node, RDF.type, DCT.PeriodOfTime) + assert self._triple( + g, + retention_node, + DCAT.startDate, + Literal(distribution_details["retention_period"][0]["start"], datatype=XSD.date) + ) + assert self._triple( + g, + retention_node, + DCAT.endDate, + Literal(distribution_details["retention_period"][0]["end"], datatype=XSD.date) + ) + diff --git a/examples/ckan/health_dcat_ap.json b/examples/ckan/health_dcat_ap.json index 26450ee8..88398a3c 100644 --- a/examples/ckan/health_dcat_ap.json +++ b/examples/ckan/health_dcat_ap.json @@ -1,200 +1,299 @@ [ - { - "access_rights": "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC", - "analytics": [ - "http://example.com/analytics" + { + "qualified_attribution": [ + { + "agent": [ + { + "name": "Contact Point", + "email": "healthdata@sciensano.be", + "homepage": "https://healthdata.be", + "type": "", + "identifier": "" + } ], - "alternate_identifier": [ - "internalURI:admsIdentifier0" - ], - "applicable_legislation": [ - "http://data.europa.eu/eli/reg/2022/868/oj" - ], - "author": null, - "author_email": null, - "code_values": [ - "http://example.com/code1", - "http://example.com/code2" - ], - "coding_system": [ - "http://www.wikidata.org/entity/P1690", - "http://www.wikidata.org/entity/P4229" - ], - "conforms_to": [ - "http://www.wikidata.org/entity/Q19597236" - ], - "creator_user_id": null, - "dcat_type": "n1049372e768c4429a6b2200c22f5f1a4b7", - "documentation": [ - "n1049372e768c4429a6b2200c22f5f1a4b9" - ], - "frequency": "http://publications.europa.eu/resource/authority/frequency/DAILY", - "health_category": [ - "http://example.com/ontology/resource/authority/healthcategories/PHDR", - "http://example.com/ontology/resource/authority/healthcategories/IDHP", - "http://example.com/ontology/resource/authority/healthcategories/DIOH", - "http://example.com/ontology/resource/authority/healthcategories/EHRS" - ], - "health_theme": [ - "http://www.wikidata.org/entity/Q7907952", - "http://www.wikidata.org/entity/Q58624061" - ], - "id": "e7ccf79d-705c-427f-8e96-f87bcd6e5318", - "identifier": "http://example.com/dataset/1234567890", - "is_referenced_by": [ - "https://doi.org/10.1038/sdata.2016.18", - "https://dx.doi.org/10.1002/jmri.28679" - ], - "isopen": false, - "issued": "2024-01-01T00:00:00+00:00", - "language": [ - "http://publications.europa.eu/resource/authority/language/ENG", - "http://publications.europa.eu/resource/authority/language/NLD", - "http://publications.europa.eu/resource/authority/language/FRA" - ], - "legal_basis": [ - "https://w3id.org/dpv#Consent" - ], - "license_id": "", - "license_title": "", - "maintainer": null, - "maintainer_email": null, - "max_typical_age": "110", - "metadata_created": "2024-12-02T19:00:30.897399", - "metadata_modified": "2024-12-02T19:00:30.897406", - "min_typical_age": "0", - "modified": "2024-12-31T23:59:59+00:00", - "name": "test-dcat-1", - "notes": "This dataset is an example of using HealthDCAT-AP in CKAN", - "num_resources": 0, - "num_tags": 3, - "number_of_records": "123456789", - "number_of_unique_individuals": "7654321", - "organization": null, - "personal_data": [ - "https://w3id.org/dpv/dpv-pd#Age", - "https://w3id.org/dpv/dpv-pd#Gender", - "https://w3id.org/dpv/dpv-pd#HealthRecord" - ], - "population_coverage": [ - "This example includes a very non-descript population" - ], - "private": false, - "provenance": "This example dataset is partly sourced from TEHDAS2", - "publisher_note": [ - "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation." - ], - "publisher_type": [ - "http://example.com/publisherType/undefined" - ], - "trusted_data_holder": true, - "purpose": [ - "https://w3id.org/dpv#AcademicResearch" - ], - "qualified_relation": [ - { - "uri": "", - "relation": "http://example.com/dataset/3.141592", - "role": "http://www.iana.org/assignments/relation/related" - } - ], - "state": "active", - "temporal_resolution": "P1D", - "theme": [ - "http://publications.europa.eu/resource/authority/data-theme/HEAL" - ], - "title": "HealthDCAT-AP test dataset", - "type": "dataset", - "uri": "http://example.healthdata.nl/set/dataset", - "version_notes": "Dataset continuously updated", - "contact": [ - { - "email": "covacsurv@sciensano.be", - "identifier": "", + "role": "https://inspire.ec.europa.eu/metadata-codelist/ResponsiblePartyRole/processor" + } + ], + "quality_annotation": [ + { + "motivated_by": "http://www.w3.org/ns/dqv#qualityAssessment", + "body": "https://acertificateserver.eu/mycertificate", + "target": "https://fair.healthdata.be/dataset/d43a158e-7d13-4660-bbc3-9d3f8d5501e5" + } + ], + "access_rights": "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC", + "analytics": [ + "http://example.com/analytics" + ], + "alternate_identifier": [ + "internalURI:admsIdentifier0" + ], + "applicable_legislation": [ + "http://data.europa.eu/eli/reg/2022/868/oj" + ], + "author": null, + "author_email": null, + "code_values": [ + "http://example.com/code1", + "http://example.com/code2" + ], + "coding_system": [ + "http://www.wikidata.org/entity/P1690", + "http://www.wikidata.org/entity/P4229" + ], + "conforms_to": [ + "http://www.wikidata.org/entity/Q19597236" + ], + "creator_user_id": null, + "dcat_type": "n1049372e768c4429a6b2200c22f5f1a4b7", + "documentation": [ + "n1049372e768c4429a6b2200c22f5f1a4b9" + ], + "frequency": "http://publications.europa.eu/resource/authority/frequency/DAILY", + "health_category": [ + "http://example.com/ontology/resource/authority/healthcategories/PHDR", + "http://example.com/ontology/resource/authority/healthcategories/IDHP", + "http://example.com/ontology/resource/authority/healthcategories/DIOH", + "http://example.com/ontology/resource/authority/healthcategories/EHRS" + ], + "health_theme": [ + "http://www.wikidata.org/entity/Q7907952", + "http://www.wikidata.org/entity/Q58624061" + ], + "id": "e7ccf79d-705c-427f-8e96-f87bcd6e5318", + "identifier": "http://example.com/dataset/1234567890", + "is_referenced_by": [ + "https://doi.org/10.1038/sdata.2016.18", + "https://dx.doi.org/10.1002/jmri.28679" + ], + "isopen": false, + "issued": "2024-01-01T00:00:00+00:00", + "language": [ + "http://publications.europa.eu/resource/authority/language/ENG", + "http://publications.europa.eu/resource/authority/language/NLD", + "http://publications.europa.eu/resource/authority/language/FRA" + ], + "legal_basis": [ + "https://w3id.org/dpv#Consent" + ], + "license_id": "", + "license_title": "", + "maintainer": null, + "maintainer_email": null, + "max_typical_age": "110", + "metadata_created": "2024-12-02T19:00:30.897399", + "metadata_modified": "2024-12-02T19:00:30.897406", + "min_typical_age": "0", + "modified": "2024-12-31T23:59:59+00:00", + "name": "test-dcat-1", + "notes": "This dataset is an example of using HealthDCAT-AP in CKAN", + "num_resources": 0, + "num_tags": 3, + "number_of_records": "123456789", + "number_of_unique_individuals": "7654321", + "organization": null, + "personal_data": [ + "https://w3id.org/dpv/dpv-pd#Age", + "https://w3id.org/dpv/dpv-pd#Gender", + "https://w3id.org/dpv/dpv-pd#HealthRecord" + ], + "population_coverage": [ + "This example includes a very non-descript population" + ], + "private": false, + "provenance": "This example dataset is partly sourced from TEHDAS2", + "publisher_note": [ + "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation." + ], + "publisher_type": [ + "http://example.com/publisherType/undefined" + ], + "trusted_data_holder": true, + "purpose": [ + "https://w3id.org/dpv#AcademicResearch" + ], + "qualified_relation": [ + { + "uri": "", + "relation": "http://example.com/dataset/3.141592", + "role": "http://www.iana.org/assignments/relation/related" + } + ], + "state": "active", + "temporal_resolution": "P1D", + "theme": [ + "http://publications.europa.eu/resource/authority/data-theme/HEAL" + ], + "title": "HealthDCAT-AP test dataset", + "type": "dataset", + "uri": "http://example.healthdata.nl/set/dataset", + "version_notes": "Dataset continuously updated", + "contact": [ + { + "email": "covacsurv@sciensano.be", + "identifier": "", + "name": "Contact Point" + } + ], + "creator": [ + { + "email": "info@example.com", + "identifier": "", + "name": "Contact Point", + "type": "", + "url": "https:/example.com/homepage" + } + ], + "extras": [ + { + "key": "related_resource", + "value": "[\"http://example.com/dataset/9876543210\"]" + }, + { + "key": "sample", + "value": "[\"http://example.com/sample\"]" + }, + { + "key": "spatial_uri", + "value": "http://publications.europa.eu/resource/authority/country/BEL" + } + ], + "hdab": [ + { + "email": "hdab@example.com", + "identifier": "", + "name": "EU Health Data Access Body", + "type": "", + "uri": "", + "url": "https://www.example.com/hdab" + } + ], + "provenance_activity": [ + { + "dct_type": "http://dbpedia.org/resource/Record_linkage", + "label": "http://dbpedia.org/resource/Record_linkage", + "seeAlso": "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp", + "startedAtTime": "2021-01-01T00:00:00+00:00", + "uri": "internalURI:wasGeneratedBy0", + "wasAssociatedWith": [ + { + "homepage": "https://www.sciensano.be/fr/people/joris-van-loenhout", + "email": "mailto:Joris.VanLoenhout@sciensano.be", + "name": "Dr. Joris van Loenhout", + "actedOnBehalfOf": [ + { "name": "Contact Point" - } - ], - "creator": [ - { - "email": "info@example.com", - "identifier": "", - "name": "Contact Point", - "type": "", - "url": "https:/example.com/homepage" - } - ], - "extras": [ - { - "key": "related_resource", - "value": "[\"http://example.com/dataset/9876543210\"]" - }, - { - "key": "sample", - "value": "[\"http://example.com/sample\"]" - }, - { - "key": "spatial_uri", - "value": "http://publications.europa.eu/resource/authority/country/BEL" - } - ], - "hdab": [ - { - "email": "hdab@example.com", - "identifier": "", - "name": "EU Health Data Access Body", - "type": "", - "uri": "", - "url": "https://www.example.com/hdab" - } - ], - "publisher": [ - { - "email": "info@example.com", - "identifier": "", - "name": "Contact Point", - "type": "", - "uri": "", - "url": "https://healthdata.nl" - } + } + ] + } + ] + } + ], + "publisher": [ + { + "email": "info@example.com", + "identifier": "", + "name": "Contact Point", + "type": "", + "uri": "", + "url": "https://healthdata.nl" + } + ], + "retention_period": [ + { + "end": "2034-12-31", + "start": "2020-03-01" + } + ], + "tags": [ + { + "display_name": "Test 1", + "id": "5c418ec2-cb41-4c42-9b9c-f5d1e3a831e5", + "name": "Test 1", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "Test 2", + "id": "c4117ace-2114-470d-b6e9-0df7580a12d8", + "name": "Test 2", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "Test 3", + "id": "d5a5288d-3bff-431e-be94-12c71d25d75b", + "name": "Test 3", + "state": "active", + "vocabulary_id": null + } + ], + "temporal_coverage": [ + { + "end": "2024-12-31", + "start": "2020-03-01" + } + ], + "resources": [ + { + "access_url": "", + "applicable_legislation": [], + "availability": "", + "cache_last_updated": null, + "cache_url": null, + "compress_format": "", + "conforms_to": [], + "created": "2025-07-10T18:19:32.840953", + "description": "", + "documentation": [], + "download_url": "", + "format": null, + "hash": "", + "hash_algorithm": "", + "id": "7e65aeba-136d-48d6-a824-782176c63104", + "issued_date": "", + "issued_time": "", + "language": null, + "last_modified": null, + "license": "", + "metadata_modified": "2025-07-10T18:19:32.838228", + "mimetype": "", + "mimetype_inner": null, + "modified_date": "", + "modified_time": "", + "name": "", + "package_format": "", + "package_id": "16a60ea2-965a-4b5a-9a65-1284354c256e", + "position": 0, + "resource_type": null, + "rights": "", + "spatial_resolution_in_meters": "", + "state": "active", + "status": "", + "temporal_resolution": "", + "uri": "", + "url": "", + "url_type": "", + "access_services": [ + { + "access_rights": "", + "endpoint_description": "", + "endpoint_url": [], + "serves_dataset": [], + "title": "", + "uri": "" + } ], "retention_period": [ - { - "end": "2034-12-31", - "start": "2020-03-01" - } - ], - "tags": [ - { - "display_name": "Test 1", - "id": "5c418ec2-cb41-4c42-9b9c-f5d1e3a831e5", - "name": "Test 1", - "state": "active", - "vocabulary_id": null - }, - { - "display_name": "Test 2", - "id": "c4117ace-2114-470d-b6e9-0df7580a12d8", - "name": "Test 2", - "state": "active", - "vocabulary_id": null - }, - { - "display_name": "Test 3", - "id": "d5a5288d-3bff-431e-be94-12c71d25d75b", - "name": "Test 3", - "state": "active", - "vocabulary_id": null - } - ], - "temporal_coverage": [ - { - "end": "2024-12-31", - "start": "2020-03-01" - } - ], - "resources": [], - "groups": [], - "relationships_as_subject": [], - "relationships_as_object": [] - } + { + "end": "2025-07-18", + "start": "2025-07-10" + } + ] + } + ], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } ] \ No newline at end of file diff --git a/examples/dcat/dataset.rdf b/examples/dcat/dataset.rdf index 5ce71e1c..1235888f 100644 --- a/examples/dcat/dataset.rdf +++ b/examples/dcat/dataset.rdf @@ -19,6 +19,7 @@ Zimbabwe Regional Geochemical Survey. During the period 1982-86 a team of geologists from the British Geological Survey ... http://dataset.info.org + exploration geochemistry geology diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index 982a728e..f8d72b89 100644 --- a/examples/dcat/dataset_health.ttl +++ b/examples/dcat/dataset_health.ttl @@ -94,7 +94,7 @@ adms:sample ; adms:versionNotes "Dataset continuously updated"; dcat:contactPoint ; - # dcat:distribution ; + dcat:distribution ; dcat:hasVersion ; dcat:keyword "Test 1" , "Test 2" , "Test 3"; dcat:spatialResolutionInMeters "10"^^; @@ -136,6 +136,12 @@ ]; dct:title "Technical report number of unique study subjects available by environment for project HDBP0250"; dcat:accessURL ; + + [ a dct:PeriodOfTime; + rdfs:comment "As stated in the CSI deliberation"; + dcat:endDate "2034-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; dcat:downloadURL ; dcat:mediaType . @@ -223,7 +229,7 @@ a prov:Attribution; dcat:hadRole ; prov:agent [ a foaf:Organization; - foaf:homepage ; + foaf:homepage ; foaf:mbox ; foaf:name "Contact Point" ] . @@ -242,23 +248,6 @@ a dct:MediaTypeOrExtent . -# -# a dcat:Distribution; -# dcatap:applicableLegislation ; -# dct:description "EU Health Data Access Body For better Healthcare, Research & Policy Making"; -# dct:format ; -# dct:identifier "http://ehelse.healthdataportal.eu/distribution/13a3851d-6cdf-4570-a7f0-7f03015d1925"; -# dct:isPartOf ; -# dct:issued "2024-06-03T08:51:00Z"^^; -# dct:license ; -# dct:modified "2024-06-04T18:00:00Z"^^; -# dct:rights [ a dct:RightsStatement; -# rdfs:label "Access to data is conditional on the issuance of a permit by the HDAB after submission of a data request application (English)" -# ]; -# dct:title "EU Health Data Access Body"; -# dcat:accessURL ; -# dcat:byteSize "80000"^^ . - a prov:Activity; rdfs:label "http://dbpedia.org/resource/Record_linkage"; From 501a8de6f810f3d8ca2936264e860de6fd865199 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Thu, 10 Jul 2025 21:19:31 +0200 Subject: [PATCH 16/36] Added DCAT AP 3 has version --- ckanext/dcat/profiles/euro_dcat_ap_3.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_3.py b/ckanext/dcat/profiles/euro_dcat_ap_3.py index 24d90c84..0f69b13c 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_3.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_3.py @@ -40,6 +40,11 @@ def parse_dataset(self, dataset_dict, dataset_ref): if "series_order_type" not in dataset_dict: dataset_dict["series_order_type"] = "date" + # DCAT AP v3: hasVersion + values = self._object_value_list(dataset_ref, DCAT.hasVersion) + if values: + dataset_dict["has_version"] = values + return dataset_dict def graph_from_dataset(self, dataset_dict, dataset_ref): @@ -56,6 +61,13 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # DCAT AP v3 properties also applied to higher versions self._graph_from_dataset_v3(dataset_dict, dataset_ref) + # DCAT AP v3: hasVersion + value = self._get_dict_value(dataset_dict, "has_version") + if value: + items = self._read_list_value(value) + for item in items: + self.g.add((dataset_ref, DCAT.hasVersion, URIRef(item))) + def graph_from_catalog(self, catalog_dict, catalog_ref): self._graph_from_catalog_base(catalog_dict, catalog_ref) From c4ad649d70c2a71d47e399c18dfeefe5ea310f8a Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Thu, 10 Jul 2025 22:02:36 +0200 Subject: [PATCH 17/36] Added has version to DCAT 3 and added missing dataservice fields --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 57 +++++++++++++++++-- ckanext/dcat/profiles/euro_dcat_ap_3.py | 11 ++-- .../test_euro_dcatap_2_profile_parse.py | 43 ++++++++++++++ .../test_euro_dcatap_2_profile_serialize.py | 50 +++++++++++++++- 4 files changed, 149 insertions(+), 12 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index fe7e0570..848500a2 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -205,6 +205,34 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): if values: access_service_dict[key] = values + value = self._object_value(access_service, DCT.conformsTo) + if value: + access_service_dict["conforms_to"] = value + + value = self._object_value(access_service, DCT.format) + if value: + access_service_dict["format"] = value + + value = self._object_value(access_service, DCT.identifier) + if value: + access_service_dict["identifier"] = value + + value = self._object_value(access_service, DCT.language) + if value: + access_service_dict["language"] = value + + value = self._object_value(access_service, DCT.rights) + if value: + access_service_dict["rights"] = value + + value = self._object_value(access_service, DCAT.landingPage) + if value: + access_service_dict["landing_page"] = value + + values = self._object_value_list(access_service, DCAT.keyword) + if values: + access_service_dict["keyword"] = values + # Access service URI (explicitly show the missing ones) access_service_dict["uri"] = ( str(access_service) @@ -271,7 +299,7 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): _datatype=datatype, _class=_class, ) - + # --- Provenance serialization --- activities = dataset_dict.get("provenance_activity", []) @@ -279,7 +307,7 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): activity_uri = URIRef(activity.get("uri")) if activity.get("uri") else BNode() self.g.add((dataset_ref, PROV.wasGeneratedBy, activity_uri)) self.g.add((activity_uri, RDF.type, PROV.Activity)) - + if activity.get("label"): self.g.add((activity_uri, RDFS.label, Literal(activity["label"]))) if activity.get("seeAlso"): @@ -292,7 +320,7 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): for agent_dict in activity.get("wasAssociatedWith", []): self._add_agent_to_graph(activity_uri, PROV.wasAssociatedWith, agent_dict) - # Qualified Attribution + # Qualified Attribution qualified_attributions = dataset_dict.get("qualified_attribution", []) for attr in qualified_attributions: attr_ref = BNode() @@ -308,7 +336,7 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): role = attr.get("role") if role: self.g.add((attr_ref, DCAT.hadRole, URIRef(role))) - + # Temporal @@ -477,6 +505,27 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): access_service_dict, access_service_node, items ) + # Extra simple values for access services + extra_items = [ + ("conforms_to", DCT.conformsTo, None, URIRefOrLiteral), + ("format", DCT.format, None, URIRefOrLiteral), + ("identifier", DCT.identifier, None, URIRefOrLiteral), + ("language", DCT.language, None, URIRefOrLiteral), + ("rights", DCT.rights, None, URIRefOrLiteral), + ("landing_page", DCAT.landingPage, None, URIRefOrLiteral), + ] + self._add_triples_from_dict(access_service_dict, access_service_node, extra_items) + + # Add keyword list + self._add_triple_from_dict( + access_service_dict, + access_service_node, + DCAT.keyword, + "keyword", + list_value=True, + _type=Literal + ) + # Lists items = [ ( diff --git a/ckanext/dcat/profiles/euro_dcat_ap_3.py b/ckanext/dcat/profiles/euro_dcat_ap_3.py index 0f69b13c..9f323707 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_3.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_3.py @@ -61,12 +61,11 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # DCAT AP v3 properties also applied to higher versions self._graph_from_dataset_v3(dataset_dict, dataset_ref) - # DCAT AP v3: hasVersion - value = self._get_dict_value(dataset_dict, "has_version") - if value: - items = self._read_list_value(value) - for item in items: - self.g.add((dataset_ref, DCAT.hasVersion, URIRef(item))) + # DCAT AP v3: List triples + items = [ + ("has_version", DCAT.hasVersion, None, URIRefOrLiteral), + ] + self._add_list_triples_from_dict(dataset_dict, dataset_ref, items) def graph_from_catalog(self, catalog_dict, catalog_ref): diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py index 8573cbab..49abf2f7 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py @@ -24,6 +24,49 @@ class TestEuroDCATAP2ProfileParsing(BaseParseTest): + def test_parse_access_service_extra_fields(self): + rdf_data = ''' + + + + + + + + + service-123 + + open use + + keyword1 + keyword2 + + + + + + + ''' + p = RDFParser(profiles=DCAT_AP_PROFILES) + p.parse(rdf_data) + datasets = list(p.datasets()) + assert len(datasets) == 1 + resources = datasets[0]['resources'] + assert len(resources) == 1 + access_services = json.loads(resources[0]['access_services']) + assert len(access_services) == 1 + access_service = access_services[0] + assert access_service['conforms_to'] == 'http://example.org/spec' + assert access_service['format'] == 'http://example.org/format' + assert access_service['identifier'] == 'service-123' + assert access_service['language'] == 'http://publications.europa.eu/resource/authority/language/ENG' + assert access_service['rights'] == 'open use' + assert access_service['landing_page'] == 'http://example.org/landing' + assert sorted(access_service['keyword']) == ['keyword1', 'keyword2'] def test_dataset_all_fields(self): diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py index 8f9ebd87..139d7bef 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py @@ -418,7 +418,14 @@ def test_distribution_fields(self): 'access_rights': 'http://publications.europa.eu/resource/authority/access-right/PUBLIC', 'description': 'This SPARQL end point allow to directly query the EU Whoiswho content 1', 'endpoint_url': ['http://publications.europa.eu/webapi/rdf/sparql'], - 'serves_dataset': ['http://data.europa.eu/88u/dataset/eu-whoiswho-the-official-directory-of-the-european-union'] + 'serves_dataset': ['http://data.europa.eu/88u/dataset/eu-whoiswho-the-official-directory-of-the-european-union'], + 'conforms_to': 'http://example.org/spec', + 'format': 'http://example.org/format', + 'identifier': 'service-123', + 'language': 'http://publications.europa.eu/resource/authority/language/ENG', + 'rights': 'open use', + 'landing_page': 'http://example.org/landing', + 'keyword': ['keyword1', 'keyword2'] }, { 'availability': 'http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL', @@ -428,7 +435,14 @@ def test_distribution_fields(self): 'access_rights': 'http://publications.europa.eu/resource/authority/access-right/OP_DATPRO', 'description': 'This SPARQL end point allow to directly query the EU Whoiswho content 2', 'endpoint_url': ['http://publications.europa.eu/webapi/rdf/sparql'], - 'serves_dataset': ['http://data.europa.eu/88u/dataset/eu-whoiswho-the-official-directory-of-the-european-union'] + 'serves_dataset': ['http://data.europa.eu/88u/dataset/eu-whoiswho-the-official-directory-of-the-european-union'], + 'conforms_to': 'http://example.org/spec', + 'format': 'http://example.org/format', + 'identifier': 'service-123', + 'language': 'http://publications.europa.eu/resource/authority/language/ENG', + 'rights': 'open use', + 'landing_page': 'http://example.org/landing', + 'keyword': ['keyword1', 'keyword2'] } ]) } @@ -483,6 +497,38 @@ def test_distribution_fields(self): self._assert_simple_value(g, object[2], DCAT.endpointDescription, Literal(access_service.get('endpoint_description'))) + + self._assert_simple_value( + g, object[2], DCT.conformsTo, + URIRef(access_service.get('conforms_to')) if access_service.get('conforms_to') else None + ) + self._assert_simple_value( + g, object[2], DCT.format, + URIRef(access_service.get('format')) if access_service.get('format') else None + ) + self._assert_simple_value( + g, object[2], DCT.identifier, + Literal(access_service.get('identifier')) if access_service.get('identifier') else None + ) + self._assert_simple_value( + g, object[2], DCT.language, + URIRef(access_service.get('language')) if access_service.get('language') else None + ) + self._assert_simple_value( + g, object[2], DCT.rights, + Literal(access_service.get('rights')) if access_service.get('rights') else None + ) + self._assert_simple_value( + g, object[2], DCAT.landingPage, + URIRef(access_service.get('landing_page')) if access_service.get('landing_page') else None + ) + + if access_service.get('keyword'): + self._assert_values_list( + g, object[2], DCAT.keyword, + self._get_typed_list(access_service.get('keyword'), Literal) + ) + # Lists self._assert_values_list(g, object[2], DCAT.endpointURL, self._get_typed_list(access_service.get('endpoint_url'), URIRef)) From 163d28407f82a4ba45c259ff98d9f4b087c2ae5a Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Mon, 14 Jul 2025 10:50:46 +0200 Subject: [PATCH 18/36] fix import --- ckanext/dcat/profiles/euro_dcat_ap_3.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_3.py b/ckanext/dcat/profiles/euro_dcat_ap_3.py index 9f323707..64220430 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_3.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_3.py @@ -8,6 +8,7 @@ RDF, ) +from .base import URIRefOrLiteral from ckanext.dcat.utils import dataset_uri from .euro_dcat_ap_2 import EuropeanDCATAP2Profile from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile From d8c04dfde2cfe7eff0b2221dbb7a56f7ad87cc73 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Mon, 14 Jul 2025 13:32:03 +0200 Subject: [PATCH 19/36] fix unit tests --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 4 ++-- .../dcat_ap_2/test_euro_dcatap_2_profile_serialize.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index 848500a2..bbcd32db 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -209,7 +209,7 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): if value: access_service_dict["conforms_to"] = value - value = self._object_value(access_service, DCT.format) + value = self._object_value(access_service, DCT["format"]) if value: access_service_dict["format"] = value @@ -508,7 +508,7 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): # Extra simple values for access services extra_items = [ ("conforms_to", DCT.conformsTo, None, URIRefOrLiteral), - ("format", DCT.format, None, URIRefOrLiteral), + ("format", DCT["format"], None, URIRefOrLiteral), ("identifier", DCT.identifier, None, URIRefOrLiteral), ("language", DCT.language, None, URIRefOrLiteral), ("rights", DCT.rights, None, URIRefOrLiteral), diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py index 139d7bef..4235bf1f 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py @@ -503,7 +503,7 @@ def test_distribution_fields(self): URIRef(access_service.get('conforms_to')) if access_service.get('conforms_to') else None ) self._assert_simple_value( - g, object[2], DCT.format, + g, object[2], DCT["format"], URIRef(access_service.get('format')) if access_service.get('format') else None ) self._assert_simple_value( From 63a2749a0f637be17a9eeb249aa359fc220cb614 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Thu, 10 Jul 2025 22:02:36 +0200 Subject: [PATCH 20/36] Added has version to DCAT 3 and added missing dataservice fields --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 57 +++++++++++++++++-- ckanext/dcat/profiles/euro_dcat_ap_3.py | 12 ++-- .../test_euro_dcatap_2_profile_parse.py | 43 ++++++++++++++ .../test_euro_dcatap_2_profile_serialize.py | 50 +++++++++++++++- 4 files changed, 150 insertions(+), 12 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index fe7e0570..bbcd32db 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -205,6 +205,34 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): if values: access_service_dict[key] = values + value = self._object_value(access_service, DCT.conformsTo) + if value: + access_service_dict["conforms_to"] = value + + value = self._object_value(access_service, DCT["format"]) + if value: + access_service_dict["format"] = value + + value = self._object_value(access_service, DCT.identifier) + if value: + access_service_dict["identifier"] = value + + value = self._object_value(access_service, DCT.language) + if value: + access_service_dict["language"] = value + + value = self._object_value(access_service, DCT.rights) + if value: + access_service_dict["rights"] = value + + value = self._object_value(access_service, DCAT.landingPage) + if value: + access_service_dict["landing_page"] = value + + values = self._object_value_list(access_service, DCAT.keyword) + if values: + access_service_dict["keyword"] = values + # Access service URI (explicitly show the missing ones) access_service_dict["uri"] = ( str(access_service) @@ -271,7 +299,7 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): _datatype=datatype, _class=_class, ) - + # --- Provenance serialization --- activities = dataset_dict.get("provenance_activity", []) @@ -279,7 +307,7 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): activity_uri = URIRef(activity.get("uri")) if activity.get("uri") else BNode() self.g.add((dataset_ref, PROV.wasGeneratedBy, activity_uri)) self.g.add((activity_uri, RDF.type, PROV.Activity)) - + if activity.get("label"): self.g.add((activity_uri, RDFS.label, Literal(activity["label"]))) if activity.get("seeAlso"): @@ -292,7 +320,7 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): for agent_dict in activity.get("wasAssociatedWith", []): self._add_agent_to_graph(activity_uri, PROV.wasAssociatedWith, agent_dict) - # Qualified Attribution + # Qualified Attribution qualified_attributions = dataset_dict.get("qualified_attribution", []) for attr in qualified_attributions: attr_ref = BNode() @@ -308,7 +336,7 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): role = attr.get("role") if role: self.g.add((attr_ref, DCAT.hadRole, URIRef(role))) - + # Temporal @@ -477,6 +505,27 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): access_service_dict, access_service_node, items ) + # Extra simple values for access services + extra_items = [ + ("conforms_to", DCT.conformsTo, None, URIRefOrLiteral), + ("format", DCT["format"], None, URIRefOrLiteral), + ("identifier", DCT.identifier, None, URIRefOrLiteral), + ("language", DCT.language, None, URIRefOrLiteral), + ("rights", DCT.rights, None, URIRefOrLiteral), + ("landing_page", DCAT.landingPage, None, URIRefOrLiteral), + ] + self._add_triples_from_dict(access_service_dict, access_service_node, extra_items) + + # Add keyword list + self._add_triple_from_dict( + access_service_dict, + access_service_node, + DCAT.keyword, + "keyword", + list_value=True, + _type=Literal + ) + # Lists items = [ ( diff --git a/ckanext/dcat/profiles/euro_dcat_ap_3.py b/ckanext/dcat/profiles/euro_dcat_ap_3.py index 0f69b13c..64220430 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_3.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_3.py @@ -8,6 +8,7 @@ RDF, ) +from .base import URIRefOrLiteral from ckanext.dcat.utils import dataset_uri from .euro_dcat_ap_2 import EuropeanDCATAP2Profile from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile @@ -61,12 +62,11 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # DCAT AP v3 properties also applied to higher versions self._graph_from_dataset_v3(dataset_dict, dataset_ref) - # DCAT AP v3: hasVersion - value = self._get_dict_value(dataset_dict, "has_version") - if value: - items = self._read_list_value(value) - for item in items: - self.g.add((dataset_ref, DCAT.hasVersion, URIRef(item))) + # DCAT AP v3: List triples + items = [ + ("has_version", DCAT.hasVersion, None, URIRefOrLiteral), + ] + self._add_list_triples_from_dict(dataset_dict, dataset_ref, items) def graph_from_catalog(self, catalog_dict, catalog_ref): diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py index 8573cbab..49abf2f7 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py @@ -24,6 +24,49 @@ class TestEuroDCATAP2ProfileParsing(BaseParseTest): + def test_parse_access_service_extra_fields(self): + rdf_data = ''' + + + + + + + + + service-123 + + open use + + keyword1 + keyword2 + + + + + + + ''' + p = RDFParser(profiles=DCAT_AP_PROFILES) + p.parse(rdf_data) + datasets = list(p.datasets()) + assert len(datasets) == 1 + resources = datasets[0]['resources'] + assert len(resources) == 1 + access_services = json.loads(resources[0]['access_services']) + assert len(access_services) == 1 + access_service = access_services[0] + assert access_service['conforms_to'] == 'http://example.org/spec' + assert access_service['format'] == 'http://example.org/format' + assert access_service['identifier'] == 'service-123' + assert access_service['language'] == 'http://publications.europa.eu/resource/authority/language/ENG' + assert access_service['rights'] == 'open use' + assert access_service['landing_page'] == 'http://example.org/landing' + assert sorted(access_service['keyword']) == ['keyword1', 'keyword2'] def test_dataset_all_fields(self): diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py index 8f9ebd87..4235bf1f 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py @@ -418,7 +418,14 @@ def test_distribution_fields(self): 'access_rights': 'http://publications.europa.eu/resource/authority/access-right/PUBLIC', 'description': 'This SPARQL end point allow to directly query the EU Whoiswho content 1', 'endpoint_url': ['http://publications.europa.eu/webapi/rdf/sparql'], - 'serves_dataset': ['http://data.europa.eu/88u/dataset/eu-whoiswho-the-official-directory-of-the-european-union'] + 'serves_dataset': ['http://data.europa.eu/88u/dataset/eu-whoiswho-the-official-directory-of-the-european-union'], + 'conforms_to': 'http://example.org/spec', + 'format': 'http://example.org/format', + 'identifier': 'service-123', + 'language': 'http://publications.europa.eu/resource/authority/language/ENG', + 'rights': 'open use', + 'landing_page': 'http://example.org/landing', + 'keyword': ['keyword1', 'keyword2'] }, { 'availability': 'http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL', @@ -428,7 +435,14 @@ def test_distribution_fields(self): 'access_rights': 'http://publications.europa.eu/resource/authority/access-right/OP_DATPRO', 'description': 'This SPARQL end point allow to directly query the EU Whoiswho content 2', 'endpoint_url': ['http://publications.europa.eu/webapi/rdf/sparql'], - 'serves_dataset': ['http://data.europa.eu/88u/dataset/eu-whoiswho-the-official-directory-of-the-european-union'] + 'serves_dataset': ['http://data.europa.eu/88u/dataset/eu-whoiswho-the-official-directory-of-the-european-union'], + 'conforms_to': 'http://example.org/spec', + 'format': 'http://example.org/format', + 'identifier': 'service-123', + 'language': 'http://publications.europa.eu/resource/authority/language/ENG', + 'rights': 'open use', + 'landing_page': 'http://example.org/landing', + 'keyword': ['keyword1', 'keyword2'] } ]) } @@ -483,6 +497,38 @@ def test_distribution_fields(self): self._assert_simple_value(g, object[2], DCAT.endpointDescription, Literal(access_service.get('endpoint_description'))) + + self._assert_simple_value( + g, object[2], DCT.conformsTo, + URIRef(access_service.get('conforms_to')) if access_service.get('conforms_to') else None + ) + self._assert_simple_value( + g, object[2], DCT["format"], + URIRef(access_service.get('format')) if access_service.get('format') else None + ) + self._assert_simple_value( + g, object[2], DCT.identifier, + Literal(access_service.get('identifier')) if access_service.get('identifier') else None + ) + self._assert_simple_value( + g, object[2], DCT.language, + URIRef(access_service.get('language')) if access_service.get('language') else None + ) + self._assert_simple_value( + g, object[2], DCT.rights, + Literal(access_service.get('rights')) if access_service.get('rights') else None + ) + self._assert_simple_value( + g, object[2], DCAT.landingPage, + URIRef(access_service.get('landing_page')) if access_service.get('landing_page') else None + ) + + if access_service.get('keyword'): + self._assert_values_list( + g, object[2], DCAT.keyword, + self._get_typed_list(access_service.get('keyword'), Literal) + ) + # Lists self._assert_values_list(g, object[2], DCAT.endpointURL, self._get_typed_list(access_service.get('endpoint_url'), URIRef)) From fdbd8bc115ab42b039b95e5a4c65d999679df514 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Mon, 14 Jul 2025 13:59:59 +0200 Subject: [PATCH 21/36] update schema --- ckanext/dcat/schemas/dcat_ap_full.yaml | 27 ++++++++++++++++++++++++ ckanext/dcat/schemas/health_dcat_ap.yaml | 26 +++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml index 8cebe622..95fdab3e 100644 --- a/ckanext/dcat/schemas/dcat_ap_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_full.yaml @@ -540,6 +540,33 @@ resource_fields: validators: ignore_missing unicode_safe help_text: Information regarding access or restrictions based on privacy, security, or other policies. + - field_name: conforms_to + label: Conforms to + + - field_name: format + label: Format + + - field_name: identifier + label: Identifier + + - field_name: language + label: Language + + - field_name: rights + label: Rights + form_snippet: markdown.html + display_snippet: markdown.html + + - field_name: landing_page + label: Landing page + + - field_name: keyword + label: Keywords + preset: tag_string_autocomplete + form_placeholder: eg. economy, mental health, government + help_text: Keywords or tags describing the dataservice. Use commas to separate multiple values. + + help_text: A data service that gives access to the resource. # Note: if not provided, this will be autogenerated diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index 142cde4c..9f6ed85d 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -709,6 +709,32 @@ resource_fields: validators: ignore_missing unicode_safe help_text: Information regarding access or restrictions based on privacy, security, or other policies. + - field_name: conforms_to + label: Conforms to + + - field_name: format + label: Format + + - field_name: identifier + label: Identifier + + - field_name: language + label: Language + + - field_name: rights + label: Rights + form_snippet: markdown.html + display_snippet: markdown.html + + - field_name: landing_page + label: Landing page + + - field_name: keyword + label: Keywords + preset: tag_string_autocomplete + form_placeholder: eg. economy, mental health, government + help_text: Keywords or tags describing the dataservice. Use commas to separate multiple values. + help_text: A data service that gives access to the resource. # Note: if not provided, this will be autogenerated From 1985f6e76cf11721a5d1dc75ba7af419f92520c2 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Mon, 14 Jul 2025 14:24:41 +0200 Subject: [PATCH 22/36] fiix mapping documentation --- docs/mapping-healthdcat.md | 1 + docs/mapping.md | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/mapping-healthdcat.md b/docs/mapping-healthdcat.md index 0b919fc2..5a79597e 100644 --- a/docs/mapping-healthdcat.md +++ b/docs/mapping-healthdcat.md @@ -5,6 +5,7 @@ This section defines how CKAN fields map to the [HealthDCAT-AP](http://healthdat | DCAT Class | RDF Property | CKAN Dataset Field | Stored as | Notes | |----------------|----------------------------------------|-------------------------------------|-----------|-------| | dcat:Dataset | healthdcatap:analytics | analytics | list | Publishers are encouraged to provide URLs pointing to document repositories where users can access or request associated resources such as technical reports of the dataset, quality measurements, usability indicators,... Note that HealthDCAT-AP mentions also API endpoints or analytics services, but these would not be Distriutions but rather DatasetServices. | +| dcat:Dataset | healthdcatap:qualityAnnotation | quality_annotation | list | This field allows annotations or notes about the quality of the dataset, such as data completeness, known issues, or validation methods. | | dcat:Dataset | healthdcatap:hasCodeValues | code_values | list | Inside this property, you can provide the coding system of the dataset in the form of wikidata URI (example: https://www.wikidata.org/entity/P494 for ICD-10 ID) and the URI of the value that describes the dataset (example: https://icd.who.int/browse10/2019/en#/Y59.0 for viral vaccines) | | dcat:Dataset | healthdcatap:hasCodingSystem | coding_system | list | This property provides informatio on which coding systems are in use inside your dataset. For this, wikidata URIs must be used.| | dcat:Dataset | healthdcatap:healthCategory | health_category | list | Health-specific category values. | diff --git a/docs/mapping.md b/docs/mapping.md index fa05ade2..59bd7589 100644 --- a/docs/mapping.md +++ b/docs/mapping.md @@ -42,7 +42,9 @@ some cases the way metadata is stored internally and presented at the CKAN API l | dcat:Dataset | dcat-us:purpose | custom:purpose | | text | DCAT-US v3 and higher only | dcat:Dataset | skos:scopeNote | custom:usage | | text | DCAT-US v3 and higher only | dcat:Dataset | dct:type | custom:dcat_type | | text | | -| dcat:Dataset | dct:hasVersion | custom:has_version | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | prov:wasGeneratedBy | custom:provenance_activity | | text | | +| dcat:Dataset | prov:qualifiedAttribution | custom:qualified_attribution | | list | See [Lists](#lists). Object should contain agent and role | +| dcat:Dataset | dcat:hasVersion | custom:has_version | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | | dcat:Dataset | dct:isVersionOf | custom:is_version_of | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | | dcat:Dataset | dct:source | custom:source | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | | dcat:Dataset | adms:sample | custom:sample | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to dcat:Distribution instances | @@ -94,6 +96,16 @@ some cases the way metadata is stored internally and presented at the CKAN API l | dcat:Distribution | dct:identifier | custom:identifier | custom:guid, id | text | DCAT-US v3 and higher only | dcat:Distribution | dcat-us:describedBy | custom:data_dictionary | | list of objects | DCAT-US v3 and higher only | dcat:Distribution | dcat:accessService | resource:access_services | | list of objects | | + +| dcat:Catalog | foaf:homepage | custom:catalog_homepage | | text | | + +| dcat:DataService | dct:conformsTo | access_service:conforms_to | | list | See [Lists](#lists) | +| dcat:DataService | dct:format | access_service:format | | text | | +| dcat:DataService | dct:identifier | access_service:identifier | | text | | +| dcat:DataService | dct:language | access_service:language | | list | See [Lists](#lists) | +| dcat:DataService | dct:rights | access_service:rights | | text | | +| dcat:DataService | dcat:landingPage | access_service:landing_page | | text | | +| dcat:DataService | dcat:keyword | access_service:keyword | | list | See [Lists](#lists) | | dcat:DataService | dct:title | access_service:title | | text | | | dcat:DataService | dcat:endpointURL | access_service:endpoint_url | | list | | | dcat:DataService | dcat:endpointDescription | access_service:endpoint_description | | text | | From 3cc8905a716906c4b5903d154a39100c3b4acfe3 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Mon, 14 Jul 2025 14:31:13 +0200 Subject: [PATCH 23/36] Updated documetation for retention period --- docs/mapping-healthdcat.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/mapping-healthdcat.md b/docs/mapping-healthdcat.md index 5a79597e..8a9c09fe 100644 --- a/docs/mapping-healthdcat.md +++ b/docs/mapping-healthdcat.md @@ -26,6 +26,7 @@ Example value could be: dpv:ResearchAndDevelopment. | | dcat:Dataset | healthdcatap:numberOfUniqueIndividuals | number_of_unique_individuals | integer | This property is not mandatory, since not all datasets might include data from individuals. | | dcat:Dataset | healthdcatap:hdab | hdab | agent | Health Data Access Body responsible. | | dcat:Dataset | healthdcatap:retentionPeriod | retention_period | interval | This property makes use of the class dct:PeriodOfTime, in which a start and end date should be provided. | +| dcat:Distribution | healthdcatap:retentionPeriod | resources_retention_period | interval | This property makes use of the class dct:PeriodOfTime, in which a start and end date should be provided. | ### Notes From c03bdd2d3cbd565b57bfa22b28767cb3bf80a2c2 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Tue, 15 Jul 2025 17:03:42 +0200 Subject: [PATCH 24/36] fix(dataseries) cardanality for dataseries --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 44 +++++++++++-------- ckanext/dcat/schemas/dcat_ap_full.yaml | 10 +++++ ckanext/dcat/schemas/health_dcat_ap.yaml | 13 +++++- .../test_euro_dcatap_2_profile_parse.py | 8 ++-- .../test_euro_dcatap_2_profile_serialize.py | 26 +++++------ 5 files changed, 64 insertions(+), 37 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index bbcd32db..fc15e216 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -205,29 +205,29 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): if values: access_service_dict[key] = values - value = self._object_value(access_service, DCT.conformsTo) - if value: - access_service_dict["conforms_to"] = value + values = self._object_value_list(access_service, DCT.conformsTo) + if values: + access_service_dict["conforms_to"] = values - value = self._object_value(access_service, DCT["format"]) - if value: - access_service_dict["format"] = value + values = self._object_value_list(access_service, DCT["format"]) + if values: + access_service_dict["format"] = values value = self._object_value(access_service, DCT.identifier) if value: access_service_dict["identifier"] = value - value = self._object_value(access_service, DCT.language) - if value: - access_service_dict["language"] = value + values = self._object_value_list(access_service, DCT.language) + if values: + access_service_dict["language"] = values - value = self._object_value(access_service, DCT.rights) - if value: - access_service_dict["rights"] = value + values = self._object_value_list(access_service, DCT.rights) + if values: + access_service_dict["rights"] = values - value = self._object_value(access_service, DCAT.landingPage) - if value: - access_service_dict["landing_page"] = value + values = self._object_value_list(access_service, DCAT.landingPage) + if values: + access_service_dict["landing_page"] = values values = self._object_value_list(access_service, DCAT.keyword) if values: @@ -505,16 +505,24 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): access_service_dict, access_service_node, items ) - # Extra simple values for access services + # Extra list values for access services extra_items = [ ("conforms_to", DCT.conformsTo, None, URIRefOrLiteral), ("format", DCT["format"], None, URIRefOrLiteral), - ("identifier", DCT.identifier, None, URIRefOrLiteral), ("language", DCT.language, None, URIRefOrLiteral), ("rights", DCT.rights, None, URIRefOrLiteral), ("landing_page", DCAT.landingPage, None, URIRefOrLiteral), ] - self._add_triples_from_dict(access_service_dict, access_service_node, extra_items) + self._add_list_triples_from_dict(access_service_dict, access_service_node, extra_items) + + # Add single-value triple for identifier + self._add_triple_from_dict( + access_service_dict, + access_service_node, + DCT.identifier, + "identifier", + _type=URIRefOrLiteral + ) # Add keyword list self._add_triple_from_dict( diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml index 95fdab3e..0a2029e3 100644 --- a/ckanext/dcat/schemas/dcat_ap_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_full.yaml @@ -542,23 +542,33 @@ resource_fields: - field_name: conforms_to label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text - field_name: format label: Format + preset: multiple_text + validators: ignore_missing scheming_multiple_text - field_name: identifier label: Identifier - field_name: language label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text - field_name: rights label: Rights form_snippet: markdown.html display_snippet: markdown.html + preset: multiple_text + validators: ignore_missing scheming_multiple_text - field_name: landing_page label: Landing page + preset: multiple_text + validators: ignore_missing scheming_multiple_text - field_name: keyword label: Keywords diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index 9f6ed85d..1b05ba96 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -685,7 +685,6 @@ resource_fields: label: Access services repeating_label: Access service repeating_subfields: - - field_name: uri label: URI @@ -703,7 +702,7 @@ resource_fields: label: Serves dataset preset: multiple_text validators: ignore_missing scheming_multiple_text - + - field_name: access_rights label: Access rights validators: ignore_missing unicode_safe @@ -711,23 +710,33 @@ resource_fields: - field_name: conforms_to label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text - field_name: format label: Format + preset: multiple_text + validators: ignore_missing scheming_multiple_text - field_name: identifier label: Identifier - field_name: language label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text - field_name: rights label: Rights form_snippet: markdown.html display_snippet: markdown.html + preset: multiple_text + validators: ignore_missing scheming_multiple_text - field_name: landing_page label: Landing page + preset: multiple_text + validators: ignore_missing scheming_multiple_text - field_name: keyword label: Keywords diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py index 49abf2f7..b942555d 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py @@ -61,11 +61,11 @@ def test_parse_access_service_extra_fields(self): assert len(access_services) == 1 access_service = access_services[0] assert access_service['conforms_to'] == 'http://example.org/spec' - assert access_service['format'] == 'http://example.org/format' + assert access_service['format'] == ['http://example.org/format'] assert access_service['identifier'] == 'service-123' - assert access_service['language'] == 'http://publications.europa.eu/resource/authority/language/ENG' - assert access_service['rights'] == 'open use' - assert access_service['landing_page'] == 'http://example.org/landing' + assert access_service['language'] == ['http://publications.europa.eu/resource/authority/language/ENG'] + assert access_service['rights'] == ['open use'] + assert access_service['landing_page'] == ['http://example.org/landing'] assert sorted(access_service['keyword']) == ['keyword1', 'keyword2'] def test_dataset_all_fields(self): diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py index 4235bf1f..01e8fc60 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py @@ -497,30 +497,30 @@ def test_distribution_fields(self): self._assert_simple_value(g, object[2], DCAT.endpointDescription, Literal(access_service.get('endpoint_description'))) - self._assert_simple_value( + g, object[2], DCT.identifier, + Literal(access_service.get('identifier')) if access_service.get('identifier') else None + ) + self._assert_values_list( g, object[2], DCT.conformsTo, URIRef(access_service.get('conforms_to')) if access_service.get('conforms_to') else None ) - self._assert_simple_value( + self._assert_values_list( g, object[2], DCT["format"], - URIRef(access_service.get('format')) if access_service.get('format') else None - ) - self._assert_simple_value( - g, object[2], DCT.identifier, - Literal(access_service.get('identifier')) if access_service.get('identifier') else None + self._get_typed_list([access_service.get('format')], URIRef) if access_service.get('format') else [] ) - self._assert_simple_value( + self._assert_values_list( g, object[2], DCT.language, - URIRef(access_service.get('language')) if access_service.get('language') else None + self._get_typed_list([access_service.get('language')], URIRef) if access_service.get('language') else [] ) - self._assert_simple_value( + self._assert_values_list( g, object[2], DCT.rights, - Literal(access_service.get('rights')) if access_service.get('rights') else None + self._get_typed_list([access_service.get('rights')], Literal) if access_service.get('rights') else [] ) - self._assert_simple_value( + self._assert_values_list( g, object[2], DCAT.landingPage, - URIRef(access_service.get('landing_page')) if access_service.get('landing_page') else None + self._get_typed_list([access_service.get('landing_page')], URIRef) if access_service.get( + 'landing_page') else [] ) if access_service.get('keyword'): From cccd727884f458956c4315095196b96f99e1f1a5 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 16 Jul 2025 09:14:46 +0200 Subject: [PATCH 25/36] fix(UT-cardanality) fix UT for cardanality --- .../test_euro_dcatap_2_profile_parse.py | 2 +- .../test_euro_dcatap_2_profile_serialize.py | 31 ++++++++++--------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py index b942555d..6c903e1b 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py @@ -60,7 +60,7 @@ def test_parse_access_service_extra_fields(self): access_services = json.loads(resources[0]['access_services']) assert len(access_services) == 1 access_service = access_services[0] - assert access_service['conforms_to'] == 'http://example.org/spec' + assert access_service['conforms_to'] == ['http://example.org/spec'] assert access_service['format'] == ['http://example.org/format'] assert access_service['identifier'] == 'service-123' assert access_service['language'] == ['http://publications.europa.eu/resource/authority/language/ENG'] diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py index 01e8fc60..e4fc5f04 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py @@ -419,12 +419,12 @@ def test_distribution_fields(self): 'description': 'This SPARQL end point allow to directly query the EU Whoiswho content 1', 'endpoint_url': ['http://publications.europa.eu/webapi/rdf/sparql'], 'serves_dataset': ['http://data.europa.eu/88u/dataset/eu-whoiswho-the-official-directory-of-the-european-union'], - 'conforms_to': 'http://example.org/spec', - 'format': 'http://example.org/format', + 'conforms_to': ['http://example.org/spec'], + 'format': ['http://example.org/format'], 'identifier': 'service-123', - 'language': 'http://publications.europa.eu/resource/authority/language/ENG', - 'rights': 'open use', - 'landing_page': 'http://example.org/landing', + 'language': ['http://publications.europa.eu/resource/authority/language/ENG'], + 'rights': ['open use'], + 'landing_page': ['http://example.org/landing'], 'keyword': ['keyword1', 'keyword2'] }, { @@ -436,12 +436,12 @@ def test_distribution_fields(self): 'description': 'This SPARQL end point allow to directly query the EU Whoiswho content 2', 'endpoint_url': ['http://publications.europa.eu/webapi/rdf/sparql'], 'serves_dataset': ['http://data.europa.eu/88u/dataset/eu-whoiswho-the-official-directory-of-the-european-union'], - 'conforms_to': 'http://example.org/spec', - 'format': 'http://example.org/format', + 'conforms_to': ['http://example.org/spec'], + 'format': ['http://example.org/format'], 'identifier': 'service-123', - 'language': 'http://publications.europa.eu/resource/authority/language/ENG', - 'rights': 'open use', - 'landing_page': 'http://example.org/landing', + 'language': ['http://publications.europa.eu/resource/authority/language/ENG'], + 'rights': ['open use'], + 'landing_page': ['http://example.org/landing'], 'keyword': ['keyword1', 'keyword2'] } ]) @@ -503,23 +503,24 @@ def test_distribution_fields(self): ) self._assert_values_list( g, object[2], DCT.conformsTo, - URIRef(access_service.get('conforms_to')) if access_service.get('conforms_to') else None + self._get_typed_list(access_service.get('conforms_to'), URIRef) if access_service.get( + 'conforms_to') else [] ) self._assert_values_list( g, object[2], DCT["format"], - self._get_typed_list([access_service.get('format')], URIRef) if access_service.get('format') else [] + self._get_typed_list(access_service.get('format'), URIRef) if access_service.get('format') else [] ) self._assert_values_list( g, object[2], DCT.language, - self._get_typed_list([access_service.get('language')], URIRef) if access_service.get('language') else [] + self._get_typed_list(access_service.get('language'), URIRef) if access_service.get('language') else [] ) self._assert_values_list( g, object[2], DCT.rights, - self._get_typed_list([access_service.get('rights')], Literal) if access_service.get('rights') else [] + self._get_typed_list(access_service.get('rights'), Literal) if access_service.get('rights') else [] ) self._assert_values_list( g, object[2], DCAT.landingPage, - self._get_typed_list([access_service.get('landing_page')], URIRef) if access_service.get( + self._get_typed_list(access_service.get('landing_page'), URIRef) if access_service.get( 'landing_page') else [] ) From 6d230621374a6e7639d391d3d0e1cbc457528965 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 16 Jul 2025 11:15:40 +0200 Subject: [PATCH 26/36] add applicable_legislation to Dataservice --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 42 ++++++------------- ckanext/dcat/schemas/dcat_ap_full.yaml | 7 +++- ckanext/dcat/schemas/health_dcat_ap.yaml | 6 +++ .../test_euro_dcatap_2_profile_parse.py | 3 ++ .../test_euro_dcatap_2_profile_serialize.py | 6 +++ 5 files changed, 33 insertions(+), 31 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index fc15e216..d38a5ea4 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -184,7 +184,7 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): ): access_service_dict = {} - # Simple values + # Simple values for key, predicate in ( ("availability", DCATAP.availability), ("title", DCT.title), @@ -192,47 +192,28 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): ("license", DCT.license), ("access_rights", DCT.accessRights), ("description", DCT.description), + ("identifier", DCT.identifier), ): value = self._object_value(access_service, predicate) if value: access_service_dict[key] = value - # List + + # List values for key, predicate in ( ("endpoint_url", DCAT.endpointURL), ("serves_dataset", DCAT.servesDataset), + ("conforms_to", DCT.conformsTo), + ("format", DCT["format"]), + ("language", DCT.language), + ("rights", DCT.rights), + ("landing_page", DCAT.landingPage), + ("keyword", DCAT.keyword), + ("applicable_legislation", DCATAP.applicableLegislation), ): values = self._object_value_list(access_service, predicate) if values: access_service_dict[key] = values - values = self._object_value_list(access_service, DCT.conformsTo) - if values: - access_service_dict["conforms_to"] = values - - values = self._object_value_list(access_service, DCT["format"]) - if values: - access_service_dict["format"] = values - - value = self._object_value(access_service, DCT.identifier) - if value: - access_service_dict["identifier"] = value - - values = self._object_value_list(access_service, DCT.language) - if values: - access_service_dict["language"] = values - - values = self._object_value_list(access_service, DCT.rights) - if values: - access_service_dict["rights"] = values - - values = self._object_value_list(access_service, DCAT.landingPage) - if values: - access_service_dict["landing_page"] = values - - values = self._object_value_list(access_service, DCAT.keyword) - if values: - access_service_dict["keyword"] = values - # Access service URI (explicitly show the missing ones) access_service_dict["uri"] = ( str(access_service) @@ -512,6 +493,7 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): ("language", DCT.language, None, URIRefOrLiteral), ("rights", DCT.rights, None, URIRefOrLiteral), ("landing_page", DCAT.landingPage, None, URIRefOrLiteral), + ("applicable_legislation", DCATAP.applicableLegislation, None, URIRefOrLiteral, ELI.LegalResource), ] self._add_list_triples_from_dict(access_service_dict, access_service_node, extra_items) diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml index 0a2029e3..48328a96 100644 --- a/ckanext/dcat/schemas/dcat_ap_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_full.yaml @@ -575,7 +575,12 @@ resource_fields: preset: tag_string_autocomplete form_placeholder: eg. economy, mental health, government help_text: Keywords or tags describing the dataservice. Use commas to separate multiple values. - + + - field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the dataset. help_text: A data service that gives access to the resource. diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index 1b05ba96..8c7b9237 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -744,6 +744,12 @@ resource_fields: form_placeholder: eg. economy, mental health, government help_text: Keywords or tags describing the dataservice. Use commas to separate multiple values. + - field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the dataset. + help_text: A data service that gives access to the resource. # Note: if not provided, this will be autogenerated diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py index 6c903e1b..1f5f7b4f 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py @@ -44,6 +44,7 @@ def test_parse_access_service_extra_fields(self): keyword1 keyword2 + @@ -66,6 +67,7 @@ def test_parse_access_service_extra_fields(self): assert access_service['language'] == ['http://publications.europa.eu/resource/authority/language/ENG'] assert access_service['rights'] == ['open use'] assert access_service['landing_page'] == ['http://example.org/landing'] + assert access_service['applicable_legislation'] == ['http://data.europa.eu/eli/reg_impl/2023/138/oj'] assert sorted(access_service['keyword']) == ['keyword1', 'keyword2'] def test_dataset_all_fields(self): @@ -125,6 +127,7 @@ def test_dataset_all_fields(self): SPARQL url description + diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py index e4fc5f04..65400abd 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py @@ -420,6 +420,7 @@ def test_distribution_fields(self): 'endpoint_url': ['http://publications.europa.eu/webapi/rdf/sparql'], 'serves_dataset': ['http://data.europa.eu/88u/dataset/eu-whoiswho-the-official-directory-of-the-european-union'], 'conforms_to': ['http://example.org/spec'], + 'applicable_legislation': ['http://data.europa.eu/eli/reg_impl/2023/138/oj'], 'format': ['http://example.org/format'], 'identifier': 'service-123', 'language': ['http://publications.europa.eu/resource/authority/language/ENG'], @@ -437,6 +438,7 @@ def test_distribution_fields(self): 'endpoint_url': ['http://publications.europa.eu/webapi/rdf/sparql'], 'serves_dataset': ['http://data.europa.eu/88u/dataset/eu-whoiswho-the-official-directory-of-the-european-union'], 'conforms_to': ['http://example.org/spec'], + 'applicable_legislation': ['http://data.europa.eu/eli/reg_impl/2023/138/oj'], 'format': ['http://example.org/format'], 'identifier': 'service-123', 'language': ['http://publications.europa.eu/resource/authority/language/ENG'], @@ -523,6 +525,10 @@ def test_distribution_fields(self): self._get_typed_list(access_service.get('landing_page'), URIRef) if access_service.get( 'landing_page') else [] ) + self._assert_values_list( + g, object[2], DCATAP.applicableLegislation, + self._get_typed_list(access_service.get('applicable_legislation'), URIRef) if access_service.get('applicable_legislation') else [] + ) if access_service.get('keyword'): self._assert_values_list( From fbbd48ea80d4e7574493bffb211545361d8a65b3 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 16 Jul 2025 12:31:20 +0200 Subject: [PATCH 27/36] fix(dataservice (contact & creator)) fix mapping for creator and contact within Dataservice --- ckanext/dcat/profiles/base.py | 35 ++++++++++++ ckanext/dcat/profiles/euro_dcat_ap_2.py | 15 +++++ .../dcat/profiles/euro_dcat_ap_scheming.py | 41 +------------- ckanext/dcat/schemas/dcat_ap_full.yaml | 56 +++++++++++++++++++ ckanext/dcat/schemas/health_dcat_ap.yaml | 56 +++++++++++++++++++ .../test_euro_dcatap_2_profile_parse.py | 24 +++++++- .../test_euro_dcatap_2_profile_serialize.py | 13 ++++- 7 files changed, 199 insertions(+), 41 deletions(-) diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 299f74bc..5b0591e0 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -863,6 +863,41 @@ def _add_agent_to_graph(self, subject_ref, predicate, agent_dict): self.g.add((org_ref, FOAF.name, Literal(sub_org["name"]))) return agent_ref + + def _add_contact_to_graph(self, subject, predicate, contact): + contact_uri = contact.get("uri") + if contact_uri: + contact_details = CleanedURIRef(contact_uri) + else: + contact_details = BNode() + + self.g.add((contact_details, RDF.type, VCARD.Kind)) + self.g.add((subject, predicate, contact_details)) + + self._add_triple_from_dict(contact, contact_details, VCARD.fn, "name") + self._add_triple_from_dict( + contact, + contact_details, + VCARD.hasEmail, + "email", + _type=URIRef, + value_modifier=self._add_mailto, + ) + self._add_triple_from_dict( + contact, + contact_details, + VCARD.hasUID, + "identifier", + _type=URIRefOrLiteral, + ) + self._add_triple_from_dict( + contact, + contact_details, + VCARD.hasURL, + "url", + _type=URIRef, + ) + def _add_spatial_value_to_graph(self, spatial_ref, predicate, value): """ diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index d38a5ea4..2dd59d4e 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -214,6 +214,14 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): if values: access_service_dict[key] = values + contact_points = self._contact_details(access_service, DCAT.contactPoint) + if contact_points: + access_service_dict["contact"] = contact_points[0] + + creators = self._agents_details(access_service, DCT.creator) + if creators: + access_service_dict["creator"] = creators + # Access service URI (explicitly show the missing ones) access_service_dict["uri"] = ( str(access_service) @@ -486,6 +494,13 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): access_service_dict, access_service_node, items ) + contact_point_dict = access_service_dict.get("contact") + if contact_point_dict: + self._add_contact_to_graph(access_service_node, DCAT.contactPoint, contact_point_dict) + + for creator_dict in access_service_dict.get("creator", []): + self._add_agent_to_graph(access_service_node, DCT.creator, creator_dict) + # Extra list values for access services extra_items = [ ("conforms_to", DCT.conformsTo, None, URIRefOrLiteral), diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index bea68935..82282a9f 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -148,45 +148,8 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): Add triples to the graph from new repeating subfields """ contact = dataset_dict.get("contact") - if ( - isinstance(contact, list) - and len(contact) - and self._not_empty_dict(contact[0]) - ): - for item in contact: - contact_uri = item.get("uri") - if contact_uri: - contact_details = CleanedURIRef(contact_uri) - else: - contact_details = BNode() - - self.g.add((contact_details, RDF.type, VCARD.Kind)) - self.g.add((dataset_ref, DCAT.contactPoint, contact_details)) - - self._add_triple_from_dict(item, contact_details, VCARD.fn, "name") - # Add mail address as URIRef, and ensure it has a mailto: prefix - self._add_triple_from_dict( - item, - contact_details, - VCARD.hasEmail, - "email", - _type=URIRef, - value_modifier=self._add_mailto, - ) - self._add_triple_from_dict( - item, - contact_details, - VCARD.hasUID, - "identifier", - _type=URIRefOrLiteral, - ) - self._add_triple_from_dict( - item, - contact_details, - VCARD.hasURL, - "url", - _type=URIRef, - ) + for item in contact: + self._add_contact_to_graph(dataset_ref, DCAT.contactPoint, item) self._add_agents(dataset_ref, dataset_dict, "publisher", DCT.publisher) self._add_agents(dataset_ref, dataset_dict, "creator", DCT.creator) diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml index 48328a96..6290f399 100644 --- a/ckanext/dcat/schemas/dcat_ap_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_full.yaml @@ -582,6 +582,62 @@ resource_fields: validators: ignore_missing scheming_multiple_text help_text: The legislation that mandates the creation or management of the dataset. + - field_name: contact + label: Contact point + repeating_label: Contact point + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the contact point. Such as a ROR ID. + + - field_name: url + label: URL + help_text: A URL associated with the contact + help_text: Contact information for enquiries about the dataservice. + + - field_name: creator + label: Creator + repeating_label: Creator + repeating_subfields: + + - field_name: uri + label: URI + help_text: URI of the creator, if available. + + - field_name: name + label: Name + help_text: Name of the entity or person who created the dataservice. + + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact email of the creator. + + - field_name: url + label: URL + display_snippet: link.html + help_text: URL for more information about the creator. + + - field_name: type + label: Type + help_text: Type of creator (e.g., Organization, Person). + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the creator, such as an ORCID or ROR ID. + help_text: A data service that gives access to the resource. # Note: if not provided, this will be autogenerated diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index 8c7b9237..865010b7 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -750,6 +750,62 @@ resource_fields: validators: ignore_missing scheming_multiple_text help_text: The legislation that mandates the creation or management of the dataset. + - field_name: contact + label: Contact point + repeating_label: Contact point + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the contact point. Such as a ROR ID. + + - field_name: url + label: URL + help_text: A URL associated with the contact + help_text: Contact information for enquiries about the dataservice. + + - field_name: creator + label: Creator + repeating_label: Creator + repeating_subfields: + + - field_name: uri + label: URI + help_text: URI of the creator, if available. + + - field_name: name + label: Name + help_text: Name of the entity or person who created the dataservice. + + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact email of the creator. + + - field_name: url + label: URL + display_snippet: link.html + help_text: URL for more information about the creator. + + - field_name: type + label: Type + help_text: Type of creator (e.g., Organization, Person). + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the creator, such as an ORCID or ROR ID. + help_text: A data service that gives access to the resource. # Note: if not provided, this will be autogenerated diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py index 1f5f7b4f..dbab2a4f 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py @@ -30,7 +30,9 @@ def test_parse_access_service_extra_fields(self): xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:dcatap="http://data.europa.eu/r5r/" - xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:vcard="http://www.w3.org/2006/vcard/ns#" + xmlns:foaf="http://xmlns.com/foaf/0.1/"> @@ -45,6 +47,17 @@ def test_parse_access_service_extra_fields(self): keyword1 keyword2 + + + European Commission + + + + + John Doe + + + @@ -69,6 +82,15 @@ def test_parse_access_service_extra_fields(self): assert access_service['landing_page'] == ['http://example.org/landing'] assert access_service['applicable_legislation'] == ['http://data.europa.eu/eli/reg_impl/2023/138/oj'] assert sorted(access_service['keyword']) == ['keyword1', 'keyword2'] + + contact_point = access_service.get("contact") + assert isinstance(contact_point, dict) + assert contact_point.get("name") == "John Doe" + assert contact_point.get("email") == "john@example.org" + + creator = access_service.get("creator") + assert isinstance(creator, list) + assert creator[0].get("name") == "European Commission" def test_dataset_all_fields(self): diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py index 65400abd..941db2ed 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py @@ -426,7 +426,9 @@ def test_distribution_fields(self): 'language': ['http://publications.europa.eu/resource/authority/language/ENG'], 'rights': ['open use'], 'landing_page': ['http://example.org/landing'], - 'keyword': ['keyword1', 'keyword2'] + 'keyword': ['keyword1', 'keyword2'], + 'contact': {'name': 'John Doe', 'email': 'john@example.org'}, + 'creator': [{'name': 'European Commission'}], }, { 'availability': 'http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL', @@ -536,6 +538,15 @@ def test_distribution_fields(self): self._get_typed_list(access_service.get('keyword'), Literal) ) + if access_service.get('contact'): + contact = self._triple(g, object[2], DCAT.contactPoint, None)[2] + assert self._triple(g, contact, VCARD.fn, Literal('John Doe')) + assert self._triple(g, contact, VCARD.hasEmail, URIRef('mailto:john@example.org')) + + if access_service.get('creator'): + creators = self._triples(g, object[2], DCT.creator, None) + assert any(self._triple(g, c[2], FOAF.name, Literal('European Commission')) for c in creators) + # Lists self._assert_values_list(g, object[2], DCAT.endpointURL, self._get_typed_list(access_service.get('endpoint_url'), URIRef)) From 4d1e3a091d465afebafbc88703422d09c92ed82f Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 16 Jul 2025 15:46:24 +0200 Subject: [PATCH 28/36] add mapping + UT for description within dataservice --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 1 + ckanext/dcat/schemas/dcat_ap_full.yaml | 5 +++++ ckanext/dcat/schemas/health_dcat_ap.yaml | 5 +++++ .../profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py | 2 ++ .../dcat_ap_2/test_euro_dcatap_2_profile_serialize.py | 5 +++-- 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index 2dd59d4e..6f734039 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -193,6 +193,7 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): ("access_rights", DCT.accessRights), ("description", DCT.description), ("identifier", DCT.identifier), + ("description", DCT.description), ): value = self._object_value(access_service, predicate) if value: diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml index 6290f399..460c38f9 100644 --- a/ckanext/dcat/schemas/dcat_ap_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_full.yaml @@ -523,6 +523,11 @@ resource_fields: - field_name: title label: Title + - field_name: description + label: Description + form_snippet: markdown.html + help_text: A free-text account of the resource. + - field_name: endpoint_description label: Endpoint description diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index 865010b7..f5c29237 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -691,6 +691,11 @@ resource_fields: - field_name: title label: Title + - field_name: description + label: Description + form_snippet: markdown.html + help_text: A free-text account of the resource. + - field_name: endpoint_description label: Endpoint description diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py index dbab2a4f..0c23c1eb 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py @@ -47,6 +47,7 @@ def test_parse_access_service_extra_fields(self): keyword1 keyword2 + This SPARQL end point allow to directly query the EU Whoiswho content European Commission @@ -82,6 +83,7 @@ def test_parse_access_service_extra_fields(self): assert access_service['landing_page'] == ['http://example.org/landing'] assert access_service['applicable_legislation'] == ['http://data.europa.eu/eli/reg_impl/2023/138/oj'] assert sorted(access_service['keyword']) == ['keyword1', 'keyword2'] + assert access_service['description'] == 'This SPARQL end point allow to directly query the EU Whoiswho content' contact_point = access_service.get("contact") assert isinstance(contact_point, dict) diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py index 941db2ed..75693119 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py @@ -496,8 +496,9 @@ def test_distribution_fields(self): URIRef(access_service.get('license'))) self._assert_simple_value(g, object[2], DCT.title, Literal(access_service.get('title'))) - self._assert_simple_value(g, object[2], DCT.description, - Literal(access_service.get('description'))) + if access_service.get('description'): + self._assert_simple_value(g, object[2], DCT.description, + Literal(access_service.get('description'))) self._assert_simple_value(g, object[2], DCAT.endpointDescription, Literal(access_service.get('endpoint_description'))) From 2abc07c62331c334e065016445191468fd8f960b Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 16 Jul 2025 16:01:31 +0200 Subject: [PATCH 29/36] Add if check by contactpoint --- ckanext/dcat/profiles/euro_dcat_ap_scheming.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index 82282a9f..4a7db6f0 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -148,8 +148,9 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): Add triples to the graph from new repeating subfields """ contact = dataset_dict.get("contact") - for item in contact: - self._add_contact_to_graph(dataset_ref, DCAT.contactPoint, item) + if contact: + for item in contact: + self._add_contact_to_graph(dataset_ref, DCAT.contactPoint, item) self._add_agents(dataset_ref, dataset_dict, "publisher", DCT.publisher) self._add_agents(dataset_ref, dataset_dict, "creator", DCT.creator) From 7759e0781c8e0054b007f96498bcdaf23b1a3525 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 16 Jul 2025 18:44:38 +0200 Subject: [PATCH 30/36] Add modified, publisher, license and theme to dataservice --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 17 +++++++- ckanext/dcat/schemas/dcat_ap_full.yaml | 43 +++++++++++++++++++ ckanext/dcat/schemas/health_dcat_ap.yaml | 37 ++++++++++++++++ .../test_euro_dcatap_2_profile_parse.py | 16 +++++++ .../test_euro_dcatap_2_profile_serialize.py | 19 ++++++++ 5 files changed, 131 insertions(+), 1 deletion(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index 6f734039..65b8a47e 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -194,6 +194,7 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): ("description", DCT.description), ("identifier", DCT.identifier), ("description", DCT.description), + ("modified", DCT.modified), ): value = self._object_value(access_service, predicate) if value: @@ -210,6 +211,7 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): ("landing_page", DCAT.landingPage), ("keyword", DCAT.keyword), ("applicable_legislation", DCATAP.applicableLegislation), + ("theme", DCAT.theme), ): values = self._object_value_list(access_service, predicate) if values: @@ -218,6 +220,10 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): contact_points = self._contact_details(access_service, DCAT.contactPoint) if contact_points: access_service_dict["contact"] = contact_points[0] + + publishers = self._agents_details(access_service, DCT.publisher) + if publishers: + access_service_dict["publisher"] = publishers[0] creators = self._agents_details(access_service, DCT.creator) if creators: @@ -489,16 +495,24 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): RDFS.Resource, ), ("description", DCT.description, None, Literal), + ("modified", DCT.modified, None, Literal), ] - self._add_triples_from_dict( access_service_dict, access_service_node, items ) + if access_service_dict.get("modified"): + self._add_date_triple(access_service_node, DCT.modified, access_service_dict.get("modified")) + + contact_point_dict = access_service_dict.get("contact") if contact_point_dict: self._add_contact_to_graph(access_service_node, DCAT.contactPoint, contact_point_dict) + publisher_dict = access_service_dict.get("publisher") + if publisher_dict: + self._add_agent_to_graph(access_service_node, DCT.publisher, publisher_dict) + for creator_dict in access_service_dict.get("creator", []): self._add_agent_to_graph(access_service_node, DCT.creator, creator_dict) @@ -510,6 +524,7 @@ def _graph_from_dataset_v2(self, dataset_dict, dataset_ref): ("rights", DCT.rights, None, URIRefOrLiteral), ("landing_page", DCAT.landingPage, None, URIRefOrLiteral), ("applicable_legislation", DCATAP.applicableLegislation, None, URIRefOrLiteral, ELI.LegalResource), + ("theme", DCAT.theme, None, URIRefOrLiteral), ] self._add_list_triples_from_dict(access_service_dict, access_service_node, extra_items) diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml index 460c38f9..39a7ae67 100644 --- a/ckanext/dcat/schemas/dcat_ap_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_full.yaml @@ -643,6 +643,49 @@ resource_fields: label: Identifier help_text: Unique identifier for the creator, such as an ORCID or ROR ID. + - field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the publisher, such as a ROR ID. + help_text: Entity responsible for making the dataset available. + + - field_name: license + label: License + help_text: License in which the resource is made available. If not provided will be inherited from the dataset. + + - field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the dataset was changed, updated or modified. + + - field_name: theme + label: Theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A category of the dataservice. A Dataservice may be associated with multiple themes. + help_text: A data service that gives access to the resource. # Note: if not provided, this will be autogenerated diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index f5c29237..bfeb5791 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -811,6 +811,43 @@ resource_fields: label: Identifier help_text: Unique identifier for the creator, such as an ORCID or ROR ID. + - field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the publisher, such as a ROR ID. + help_text: Entity responsible for making the dataset available. + + - field_name: license + label: License + help_text: License in which the resource is made available. If not provided will be inherited from the dataset. + + - field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the dataset was changed, updated or modified. + help_text: A data service that gives access to the resource. # Note: if not provided, this will be autogenerated diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py index 0c23c1eb..6db5400d 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py @@ -53,6 +53,11 @@ def test_parse_access_service_extra_fields(self): European Commission + + + Publications Office of the European Union + + John Doe @@ -94,6 +99,10 @@ def test_parse_access_service_extra_fields(self): assert isinstance(creator, list) assert creator[0].get("name") == "European Commission" + publisher = access_service.get("publisher") + assert isinstance(publisher, dict) + assert publisher.get("name") == "Publications Office of the European Union" + def test_dataset_all_fields(self): temporal_resolution = 'P1D' @@ -152,6 +161,9 @@ def test_dataset_all_fields(self): + 2012-05-01T00:04:06 + + @@ -225,11 +237,15 @@ def test_dataset_all_fields(self): assert access_service.get('license') == 'http://publications.europa.eu/resource/authority/licence/COM_REUSE' assert access_service.get('access_rights') == 'http://publications.europa.eu/resource/authority/access-right/PUBLIC' assert access_service.get('description') == 'This SPARQL end point allow to directly query the EU Whoiswho content (organization / membership / person)' + assert access_service.get('modified') == '2012-05-01T00:04:06' # List endpoint_url_list = access_service.get('endpoint_url') assert len(endpoint_url_list) == 1 assert 'http://publications.europa.eu/webapi/rdf/sparql' in endpoint_url_list + theme_list = access_service.get('theme') + assert isinstance(theme_list, list) + assert sorted(theme_list) == ['http://example.org/theme/environment', 'http://example.org/theme/transport'] def test_availability_distibutions_without_uri(self): diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py index 75693119..dc5cba96 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_serialize.py @@ -429,6 +429,9 @@ def test_distribution_fields(self): 'keyword': ['keyword1', 'keyword2'], 'contact': {'name': 'John Doe', 'email': 'john@example.org'}, 'creator': [{'name': 'European Commission'}], + 'publisher': {'name': 'Publications Office of the European Union'}, + 'modified': '2024-01-01T12:00:00', + 'theme': ['http://example.org/theme/environment', 'http://example.org/theme/transport'], }, { 'availability': 'http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL', @@ -548,6 +551,22 @@ def test_distribution_fields(self): creators = self._triples(g, object[2], DCT.creator, None) assert any(self._triple(g, c[2], FOAF.name, Literal('European Commission')) for c in creators) + if access_service.get('publisher'): + publishers = self._triples(g, object[2], DCT.publisher, None) + assert any(self._triple(g, p[2], FOAF.name, Literal('Publications Office of the European Union')) for p in publishers) + + if access_service.get('modified'): + assert self._triple( + g, object[2], DCT.modified, + Literal(access_service.get('modified'), datatype=XSD.dateTime) + ) + + if access_service.get('theme'): + self._assert_values_list( + g, object[2], DCAT.theme, + self._get_typed_list(access_service.get('theme'), URIRef) + ) + # Lists self._assert_values_list(g, object[2], DCAT.endpointURL, self._get_typed_list(access_service.get('endpoint_url'), URIRef)) From 5762c35732ef8d02e0116b406d8051da57e2cfec Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 27 Aug 2025 22:26:36 +0200 Subject: [PATCH 31/36] fix(dataseries) Remove dataseries from pull request --- ckanext/dcat/harvesters/rdf.py | 112 ++++++------------- ckanext/dcat/processors.py | 45 +------- ckanext/dcat/profiles/euro_dcat_ap_3.py | 11 -- ckanext/dcat/profiles/euro_health_dcat_ap.py | 1 + 4 files changed, 35 insertions(+), 134 deletions(-) diff --git a/ckanext/dcat/harvesters/rdf.py b/ckanext/dcat/harvesters/rdf.py index 00a7f91a..a22e0b97 100644 --- a/ckanext/dcat/harvesters/rdf.py +++ b/ckanext/dcat/harvesters/rdf.py @@ -210,18 +210,39 @@ def gather_stage(self, harvest_job): return [] try: - source_dataset = model.Package.get(harvest_job.source.id) - - series_ids, series_mapping = self._parse_and_collect( - parser.dataset_series(), - source_dataset, - harvest_job, - guids_in_source, - is_series=True, - collect_series_mapping=True - ) - object_ids += series_ids - object_ids += self._parse_and_collect(parser.datasets(series_mapping), source_dataset, harvest_job, guids_in_source, is_series=False) + + source_dataset = model.Package.get(harvest_job.source.id) + + for dataset in parser.datasets(): + if not dataset.get('name'): + dataset['name'] = self._gen_new_name(dataset['title']) + if dataset['name'] in self._names_taken: + suffix = len([i for i in self._names_taken if i.startswith(dataset['name'] + '-')]) + 1 + dataset['name'] = '{}-{}'.format(dataset['name'], suffix) + self._names_taken.append(dataset['name']) + + # Unless already set by the parser, get the owner organization (if any) + # from the harvest source dataset + if not dataset.get('owner_org'): + if source_dataset.owner_org: + dataset['owner_org'] = source_dataset.owner_org + + # Try to get a unique identifier for the harvested dataset + guid = self._get_guid(dataset, source_url=source_dataset.url) + + if not guid: + self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset), + harvest_job) + continue + + dataset['extras'].append({'key': 'guid', 'value': guid}) + guids_in_source.append(guid) + + obj = HarvestObject(guid=guid, job=harvest_job, + content=json.dumps(dataset)) + + obj.save() + object_ids.append(obj.id) except Exception as e: self._save_gather_error('Error when processsing dataset: %r / %s' % (e, traceback.format_exc()), harvest_job) @@ -401,70 +422,3 @@ def import_stage(self, harvest_object): model.Session.commit() return True - - def _parse_and_collect( - self, - items, - source_dataset, - harvest_job, - guids_in_source, - is_series=False, - collect_series_mapping=False - ): - object_ids = [] - label = "dataset series" if is_series else "dataset" - series_mapping = {} if collect_series_mapping else None - - for item in items: - original_title = item.get("title", label) - if not item.get("name"): - item["name"] = self._gen_new_name(original_title) - - if item["name"] in self._names_taken: - suffix = len([i for i in self._names_taken if i.startswith(item["name"] + "-")]) + 1 - item["name"] = f"{item['name']}-{suffix}" - - self._names_taken.append(item["name"]) - - if not item.get("owner_org") and source_dataset.owner_org: - item["owner_org"] = source_dataset.owner_org - - guid = self._get_guid(item, source_url=source_dataset.url) - if not guid: - self._save_gather_error(f"Could not get a unique identifier for {label}: {item}", harvest_job) - continue - - item.setdefault("extras", []).append({"key": "guid", "value": guid}) - guids_in_source.append(guid) - - obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(item)) - obj.save() - object_ids.append(obj.id) - - # Store mapping of RDF URI to dataset name if requested - if collect_series_mapping: - series_uri = item.get("uri") or item.get("identifier") - if series_uri: - # Try to find an existing active dataset series by 'guid' match - existing = model.Session.query(model.Package).\ - join(model.PackageExtra).\ - filter(model.PackageExtra.key == 'guid').\ - filter(model.PackageExtra.value == series_uri).\ - filter(model.Package.type == 'dataset_series').\ - filter(model.Package.state == 'active').\ - first() - - if existing: - item["name"] = existing.name - - series_mapping[str(series_uri)] = { - "id": existing.id if existing else item.get("id"), - "name": item["name"] - } - - - if collect_series_mapping: - return object_ids, series_mapping - - return object_ids - diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index d255d582..79f35821 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -119,16 +119,6 @@ def _datasets(self): for dataset in self.g.subjects(RDF.type, DCAT.Dataset): yield dataset - def _dataset_series(self): - ''' - Generator that returns all DCAT dataset series on the graph - - Yields rdflib.term.URIRef objects that can be used on graph lookups - and queries - ''' - for dataset_series in self.g.subjects(RDF.type, DCAT.DatasetSeries): - yield dataset_series - def next_page(self): ''' Returns the URL of the next page or None if there is no next page @@ -183,7 +173,7 @@ def supported_formats(self): for plugin in rdflib.plugin.plugins(kind=rdflib.parser.Parser)]) - def datasets(self, series_mapping=None): + def datasets(self): ''' Generator that returns CKAN datasets parsed from the RDF graph @@ -203,39 +193,6 @@ def datasets(self, series_mapping=None): ) profile.parse_dataset(dataset_dict, dataset_ref) - # Add in_series if present in RDF and mapped - in_series = [] - for series_ref in self.g.objects(dataset_ref, DCAT.inSeries): - key = str(series_ref) - if series_mapping and key in series_mapping: - in_series.append(series_mapping[key]["id"]) - - if in_series: - dataset_dict["in_series"] = in_series - - yield dataset_dict - - - def dataset_series(self): - ''' - Generator that returns CKAN dataset series parsed from the RDF graph - - Each dataset series is passed to all the loaded profiles before being - yielded, so it can be further modified by each one of them. - - Returns a dataset series dict that can be passed to eg `package_create` - or `package_update` - ''' - for dataset_ref in self._dataset_series(): - dataset_dict = {} - for profile_class in self._profiles: - profile = profile_class( - self.g, - dataset_type=self.dataset_type, - compatibility_mode=self.compatibility_mode - ) - profile.parse_dataset(dataset_dict, dataset_ref) - yield dataset_dict diff --git a/ckanext/dcat/profiles/euro_dcat_ap_3.py b/ckanext/dcat/profiles/euro_dcat_ap_3.py index 64220430..a99cadfe 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_3.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_3.py @@ -30,17 +30,6 @@ def parse_dataset(self, dataset_dict, dataset_ref): # DCAT AP v2 scheming fields dataset_dict = self._parse_dataset_v2_scheming(dataset_dict, dataset_ref) - - # Check if it's a dataset series - if (dataset_ref, RDF.type, DCAT.DatasetSeries) in self.g: - dataset_dict["type"] = "dataset_series" - - # Example defaulting logic (adjust based on RDF vocab if you have it) - if "series_order_field" not in dataset_dict: - dataset_dict["series_order_field"] = "metadata_created" - if "series_order_type" not in dataset_dict: - dataset_dict["series_order_type"] = "date" - # DCAT AP v3: hasVersion values = self._object_value_list(dataset_ref, DCAT.hasVersion) if values: diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 253f4ee9..e461e5af 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -162,6 +162,7 @@ def _parse_retention_period(self, subject_ref): return [retention_dict] if retention_dict else [] + def graph_from_dataset(self, dataset_dict, dataset_ref): super().graph_from_dataset(dataset_dict, dataset_ref) for prefix, namespace in namespaces.items(): From 5cf6942dda2fe88ec5f58993e1d0f5185dbb3eb8 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 27 Aug 2025 22:30:49 +0200 Subject: [PATCH 32/36] Remove fluent extension tag --- .github/workflows/test.yml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5d3184b4..9f83c6d8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,9 +26,6 @@ jobs: - ckan-version: "2.10" ckan-image: "ckan/ckan-dev:2.10-py3.10" solr-version: "9" - - ckan-version: "2.9" - ckan-image: "ckan/ckan-dev:2.9-py3.9" - solr-version: "8" fail-fast: false name: CKAN ${{ matrix.ckan-version }} @@ -64,10 +61,6 @@ jobs: pip install -e . # Replace default path to CKAN core config file with the one on the container sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini - - name: Install requirements (2.9) - run: | - pip install -U pytest-rerunfailures - if: ${{ matrix.ckan-version == '2.9' }} - name: Setup other extensions run: | git clone https://github.com/ckan/ckanext-harvest @@ -75,7 +68,8 @@ jobs: pip install -r ckanext-harvest/requirements.txt git clone https://github.com/ckan/ckanext-scheming pip install -e ckanext-scheming - pip install git+https://github.com/ckan/ckanext-fluent.git@4e9340a#egg=ckanext-fluent + git clone https://github.com/ckan/ckanext-fluent + pip install -e ckanext-fluent git clone https://github.com/ckan/ckanext-dataset-series pip install -e ckanext-dataset-series - name: Setup extension @@ -83,4 +77,4 @@ jobs: ckan -c test.ini db init ckan -c test.ini db pending-migrations --apply - name: Run tests - run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests + run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests \ No newline at end of file From 4485715e7d5b0e65775aae9b6ae09b4c1dbbef3a Mon Sep 17 00:00:00 2001 From: Hans-Christian Date: Wed, 3 Sep 2025 14:47:23 +0200 Subject: [PATCH 33/36] Update health_dcat_ap.yaml --- ckanext/dcat/schemas/health_dcat_ap.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index bfeb5791..5181df94 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -733,8 +733,6 @@ resource_fields: - field_name: rights label: Rights - form_snippet: markdown.html - display_snippet: markdown.html preset: multiple_text validators: ignore_missing scheming_multiple_text From ba081d7eba69bca6041a56fe0e344f09e4456f40 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Thu, 11 Sep 2025 21:35:56 +0200 Subject: [PATCH 34/36] fix: Always store as list when complex object --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 4 ++-- .../dcat_ap_3/test_euro_dcatap_3_profile_serialize.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index 65b8a47e..fd0a6bc5 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -219,11 +219,11 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): contact_points = self._contact_details(access_service, DCAT.contactPoint) if contact_points: - access_service_dict["contact"] = contact_points[0] + access_service_dict["contact"] = contact_points publishers = self._agents_details(access_service, DCT.publisher) if publishers: - access_service_dict["publisher"] = publishers[0] + access_service_dict["publisher"] = publishers creators = self._agents_details(access_service, DCT.creator) if creators: diff --git a/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py index 17ad472f..7c06fa11 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py @@ -31,13 +31,13 @@ @pytest.mark.usefixtures("with_plugins", "clean_db") -@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets dataset_series") @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" ) @pytest.mark.ckan_config( "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml ckanext.dataset_series.schemas:presets.yaml " ) @pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_dcat_ap_3") class TestEuroDCATAP3ProfileSerializeDataset(BaseSerializeTest): From 419c364919ce4c4068973d1a8a8e2b620ae5a9cf Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Thu, 11 Sep 2025 21:43:59 +0200 Subject: [PATCH 35/36] fix: parse of creator and contact within acces service --- .../dcat_ap_2/test_euro_dcatap_2_profile_parse.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py index 6db5400d..94a1e541 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py @@ -90,18 +90,18 @@ def test_parse_access_service_extra_fields(self): assert sorted(access_service['keyword']) == ['keyword1', 'keyword2'] assert access_service['description'] == 'This SPARQL end point allow to directly query the EU Whoiswho content' - contact_point = access_service.get("contact") - assert isinstance(contact_point, dict) - assert contact_point.get("name") == "John Doe" - assert contact_point.get("email") == "john@example.org" + contact_points = access_service.get("contact") + assert isinstance(contact_points, list) + assert contact_points[0].get("name") == "John Doe" + assert contact_points[0].get("email") == "john@example.org" creator = access_service.get("creator") assert isinstance(creator, list) assert creator[0].get("name") == "European Commission" - publisher = access_service.get("publisher") - assert isinstance(publisher, dict) - assert publisher.get("name") == "Publications Office of the European Union" + publishers = access_service.get("publisher") + assert isinstance(publishers, list) + assert publishers[0].get("name") == "Publications Office of the European Union" def test_dataset_all_fields(self): From 03a5f889aac814068282acdf26f943eb70c475d9 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Fri, 12 Sep 2025 13:46:52 +0200 Subject: [PATCH 36/36] fix(croisant) point to mlcroisant version 1.0.22 --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 1883b253..aa725832 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,6 +1,6 @@ responses>=0.25.2 pyshacl -mlcroissant; python_version >= '3.10' +mlcroissant==1.0.21; python_version >= '3.10' mock pytest-ckan pytest-cov