From 552f3506d9c28cbf0038d0584b8fdeef80d0e052 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 27 Aug 2025 22:06:10 +0200 Subject: [PATCH 1/5] fix(fluent) because support of CKAN 2.9 is ended. Latest CKAN fluent extension can be used --- .github/workflows/test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5d3184b4..c960cf0f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -75,7 +75,8 @@ jobs: pip install -r ckanext-harvest/requirements.txt git clone https://github.com/ckan/ckanext-scheming pip install -e ckanext-scheming - pip install git+https://github.com/ckan/ckanext-fluent.git@4e9340a#egg=ckanext-fluent + git clone https://github.com/ckan/ckanext-fluent + pip install -e ckanext-fluent git clone https://github.com/ckan/ckanext-dataset-series pip install -e ckanext-dataset-series - name: Setup extension From 245aa597a6e52a5dcf983d1b4abdf18be94d56f2 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Fri, 12 Sep 2025 13:50:12 +0200 Subject: [PATCH 2/5] fix(croissant) pin mlcroissant to version 1.0.21 --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 1883b253..aa725832 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,6 +1,6 @@ responses>=0.25.2 pyshacl -mlcroissant; python_version >= '3.10' +mlcroissant==1.0.21; python_version >= '3.10' mock pytest-ckan pytest-cov From 347a2b2c0c9b2bc49d1b1cd3e2da7d5f86b3feb2 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 1 Oct 2025 14:17:05 +0200 Subject: [PATCH 3/5] put back data series --- ckanext/dcat/harvesters/rdf.py | 113 +++++++++++++++++------- ckanext/dcat/processors.py | 44 ++++++++- ckanext/dcat/profiles/euro_dcat_ap_3.py | 9 ++ 3 files changed, 135 insertions(+), 31 deletions(-) diff --git a/ckanext/dcat/harvesters/rdf.py b/ckanext/dcat/harvesters/rdf.py index a22e0b97..095203ef 
100644 --- a/ckanext/dcat/harvesters/rdf.py +++ b/ckanext/dcat/harvesters/rdf.py @@ -213,36 +213,24 @@ def gather_stage(self, harvest_job): source_dataset = model.Package.get(harvest_job.source.id) - for dataset in parser.datasets(): - if not dataset.get('name'): - dataset['name'] = self._gen_new_name(dataset['title']) - if dataset['name'] in self._names_taken: - suffix = len([i for i in self._names_taken if i.startswith(dataset['name'] + '-')]) + 1 - dataset['name'] = '{}-{}'.format(dataset['name'], suffix) - self._names_taken.append(dataset['name']) - - # Unless already set by the parser, get the owner organization (if any) - # from the harvest source dataset - if not dataset.get('owner_org'): - if source_dataset.owner_org: - dataset['owner_org'] = source_dataset.owner_org - - # Try to get a unique identifier for the harvested dataset - guid = self._get_guid(dataset, source_url=source_dataset.url) - - if not guid: - self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset), - harvest_job) - continue - - dataset['extras'].append({'key': 'guid', 'value': guid}) - guids_in_source.append(guid) - - obj = HarvestObject(guid=guid, job=harvest_job, - content=json.dumps(dataset)) - - obj.save() - object_ids.append(obj.id) + series_ids, series_mapping = self._parse_and_collect( + parser.dataset_series(), + source_dataset, + harvest_job, + guids_in_source, + is_series=True, + collect_series_mapping=True + ) + object_ids += series_ids + + object_ids += self._parse_and_collect( + parser.datasets(series_mapping), + source_dataset, + harvest_job, + guids_in_source, + is_series=False, + collect_series_mapping=False + ) except Exception as e: self._save_gather_error('Error when processsing dataset: %r / %s' % (e, traceback.format_exc()), harvest_job) @@ -422,3 +410,68 @@ def import_stage(self, harvest_object): model.Session.commit() return True + + def _parse_and_collect( + self, + items, + source_dataset, + harvest_job, + guids_in_source, + 
is_series=False, + collect_series_mapping=False + ): + object_ids = [] + label = "dataset series" if is_series else "dataset" + series_mapping = {} if collect_series_mapping else None + + for item in items: + original_title = item.get("title", label) + if not item.get("name"): + item["name"] = self._gen_new_name(original_title) + + if item["name"] in self._names_taken: + suffix = len([i for i in self._names_taken if i.startswith(item["name"] + "-")]) + 1 + item["name"] = f"{item['name']}-{suffix}" + + self._names_taken.append(item["name"]) + + if not item.get("owner_org") and source_dataset.owner_org: + item["owner_org"] = source_dataset.owner_org + + guid = self._get_guid(item, source_url=source_dataset.url) + if not guid: + self._save_gather_error(f"Could not get a unique identifier for {label}: {item}", harvest_job) + continue + + item.setdefault("extras", []).append({"key": "guid", "value": guid}) + guids_in_source.append(guid) + + obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(item)) + obj.save() + object_ids.append(obj.id) + + # Store mapping of RDF URI to dataset name if requested + if collect_series_mapping: + series_uri = item.get("uri") or item.get("identifier") + if series_uri: + # Try to find an existing active dataset series by 'guid' match + existing = model.Session.query(model.Package).\ + join(model.PackageExtra).\ + filter(model.PackageExtra.key == 'guid').\ + filter(model.PackageExtra.value == series_uri).\ + filter(model.Package.type == 'dataset_series').\ + filter(model.Package.state == 'active').\ + first() + + if existing: + item["name"] = existing.name + + series_mapping[str(series_uri)] = { + "id": existing.id if existing else item.get("id"), + "name": item["name"] + } + + if collect_series_mapping: + return object_ids, series_mapping + + return object_ids diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index 79f35821..fd403b53 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py 
@@ -119,6 +119,16 @@ def _datasets(self): for dataset in self.g.subjects(RDF.type, DCAT.Dataset): yield dataset + def _dataset_series(self): + ''' + Generator that returns all DCAT dataset series on the graph + + Yields rdflib.term.URIRef objects that can be used on graph lookups + and queries + ''' + for dataset_series in self.g.subjects(RDF.type, DCAT.DatasetSeries): + yield dataset_series + def next_page(self): ''' Returns the URL of the next page or None if there is no next page @@ -173,7 +183,7 @@ def supported_formats(self): for plugin in rdflib.plugin.plugins(kind=rdflib.parser.Parser)]) - def datasets(self): + def datasets(self, series_mapping=None): ''' Generator that returns CKAN datasets parsed from the RDF graph @@ -193,6 +203,38 @@ def datasets(self): ) profile.parse_dataset(dataset_dict, dataset_ref) + # Add in_series if present in RDF and mapped + in_series = [] + for series_ref in self.g.objects(dataset_ref, DCAT.inSeries): + key = str(series_ref) + if series_mapping and key in series_mapping: + in_series.append(series_mapping[key]["id"]) + + if in_series: + dataset_dict["in_series"] = in_series + + yield dataset_dict + + def dataset_series(self): + ''' + Generator that returns CKAN dataset series parsed from the RDF graph + + Each dataset series is passed to all the loaded profiles before being + yielded, so it can be further modified by each one of them. 
+ + Returns a dataset series dict that can be passed to eg `package_create` + or `package_update` + ''' + for dataset_ref in self._dataset_series(): + dataset_dict = {} + for profile_class in self._profiles: + profile = profile_class( + self.g, + dataset_type=self.dataset_type, + compatibility_mode=self.compatibility_mode + ) + profile.parse_dataset(dataset_dict, dataset_ref) + yield dataset_dict diff --git a/ckanext/dcat/profiles/euro_dcat_ap_3.py b/ckanext/dcat/profiles/euro_dcat_ap_3.py index a99cadfe..62449af6 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_3.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_3.py @@ -35,6 +35,15 @@ def parse_dataset(self, dataset_dict, dataset_ref): if values: dataset_dict["has_version"] = values + # Check if it's a dataset series + if (dataset_ref, RDF.type, DCAT.DatasetSeries) in self.g: + dataset_dict["type"] = "dataset_series" + + if "series_order_field" not in dataset_dict: + dataset_dict["series_order_field"] = "metadata_created" + if "series_order_type" not in dataset_dict: + dataset_dict["series_order_type"] = "date" + return dataset_dict def graph_from_dataset(self, dataset_dict, dataset_ref): From 0f1c13a7358cffa3c03722ef4a2daabe33dc2514 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 1 Oct 2025 14:56:19 +0200 Subject: [PATCH 4/5] Separate branch multilingual --- .gitignore | 2 + ckanext/dcat/profiles/base.py | 66 +- .../dcat/profiles/euro_dcat_ap_scheming.py | 45 +- ckanext/dcat/profiles/euro_health_dcat_ap.py | 69 +- .../schemas/health_dcat_ap_multilingual.yaml | 637 ++++++++++++++++++ .../test_euro_health_dcat_ap_profile_parse.py | 78 ++- ...t_euro_health_dcat_ap_profile_serialize.py | 166 +++++ docs/mapping-healthdcat.md | 1 + examples/dcat/dataset.ttl | 357 ++++++++++ examples/dcat/dataset_health.ttl | 16 +- examples/dcat/dataset_health_multilingual.ttl | 47 ++ 11 files changed, 1458 insertions(+), 26 deletions(-) create mode 100644 ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml create mode 100644 
examples/dcat/dataset.ttl create mode 100644 examples/dcat/dataset_health_multilingual.ttl diff --git a/.gitignore b/.gitignore index 7b7d96d3..af41b12f 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,5 @@ build/* tmp/* package/DEBIAN/control *.swp +.idea + diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 5b0591e0..2dcdabe5 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -533,10 +533,36 @@ def _agents_details(self, subject, predicate): """ agents = [] + default_locale = config.get("ckan.locale_default", "") or "" + default_lang = default_locale.split("_")[0] if default_locale else None + for agent in self.g.objects(subject, predicate): agent_details = {} agent_details["uri"] = str(agent) if isinstance(agent, term.URIRef) else "" - agent_details["name"] = self._object_value(agent, FOAF.name) + + names = list(self.g.objects(agent, FOAF.name)) + translations = {} + fallback_name = "" + for name_literal in names: + if isinstance(name_literal, Literal): + value = str(name_literal) + lang = name_literal.language + if lang: + translations[lang] = value + elif not fallback_name: + fallback_name = value + elif not fallback_name: + fallback_name = str(name_literal) + + if translations: + agent_details["name_translated"] = translations + if default_lang and translations.get(default_lang): + agent_details["name"] = translations[default_lang] + else: + agent_details["name"] = fallback_name or next(iter(translations.values())) + else: + agent_details["name"] = fallback_name + agent_details["email"] = self._without_mailto( self._object_value(agent, FOAF.mbox) ) @@ -839,8 +865,25 @@ def _add_agent_to_graph(self, subject_ref, predicate, agent_dict): self.g.add((agent_ref, RDF.type, FOAF.Organization)) self.g.add((agent_ref, RDF.type, FOAF.Agent)) + name_translated = agent_dict.get("name_translated") + translated_values = set() + if isinstance(name_translated, dict): + for lang, values in 
name_translated.items(): + if not values: + continue + if isinstance(values, (list, tuple)): + iterable = values + else: + iterable = [values] + for value in iterable: + if value: + self.g.add((agent_ref, FOAF.name, Literal(value, lang=lang))) + translated_values.add((lang, value)) + if agent_dict.get("name"): - self.g.add((agent_ref, FOAF.name, Literal(agent_dict["name"]))) + name_value = agent_dict["name"] + if not translated_values or all(val != name_value for _, val in translated_values): + self.g.add((agent_ref, FOAF.name, Literal(name_value))) if agent_dict.get("email"): email = agent_dict["email"] if not email.startswith("mailto:"): @@ -856,11 +899,26 @@ def _add_agent_to_graph(self, subject_ref, predicate, agent_dict): self.g.add((agent_ref, DCT.identifier, Literal(agent_dict["identifier"]))) for sub_org in agent_dict.get("actedOnBehalfOf", []): - if sub_org.get("name"): + if sub_org.get("name") or sub_org.get("name_translated"): org_ref = BNode() self.g.add((agent_ref, PROV.actedOnBehalfOf, org_ref)) self.g.add((org_ref, RDF.type, PROV.Organization)) - self.g.add((org_ref, FOAF.name, Literal(sub_org["name"]))) + + sub_translations = sub_org.get("name_translated", {}) or {} + if isinstance(sub_translations, dict): + for lang, values in sub_translations.items(): + if not values: + continue + if isinstance(values, (list, tuple)): + iterable = values + else: + iterable = [values] + for value in iterable: + if value: + self.g.add((org_ref, FOAF.name, Literal(value, lang=lang))) + + if sub_org.get("name"): + self.g.add((org_ref, FOAF.name, Literal(sub_org["name"]))) return agent_ref diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index 4a7db6f0..078bbc1f 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -70,6 +70,29 @@ def _parse_list_value(data_dict, field_name): except ValueError: pass + def _supports_agent_translations(field_name): + 
schema_field = self._schema_field(field_name) + if schema_field and "repeating_subfields" in schema_field: + return any( + subfield.get("field_name") == "name_translated" + for subfield in schema_field["repeating_subfields"] + ) + return False + + def _prune_agent_translations(agent_list): + pruned = [] + for agent_entry in agent_list: + if isinstance(agent_entry, dict): + agent_entry = dict(agent_entry) + agent_entry.pop("name_translated", None) + acted_lists = agent_entry.get("actedOnBehalfOf") + if isinstance(acted_lists, list): + agent_entry["actedOnBehalfOf"] = _prune_agent_translations(acted_lists) + pruned.append(agent_entry) + else: + pruned.append(agent_entry) + return pruned + for field_name in dataset_dict.keys(): _parse_list_value(dataset_dict, field_name) @@ -117,6 +140,8 @@ def _parse_list_value(data_dict, field_name): key, predicate = item agents = self._agents_details(dataset_ref, predicate) if agents: + if not _supports_agent_translations(key): + agents = _prune_agent_translations(agents) dataset_dict[key] = agents # Add any qualifiedRelations @@ -239,7 +264,25 @@ def _add_agents( self.g.add((agent_ref, RDF.type, FOAF.Agent)) self.g.add((dataset_ref, rdf_predicate, agent_ref)) - self._add_triple_from_dict(agent, agent_ref, FOAF.name, "name") + name_translated = agent.get("name_translated") + translated_values = set() + if isinstance(name_translated, dict): + for lang, values in name_translated.items(): + if not values: + continue + if isinstance(values, (list, tuple)): + iterable = values + else: + iterable = [values] + for value in iterable: + if value: + self.g.add((agent_ref, FOAF.name, Literal(value, lang=lang))) + translated_values.add((lang, value)) + + if agent.get("name"): + name_value = agent["name"] + if not translated_values or all(val != name_value for _, val in translated_values): + self.g.add((agent_ref, FOAF.name, Literal(name_value))) self._add_triple_from_dict( agent, agent_ref, FOAF.homepage, "url", _type=URIRef ) diff --git 
a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index e461e5af..b80a5fe6 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -23,6 +23,12 @@ "dpv": DPV, } +# HealthDCAT-AP fields that can contain language-tagged literals +MULTILINGUAL_LITERAL_FIELDS = { + "population_coverage": HEALTHDCATAP.populationCoverage, + "publisher_note": HEALTHDCATAP.publisherNote, +} + class EuropeanHealthDCATAPProfile(EuropeanDCATAP3Profile): """ @@ -42,7 +48,11 @@ def parse_dataset(self, dataset_dict, dataset_ref): return dataset_dict def _parse_health_fields(self, dataset_dict, dataset_ref): - self.__parse_healthdcat_stringvalues(dataset_dict, dataset_ref) + multilingual_fields = set(self._multilingual_dataset_fields()) + + self.__parse_healthdcat_stringvalues( + dataset_dict, dataset_ref, multilingual_fields + ) self.__parse_healthdcat_booleanvalues(dataset_dict, dataset_ref) self.__parse_healthdcat_intvalues(dataset_dict, dataset_ref) @@ -78,7 +88,9 @@ def __parse_healthdcat_intvalues(self, dataset_dict, dataset_ref): if value is not None: dataset_dict[key] = value - def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): + def __parse_healthdcat_stringvalues( + self, dataset_dict, dataset_ref, multilingual_fields + ): for (key, predicate,) in ( ("analytics", HEALTHDCATAP.analytics), ("code_values", HEALTHDCATAP.hasCodeValues), @@ -92,9 +104,18 @@ def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): ("publisher_type", HEALTHDCATAP.publisherType), ("purpose", DPV.hasPurpose), ): - values = self._object_value_list(dataset_ref, predicate) - if values: - dataset_dict[key] = values + if ( + key in MULTILINGUAL_LITERAL_FIELDS + and key in multilingual_fields + ): + value = self._object_value( + dataset_ref, predicate, multilingual=True + ) + else: + value = self._object_value_list(dataset_ref, predicate) + + if value: + dataset_dict[key] = value def 
__parse_healthdcat_booleanvalues(self, dataset_dict, dataset_ref): for key, predicate in ( @@ -169,25 +190,45 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): self.g.bind(prefix, namespace) # key, predicate, fallbacks, _type, _class - items = [ + list_items = [ ("analytics", HEALTHDCATAP.analytics, None, URIRefOrLiteral), ("code_values", HEALTHDCATAP.hasCodeValues, None, URIRefOrLiteral), ("coding_system", HEALTHDCATAP.hasCodingSystem, None, URIRefOrLiteral), ("health_category", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral), ("health_theme", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral), ("legal_basis", DPV.hasLegalBasis, None, URIRefOrLiteral), - ( - "population_coverage", - HEALTHDCATAP.populationCoverage, - None, - URIRefOrLiteral, - ), ("personal_data", DPV.hasPersonalData, None, URIRef), - ("publisher_note", HEALTHDCATAP.publisherNote, None, URIRefOrLiteral), ("publisher_type", HEALTHDCATAP.publisherType, None, URIRefOrLiteral), ("purpose", DPV.hasPurpose, None, URIRefOrLiteral), ] - self._add_list_triples_from_dict(dataset_dict, dataset_ref, items) + self._add_list_triples_from_dict(dataset_dict, dataset_ref, list_items) + + multilingual_fields = set(self._multilingual_dataset_fields()) + for key, predicate in MULTILINGUAL_LITERAL_FIELDS.items(): + value = self._get_dataset_value(dataset_dict, key) + if not value: + continue + + if key in multilingual_fields and isinstance(value, dict): + for lang, translated_value in value.items(): + if translated_value: + self.g.add( + ( + dataset_ref, + predicate, + Literal(translated_value, lang=lang), + ) + ) + continue + + self._add_triple_from_dict( + dataset_dict, + dataset_ref, + predicate, + key, + list_value=True, + _type=URIRefOrLiteral, + ) if "trusted_data_holder" in dataset_dict: self.g.add( diff --git a/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml b/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml new file mode 100644 index 00000000..a963f9a9 --- /dev/null +++ 
b/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml @@ -0,0 +1,637 @@ +scheming_version: 2 +dataset_type: dataset +about: Schema for HealthDCAT-AP with Fluent multilingual fields +about_url: http://github.com/ckan/ckanext-dcat + +form_languages: [en, nl, fr] + +dataset_fields: + +- field_name: title_translated + label: Title + preset: fluent_core_translated + required: true + help_text: A descriptive title for the dataset. + +- field_name: name + label: URL + preset: dataset_slug + form_placeholder: eg. my-dataset + +- field_name: notes_translated + label: Description + required: true + preset: fluent_core_translated + form_snippet: fluent_markdown.html + display_snippet: fluent_markdown.html + help_text: A free-text account of the dataset. + +- field_name: tags_translated + label: Keywords + preset: fluent_tags + form_placeholder: eg. economy, mental health, government + help_text: Keywords or tags describing the dataset. Use commas to separate multiple values. + +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: name_translated + label: Name (translations) + preset: fluent_core_translated + help_text: Name of the entity or person who published the dataset in each language. + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the contact point. Such as a ROR ID. + + help_text: Contact information for enquiries about the dataset. + +- field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: name_translated + label: Name (translations) + preset: fluent_core_translated + help_text: Name of the entity or person who published the dataset in each language. 
+ + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the publisher, such as a ROR ID. + help_text: Entity responsible for making the dataset available. + +- field_name: creator + label: Creator + repeating_label: Creator + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + help_text: URI of the creator, if available. + + - field_name: name + label: Name + help_text: Name of the entity or person who created the dataset. + + - field_name: name_translated + label: Name (translations) + preset: fluent_core_translated + help_text: Name of the entity or person who created the dataset in each language. + + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact email of the creator. + + - field_name: url + label: URL + display_snippet: link.html + help_text: URL for more information about the creator. + + - field_name: type + label: Type + help_text: Type of creator (e.g., Organization, Person). + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the creator, such as an ORCID or ROR ID. + +- field_name: license_id + label: License + form_snippet: license.html + help_text: License definitions and additional information can be found at http://opendefinition.org/. + +- field_name: owner_org + label: Organization + preset: dataset_organization + help_text: The CKAN organization the dataset belongs to. + +- field_name: url + label: Landing page + form_placeholder: http://example.com/dataset.json + display_snippet: link.html + help_text: Web page that can be navigated to gain access to the dataset, its distributions and/or additional information. 
+ + # Note: this will fall back to metadata_created if not present +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the dataset. + + # Note: this will fall back to metadata_modified if not present +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the dataset was changed, updated or modified. + +- field_name: version + label: Version + validators: ignore_missing unicode_safe package_version_validator + help_text: Version number or other version designation of the dataset. + +- field_name: version_notes + label: Version notes + preset: fluent_markdown + help_text: A description of the differences between this version and a previous version of the dataset. + + # Note: CKAN will generate a unique identifier for each dataset +- field_name: identifier + label: Identifier + help_text: A unique identifier of the dataset. + +- field_name: frequency + label: Frequency + help_text: The frequency at which dataset is published. + +- field_name: provenance + label: Provenance + preset: fluent_markdown + help_text: A statement about the lineage of the dataset. + +- field_name: dcat_type + label: Type + help_text: The type of the dataset. + # TODO: controlled vocabulary? + +- field_name: temporal_coverage + label: Temporal coverage + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + help_text: The temporal period or periods the dataset covers. + +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the dataset. 
+ +- field_name: spatial_coverage + label: Spatial coverage + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: text + label: Label + + - field_name: geom + label: Geometry + + - field_name: bbox + label: Bounding Box + + - field_name: centroid + label: Centroid + help_text: A geographic region that is covered by the dataset. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in a dataset, measured in meters. + +- field_name: access_rights + label: Access rights + validators: ignore_missing unicode_safe + help_text: Information that indicates whether the dataset is Open Data, has access restrictions or is not public. + +- field_name: alternate_identifier + label: Other identifier + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: This property refers to a secondary identifier of the dataset, such as MAST/ADS, DataCite, DOI, etc. + +- field_name: theme + label: Theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A category of the dataset. A Dataset may be associated with multiple themes. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the dataset. + # TODO: language form snippet / validator / graph + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this dataset. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An implementing rule or other specification that the dataset follows. 
+ +- field_name: is_referenced_by + label: Is referenced by + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A related resource, such as a publication, that references, cites, or otherwise points to the dataset. + +- field_name: analytics + label: Analytics + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + An analytics distribution of the dataset. + Publishers are encouraged to provide URLs pointing to API endpoints or document + repositories where users can access or request associated resources such as + technical reports of the dataset, quality measurements, usability indicators,... + or analytics services. + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the dataset. + +- field_name: has_version + label: Has version + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_inline: true + help_text: This property refers to a related Dataset that is a version, edition, or adaptation of the described Dataset. + + +- field_name: code_values + label: Code values + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Health classifications and their codes associated with the dataset. + +- field_name: coding_system + label: Coding system + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + Coding systems in use (e.g. ICD-10-CM, DGRs, SNOMED CT, ...). + To comply with HealthDCAT-AP, Wikidata URIs MUST be used. + +- field_name: purpose + label: Purpose + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A free text statement of the purpose of the processing of data or personal data. 
+ +- field_name: health_category + label: Health category + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + The health category to which this dataset belongs as described in the Commission Regulation on + the European Health Data Space laying down a list of categories of electronic data for + secondary use, Art.33. + +- field_name: health_theme + label: Health theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A category of the Dataset or tag describing the Dataset. + +- field_name: legal_basis + label: Legal basis + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legal basis used to justify processing of personal data. + +- field_name: min_typical_age + label: Minimum typical age + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Minimum typical age of the population within the dataset. + +- field_name: max_typical_age + label: Maximum typical age + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Maximum typical age of the population within the dataset. + +- field_name: number_of_records + label: Number of records + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Size of the dataset in terms of the number of records + +- field_name: number_of_unique_individuals + label: Number of records for unique individuals. + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Number of records for unique individuals. + +- field_name: personal_data + label: Personal data + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Key elements that represent an individual in the dataset. + +- field_name: publisher_note + label: Publisher note + preset: fluent_markdown + help_text: > + A description of the publisher activities. 
+ +- field_name: publisher_type + label: Publisher type + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A type of organisation that makes the Dataset available. + +- field_name: trusted_data_holder + label: Trusted Data Holder + preset: select + choices: + - value: false + label: "No" + - value: true + label: "Yes" + validators: ignore_missing boolean_validator + help_text: > + Indicates whether the dataset is held by a trusted data holder. + output_validators: boolean_validator + +- field_name: population_coverage + label: Population coverage + preset: fluent_markdown + help_text: > + A definition of the population within the dataset. + +- field_name: retention_period + label: Retention period + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + + help_text: A temporal period which the dataset is available for secondary use. + + +# Officially there can only be one HDAB for now, but keep it repeating subfield just in case +- field_name: hdab + label: Health data access body + repeating_label: Health data access body + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: name_translated + label: Name (translations) + preset: fluent_core_translated + help_text: Name of the health data access body in each language. + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the HDAB, such as a ROR ID. + help_text: Health Data Access Body supporting access to data in the Member State. 
+ +- field_name: qualified_relation + label: Qualified relation + repeating_label: Relationship + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: relation + label: Relation + help_text: The resource related to the source resource. + + - field_name: role + label: Role + help_text: The function of an entity or agent with respect to another entity or resource. + help_text: A description of a relationship with another resource. + +# Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this dataset (if not provided it will be autogenerated). + +# TODO: relation-based properties are not yet included (e.g. is_version_of, source, sample, etc) +# +resource_fields: + +- field_name: url + label: URL + preset: resource_url_upload + +- field_name: name_translated + label: Name + preset: fluent_core_translated + help_text: A descriptive title for the resource. + +- field_name: description_translated + label: Description + preset: fluent_core_translated + form_snippet: fluent_markdown.html + display_snippet: fluent_markdown.html + help_text: A free-text account of the resource. + +- field_name: format + label: Format + preset: resource_format_autocomplete + help_text: File format. If not provided it will be guessed. + +- field_name: mimetype + label: Media type + validators: if_empty_guess_format ignore_missing unicode_safe + help_text: Media type for this format. If not provided it will be guessed. + +- field_name: compress_format + label: Compress format + help_text: The format of the file in which the data is contained in a compressed form. + +- field_name: package_format + label: Package format + help_text: The format of the file in which one or more data files are grouped together. 
+ +- field_name: size + label: Size + validators: ignore_missing int_validator + form_snippet: number.html + display_snippet: file_size.html + help_text: File size in bytes + +- field_name: hash + label: Hash + help_text: Checksum of the downloaded file. + +- field_name: hash_algorithm + label: Hash Algorithm + help_text: Algorithm used to calculate to checksum. + +- field_name: rights + label: Rights + preset: fluent_markdown + help_text: Some statement about the rights associated with the resource. + +- field_name: availability + label: Availability + help_text: Indicates how long it is planned to keep the resource available. + +- field_name: status + label: Status + preset: select + choices: + - value: http://purl.org/adms/status/Completed + label: Completed + - value: http://purl.org/adms/status/UnderDevelopment + label: Under Development + - value: http://purl.org/adms/status/Deprecated + label: Deprecated + - value: http://purl.org/adms/status/Withdrawn + label: Withdrawn + help_text: The status of the resource in the context of maturity lifecycle. + +- field_name: license + label: License + help_text: License in which the resource is made available. If not provided will be inherited from the dataset. + + # Note: this falls back to the standard resource url field +- field_name: access_url + label: Access URL + help_text: URL that gives access to the dataset (defaults to the standard resource URL). + + # Note: this falls back to the standard resource url field +- field_name: download_url + label: Download URL + display_snippet: link.html + help_text: URL that provides a direct link to a downloadable file (defaults to the standard resource URL). + +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the resource. + +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the resource was changed, updated or modified. 
+ +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the distribution. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in the distribution, measured in meters. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the resource. + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this resource. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An established schema to which the described resource conforms. + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the resource. + +- field_name: access_services + label: Access services + repeating_label: Access service + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: title + label: Title + + - field_name: endpoint_description + label: Endpoint description + + - field_name: endpoint_url + label: Endpoint URL + preset: multiple_text + + - field_name: serves_dataset + label: Serves dataset + preset: multiple_text + validators: ignore_missing scheming_multiple_text + + - field_name: access_rights + label: Access rights + validators: ignore_missing unicode_safe + help_text: Information regarding access or restrictions based on privacy, security, or other policies. + + help_text: A data service that gives access to the resource. 
+ + # Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this resource (if not provided it will be autogenerated). diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 2d907f0f..1d7fdc9a 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -182,6 +182,7 @@ def test_e2e_dcat_to_ckan(self): "end": "2034-12-31", } ] + assert dataset["resources"][0]["retention_period"] == [ { "start": "2020-03-01", @@ -197,13 +198,21 @@ def test_e2e_dcat_to_ckan(self): "startedAtTime": "2021-01-01T00:00:00+00:00", "wasAssociatedWith": [{ "name": "Dr. Joris van Loenhout", + "name_translated": { + "en": "Dr. Joris van Loenhout", + "nl": "Dr. Joris van Loenhout", + }, "url": "https://www.sciensano.be/fr/people/joris-van-loenhout", "email": "Joris.VanLoenhout@sciensano.be", "type": "", "uri": "", "identifier": "", "actedOnBehalfOf": [{ - "name": "Contact Point" + "name": "Contact Point", + "name_translated": { + "en": "Contact Point", + "nl": "Contactpunt", + }, }] }] }] @@ -212,6 +221,8 @@ def test_e2e_dcat_to_ckan(self): agent = dataset["qualified_attribution"][0]["agent"][0] assert agent["name"] == "Contact Point" + assert agent["name_translated"]["en"] == "Contact Point" + assert agent["name_translated"]["nl"] == "Contactpunt" assert agent["email"] == "healthdata@sciensano.be" assert agent["url"] == "https://healthdata.be" assert agent["type"] == "" @@ -222,3 +233,68 @@ def test_e2e_dcat_to_ckan(self): assert dataset["quality_annotation"][0]["body"] == "https://certificates.theodi.org/en/datasets/393/certificate" assert dataset["quality_annotation"][0]["target"] == "https://certificates.theodi.org/en/datasets/393" assert 
dataset["quality_annotation"][0]["motivated_by"] == "http://www.w3.org/ns/dqv#qualityAssessment" + + + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets fluent") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", + "ckanext.dcat.schemas:health_dcat_ap_multilingual.yaml", +) +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml ckanext.fluent:presets.json", +) +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") +class TestSchemingFluentParseSupport(BaseParseTest): + def test_e2e_dcat_to_ckan_multilingual(self): + contents = self._get_file_contents("dcat/dataset_health_multilingual.ttl") + + parser = RDFParser() + parser.parse(contents, _format="turtle") + + datasets = list(parser.datasets()) + assert len(datasets) == 1 + + dataset_dict = datasets[0] + dataset_dict["name"] = "test-dcat-health-multilingual" + + dataset = call_action("package_create", **dataset_dict) + + assert dataset["title_translated"]["en"] == "Health dataset" + assert dataset["title_translated"]["nl"] == "Gezondheidsdataset" + + assert dataset["notes_translated"]["en"] == "A dataset with multilingual metadata" + assert dataset["notes_translated"]["nl"] == "Een dataset met meertalige metadata" + + assert dataset["tags_translated"]["en"] == ["health"] + assert dataset["tags_translated"]["nl"] == ["gezondheid"] + + assert dataset["population_coverage"]["en"] == "Population coverage in English" + assert dataset["population_coverage"]["nl"] == "Populatiedekking in het Nederlands" + + assert dataset["publisher_note"]["en"] == "Publisher note in English" + assert dataset["publisher_note"]["nl"] == "Notitie van de uitgever in het Nederlands" + + publisher = dataset["publisher"][0] + assert publisher["name_translated"]["en"] == "Health Institute" + assert publisher["name_translated"]["nl"] == "Gezondheidsinstituut" + + creator = 
dataset["creator"][0] + assert creator["name_translated"]["en"] == "Health Creator" + assert creator["name_translated"]["nl"] == "Gezondheidsmaker" + + resource = dataset["resources"][0] + + assert resource["name_translated"]["en"] == "CSV extract" + assert resource["name_translated"]["nl"] == "CSV-uitvoer" + + assert resource["description_translated"]["en"] == "Distribution description in English" + assert ( + resource["description_translated"]["nl"] + == "Beschrijving van de distributie in het Nederlands" + ) + + assert resource["rights"]["en"] == "Rights statement" + assert resource["rights"]["nl"] == "Rechtenverklaring" diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py index 0c523189..2a96564b 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -201,3 +201,169 @@ def test_e2e_ckan_to_dcat(self): Literal(distribution_details["retention_period"][0]["end"], datatype=XSD.date) ) + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets fluent") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", + "ckanext.dcat.schemas:health_dcat_ap_multilingual.yaml", +) +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml ckanext.fluent:presets.json", +) +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") +class TestEuroDCATAP3ProfileSerializeDatasetFluent(BaseSerializeTest): + def test_e2e_ckan_to_dcat_multilingual(self): + dataset_dict = { + "name": "health-dcat-fluent", + "title_translated": { + "en": "Health dataset", + "nl": "Gezondheidsdataset", + }, + "notes_translated": { + "en": "A dataset with multilingual metadata", + "nl": 
"Een dataset met meertalige metadata", + }, + "tags_translated": { + "en": ["health"], + "nl": ["gezondheid"], + }, + "population_coverage": { + "en": "Population coverage in English", + "nl": "Populatiedekking in het Nederlands", + }, + "publisher_note": { + "en": "Publisher note in English", + "nl": "Notitie van de uitgever in het Nederlands", + }, + "publisher": [ + { + "name": "Health Institute", + "name_translated": { + "en": "Health Institute", + "nl": "Gezondheidsinstituut", + }, + "email": "info@example.com", + "url": "https://healthdata.nl", + } + ], + "creator": [ + { + "name": "Health Creator", + "name_translated": { + "en": "Health Creator", + "nl": "Gezondheidsmaker", + }, + "email": "creator@example.com", + } + ], + "resources": [ + { + "url": "http://example.test/dataset/1/resource.csv", + "name_translated": { + "en": "CSV extract", + "nl": "CSV-uitvoer", + }, + "description_translated": { + "en": "Distribution description in English", + "nl": "Beschrijving van de distributie in het Nederlands", + }, + "rights": { + "en": "Rights statement", + "nl": "Rechtenverklaring", + }, + } + ], + } + + dataset = call_action("package_create", **dataset_dict) + + serializer = RDFSerializer() + graph = serializer.g + dataset_ref = serializer.graph_from_dataset(dataset) + + assert self._triple(graph, dataset_ref, DCT.title, "Health dataset", lang="en") + assert self._triple( + graph, dataset_ref, DCT.title, "Gezondheidsdataset", lang="nl" + ) + + assert self._triple( + graph, + dataset_ref, + HEALTHDCATAP.populationCoverage, + "Population coverage in English", + lang="en", + ) + assert self._triple( + graph, + dataset_ref, + HEALTHDCATAP.populationCoverage, + "Populatiedekking in het Nederlands", + lang="nl", + ) + + assert self._triple( + graph, + dataset_ref, + HEALTHDCATAP.publisherNote, + "Publisher note in English", + lang="en", + ) + assert self._triple( + graph, + dataset_ref, + HEALTHDCATAP.publisherNote, + "Notitie van de uitgever in het Nederlands", + 
lang="nl", + ) + + publisher_ref = next(graph.objects(dataset_ref, DCT.publisher)) + assert self._triple( + graph, publisher_ref, FOAF.name, "Health Institute", lang="en" + ) + assert self._triple( + graph, publisher_ref, FOAF.name, "Gezondheidsinstituut", lang="nl" + ) + + creator_ref = next(graph.objects(dataset_ref, DCT.creator)) + assert self._triple( + graph, creator_ref, FOAF.name, "Health Creator", lang="en" + ) + assert self._triple( + graph, creator_ref, FOAF.name, "Gezondheidsmaker", lang="nl" + ) + + distribution_ref = self._triple( + graph, dataset_ref, DCAT.distribution, None + )[2] + + assert self._triple( + graph, distribution_ref, DCT.title, "CSV extract", lang="en" + ) + assert self._triple( + graph, distribution_ref, DCT.title, "CSV-uitvoer", lang="nl" + ) + + assert self._triple( + graph, + distribution_ref, + DCT.description, + "Distribution description in English", + lang="en", + ) + assert self._triple( + graph, + distribution_ref, + DCT.description, + "Beschrijving van de distributie in het Nederlands", + lang="nl", + ) + + rights_node = next(graph.objects(distribution_ref, DCT.rights)) + assert self._triple( + graph, rights_node, RDFS.label, "Rights statement", lang="en" + ) + assert self._triple( + graph, rights_node, RDFS.label, "Rechtenverklaring", lang="nl" + ) diff --git a/docs/mapping-healthdcat.md b/docs/mapping-healthdcat.md index 6285fa90..22301a11 100644 --- a/docs/mapping-healthdcat.md +++ b/docs/mapping-healthdcat.md @@ -33,6 +33,7 @@ Example value could be: dpv:ResearchAndDevelopment. | - All `list` values are exported using `rdf:List`, supporting multi-valued entries. - `hdab` is parsed as an `foaf:Agent` and may include structured details. - `retention_period` expects a nested dictionary like `{ "start": , "end": }`. 
+- When language-specific literals are needed (eg `population_coverage`, `publisher_note`, `title`, resource `rights`), enable the Fluent-aware schema `ckanext.dcat.schemas:health_dcat_ap_multilingual.yaml` together with the `fluent` plugin and include `ckanext.fluent:presets.json` in `scheming.presets`. This ensures translated values round-trip when harvesting and serializing HealthDCAT-AP content. !!! Note See [EuropeanHealthDCATAPProfile](https://github.com/ckan/ckanext-dcat/blob/master/ckanext/dcat/profiles/euro_health_dcat_ap.py) for implementation details. diff --git a/examples/dcat/dataset.ttl b/examples/dcat/dataset.ttl new file mode 100644 index 00000000..75db7e54 --- /dev/null +++ b/examples/dcat/dataset.ttl @@ -0,0 +1,357 @@ +@prefix adms: . +@prefix dcat: . +@prefix dcatap: . +@prefix dct: . +@prefix dqv: . +@prefix foaf: . +@prefix locn: . +@prefix oa: . +@prefix prov: . +@prefix rdfs: . +@prefix skos: . +@prefix spdx: . +@prefix vcard: . + + + a dcat:Resource , dcat:Dataset; + dcatap:applicableLegislation ; + + ; + + , + ; + , + ; + + [ a foaf:Organization; + foaf:homepage ; + foaf:mbox ; + foaf:name "EU Health Data Access Body"@en, "EU Health Data Access Body"@nl ; + ]; + + , , , ; + + , ; + + "110"^^; + + "0"^^; + + "123456789"^^; + + "7654321"^^; + + "This example includes a very non-descript population"@en, "Dit voorbeeld bevat een zeer nietszeggende populatie"@nl ; + + "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation."@en, + "Health-RI is het Nederlandse gezondheidszorginitiatief om een geïntegreerde gezondheidsdatainfrastructuur voor onderzoek en innovatie op te bouwen."@nl ; + + ; + + "true"^^; + + [ a dct:PeriodOfTime; + rdfs:comment "As stated in the CSI deliberation"; + dcat:endDate "2034-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dct:accessRights ; + dct:accrualPeriodicity ; + dct:alternative "TEST-DATASET"; + dct:conformsTo ; + dct:creator ; + dct:description 
"This dataset is an example of using HealthDCAT-AP in CKAN"@en, + "Deze dataset is een voorbeeld van het gebruik van HealthDCAT-AP in CKAN"@nl ; + dct:identifier "http://example.com/dataset/1234567890"^^; + dct:isPartOf ; + dct:isReferencedBy , ; + dct:issued "2024-01-01T00:00:00Z"^^; + dct:language , , ; + dct:modified "2024-12-31T23:59:59Z"^^; + dct:provenance [ a dct:ProvenanceStatement; + rdfs:label "This example dataset is partly sourced from TEHDAS2" + ]; + dct:publisher [ a foaf:Organization , foaf:Agent; + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point"@en, "Contactpunt"@nl ; + ]; + dct:relation ; + dcat:qualifiedRelation [ + a dcat:Relationship ; + dct:relation ; + dcat:hadRole + ]; + dct:spatial ; + dct:temporal [ a dct:PeriodOfTime; + dcat:endDate "2024-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dct:title "HealthDCAT-AP test dataset"@en, "HealthDCAT-AP test dataset"@nl ; + dct:type [ a skos:Concept; + skos:inScheme ; + skos:prefLabel "Personal Data" + ]; + adms:identifier ; + adms:sample ; + adms:versionNotes "Dataset continuously updated"@en, "Dataset continue bijgewerkt"@nl ; + dcat:contactPoint ; + dcat:distribution ; + dcat:hasVersion ; + dcat:keyword "Test 1"@en , "Test 2"@en , "Test 3"@nl ; + dcat:spatialResolutionInMeters "10"^^; + dcat:temporalResolution "P1D"^^; + dcat:theme ; + # dcat:version is not mapped in ckan and should be hasVersion + # dcat:version "Project HDBP0250"; + dqv:hasQualityAnnotation [ a dqv:QualityCertificate; + oa:hasBody ; + oa:hasTarget ; + oa:motivatedBy dqv:qualityAssessment + ]; + prov:qualifiedAttribution ; + prov:wasGeneratedBy ; + foaf:page [ a foaf:Document; + rdfs:label "Landing Page for Sciensano"; + foaf:homepage + ]; + + ; + + , + , + ; + + ; + adms:status ; + dcat:inSeries . 
+# still to add: dct:source + + + a dcat:DatasetSeries ; + dcatap:applicableLegislation ; + dcat:contactPoint [ + a vcard:Kind ; + vcard:hasURL ; + vcard:hasEmail ; + vcard:fn "Test Example" ; + ] ; + dct:description "This is an example dataset series with dummy data" ; + dct:accrualPeriodicity ; + dct:spatial ; + dct:modified "2025-09-10T15:00:00Z"^^ ; + dct:publisher [ + a foaf:Agent ; + dct:spatial ; + foaf:mbox ; + dct:identifier "test" ; + foaf:name "Test Example" ; + healthdcatap:publisherNote "Example note" ; + healthdcatap:publisherType ; + dct:type ; + foaf:homepage ; + ] ; + dct:issued "2025-09-01T12:00:00Z"^^< ; + dct:temporal [ + a dct:PeriodOfTime; + dcat:endDate "2024-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dct:title "Example dataset series"@en, "Voorbeeld dataset serie"@nl . + + + a dcat:Distribution; + dcatap:applicableLegislation ; + dct:format ; + dct:identifier "http://ehelse.healthdataportal.eu/analytics/47f55653-a151-48c1-8d90-940561da6e57"; + dct:isPartOf ; + dct:issued "2024-06-03T08:51:00Z"^^; + dct:license ; + dct:modified "2024-06-04T18:00:00Z"^^; + dct:rights [ a dct:RightsStatement; + rdfs:label "_g_L202C11377" , "internalURI:wasGeneratedBy0" , "_g_L123C7733" + ]; + dct:title "Technical report number of unique study subjects available by environment for project HDBP0250"@en, + "Technisch rapport aantal unieke studiepersonen beschikbaar per omgeving voor project HDBP0250"@nl ; + dcat:accessURL ; + + [ a dct:PeriodOfTime; + rdfs:comment "As stated in the CSI deliberation"; + dcat:endDate "2034-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dcat:downloadURL ; + dcat:mediaType ; + dcat:accessService [ + a dcat:DataService; + dct:conformsTo ; + dct:format ; + dct:identifier "service-123"; + dct:language ; + dct:rights "open use"; + dcat:landingPage ; + dcat:keyword "keyword1"@en, "trefwoord2"@nl ; + dcat:contactPoint [ + a vcard:Kind ; + vcard:hasURL ; + vcard:hasEmail ; + vcard:fn "Test Example" ; + ] ; + dct:creator [ + a 
foaf:Agent ; + dct:spatial ; + foaf:mbox ; + dct:identifier "test" ; + foaf:name "Test Example" ; + healthdcatap:publisherNote "Example note" ; + healthdcatap:publisherType ; + dct:type ; + foaf:homepage ; + ] ; + ] ; + foaf:page ; + dct:language ; + dct:conformsTo . + + + a dct:MediaType . + + + a foaf:Agent; + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point"@en, "Contactpunt"@nl . + + + a adms:Identifier; + skos:notation "https://www.healthinformationportal.eu/health-information-sources/linking-registers-covid-19-vaccine-surveillance"^^; + adms:schemaAgency "Health Information Portal" . + + + a vcard:Organization , vcard:Kind; + vcard:fn "Contact Point"; + vcard:hasEmail ; + vcard:hasURL ; + vcard:organisationName "Contact Point"; + vcard:organisationUnit "Health Information" . + + + a dcat:CatalogRecord; + dct:creator ; + dct:identifier "16e16149-bf41-42f6-8741-225e8c97a35e"; + dct:issued "2024-10-04T14:28:36Z"^^; + dct:modified "2024-10-09T17:34:28Z"^^; + spdx:checksum [ a spdx:Checksum; + spdx:algorithm spdx:checksumAlgorithm_md5; + spdx:checksumValue "ea77c251b6945e450ae4d66c581495d4" + ]; + foaf:primaryTopic . + + + + a dct:LinguisticSystem . + + + a ; + dct:title "ID_TU_STATBEL_POP"; + + ; + dcat:keyword "TEST-DATASET" . + + + a dcat:Distribution; + dcatap:applicableLegislation ; + dct:format ; + dct:identifier "http://ehelse.healthdataportal.eu/sample/fe921169-4619-4386-8bfe-60ea131dbe96"; + dct:isPartOf ; + dct:issued "2024-06-03T08:51:00Z"^^; + dct:language ; + dct:license ; + dct:modified "2024-06-04T18:00:00Z"^^; + dct:rights [ a dct:RightsStatement; + rdfs:label "Free access." + ]; + dct:title "Proxy data generating for the EHDS2 Pilot project Sciensano Use Case"@en, + "Proxygegevens gegenereerd voor het EHDS2-pilotproject Sciensano Use Case"@nl; + dcat:accessURL ; + dcat:downloadURL ; + dcat:mediaType . + + + + a dct:LinguisticSystem . + + + a dct:LinguisticSystem . + + + a skos:Concept; + skos:prefLabel "National Public Health Institute" . 
+ + + a dct:RightsStatement . + + + a dct:Frequency . + + + a prov:Attribution; + dcat:hadRole ; + prov:agent [ a foaf:Organization; + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point"@en, "Contactpunt"@nl ; + ] . + + + a dct:Location . + + + a skos:Concept; + dct:identifier "https://icd.who.int/browse10/2019/en#/Y59.0"^^; + skos:definition "Viral vaccines"; + skos:hasTopConcept ; + skos:notation "Y59.0"; + skos:prefLabel "Viral vaccines" . + + + a dct:MediaTypeOrExtent . + + + a prov:Activity; + rdfs:label "http://dbpedia.org/resource/Record_linkage"; + rdfs:seeAlso ; + dct:type ; + prov:startedAtTime "2021-01-01T00:00:00Z"^^; + prov:wasAssociatedWith [ a prov:Agent; + prov:actedOnBehalfOf [ a prov:Organization , prov:Agent; + foaf:name "Contact Point"@en, "Contactpunt"@nl ; + ]; + foaf:homepage ; + foaf:mbox ; + foaf:name "Dr. Joris van Loenhout" + ]; + foaf:page . + + + a ; + + ; + + "Patient death reason\tInformation on wheter the cause of death was COVID-19."; + + "CD_COD_COVID" . + + + a skos:Concept; + dct:identifier "https://icd.who.int/browse10/2019/en#/U07.1"^^; + skos:definition "COVID-19, virus identified"; + skos:hasTopConcept ; + skos:notation "U07.1"; + skos:prefLabel "Test 1" . + +# +# a dct:LicenseDocument; +# rdfs:label "Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported" . + + a skos:Concept . 
\ No newline at end of file diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index f8d72b89..b21fd4d3 100644 --- a/examples/dcat/dataset_health.ttl +++ b/examples/dcat/dataset_health.ttl @@ -26,7 +26,7 @@ [ a foaf:Organization; foaf:homepage ; foaf:mbox ; - foaf:name "EU Health Data Access Body" + foaf:name "EU Health Data Access Body"@en ]; , , , ; @@ -72,7 +72,8 @@ dct:publisher [ a foaf:Organization , foaf:Agent; foaf:homepage ; foaf:mbox ; - foaf:name "Contact Point" + foaf:name "Contact Point"@en , + "Contactpunt"@nl ]; dct:relation ; dcat:qualifiedRelation [ @@ -152,7 +153,7 @@ a foaf:Agent; foaf:homepage ; foaf:mbox ; - foaf:name "Contact Point" . + foaf:name "Contact Point"@en , "Contactpunt"@nl . a adms:Identifier; @@ -231,7 +232,8 @@ prov:agent [ a foaf:Organization; foaf:homepage ; foaf:mbox ; - foaf:name "Contact Point" + foaf:name "Contact Point"@en , + "Contactpunt"@nl ] . @@ -256,11 +258,13 @@ prov:startedAtTime "2021-01-01T00:00:00Z"^^; prov:wasAssociatedWith [ a prov:Agent; prov:actedOnBehalfOf [ a prov:Organization , prov:Agent; - foaf:name "Contact Point" + foaf:name "Contact Point"@en , + "Contactpunt"@nl ]; foaf:homepage ; foaf:mbox ; - foaf:name "Dr. Joris van Loenhout" + foaf:name "Dr. Joris van Loenhout"@en , + "Dr. Joris van Loenhout"@nl ]; foaf:page . diff --git a/examples/dcat/dataset_health_multilingual.ttl b/examples/dcat/dataset_health_multilingual.ttl new file mode 100644 index 00000000..4315517b --- /dev/null +++ b/examples/dcat/dataset_health_multilingual.ttl @@ -0,0 +1,47 @@ +@prefix adms: . +@prefix dcat: . +@prefix dct: . +@prefix foaf: . +@prefix healthdcatap: . +@prefix rdfs: . +@prefix xsd: . 
+ + + a dcat:Dataset ; + dct:title "Health dataset"@en , "Gezondheidsdataset"@nl ; + dct:description "A dataset with multilingual metadata"@en , + "Een dataset met meertalige metadata"@nl ; + dcat:keyword "health"@en , "gezondheid"@nl ; + healthdcatap:populationCoverage + "Population coverage in English"@en , + "Populatiedekking in het Nederlands"@nl ; + healthdcatap:publisherNote + "Publisher note in English"@en , + "Notitie van de uitgever in het Nederlands"@nl ; + dct:identifier "http://example.test/dataset/1" ; + dct:issued "2024-01-01T00:00:00Z"^^xsd:dateTime ; + dct:modified "2024-06-01T00:00:00Z"^^xsd:dateTime ; + dct:publisher [ a foaf:Organization ; + foaf:name "Health Institute"@en , + "Gezondheidsinstituut"@nl ; + foaf:mbox ; + foaf:homepage + ] ; + dct:creator [ a foaf:Agent ; + foaf:name "Health Creator"@en , + "Gezondheidsmaker"@nl ; + foaf:mbox + ] ; + dcat:distribution . + + + a dcat:Distribution ; + dct:title "CSV extract"@en , "CSV-uitvoer"@nl ; + dct:description "Distribution description in English"@en , + "Beschrijving van de distributie in het Nederlands"@nl ; + dct:rights [ a dct:RightsStatement ; + rdfs:label "Rights statement"@en , + "Rechtenverklaring"@nl + ] ; + dcat:downloadURL ; + dct:format . 
From da05e2329fe45787d73ad4352e07cf7502cd7bb7 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 1 Oct 2025 17:10:18 +0200 Subject: [PATCH 5/5] fix UT --- .../schemas/health_dcat_ap_multilingual.yaml | 2 +- .../test_euro_health_dcat_ap_profile_parse.py | 58 +++++++++---------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml b/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml index a963f9a9..570e2e61 100644 --- a/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml @@ -3,7 +3,7 @@ dataset_type: dataset about: Schema for HealthDCAT-AP with Fluent multilingual fields about_url: http://github.com/ckan/ckanext-dcat -form_languages: [en, nl, fr] +form_languages: [en, nl] dataset_fields: diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 1d7fdc9a..949dccfd 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -189,40 +189,40 @@ def test_e2e_dcat_to_ckan(self): "end": "2034-12-31", } ] - - assert dataset["provenance_activity"] == [{ - "uri": "internalURI:wasGeneratedBy0", - "label": "http://dbpedia.org/resource/Record_linkage", - "seeAlso": "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp", - "dct_type": "http://dbpedia.org/resource/Record_linkage", - "startedAtTime": "2021-01-01T00:00:00+00:00", - "wasAssociatedWith": [{ - "name": "Dr. Joris van Loenhout", - "name_translated": { - "en": "Dr. Joris van Loenhout", - "nl": "Dr. 
Joris van Loenhout", - }, - "url": "https://www.sciensano.be/fr/people/joris-van-loenhout", - "email": "Joris.VanLoenhout@sciensano.be", - "type": "", - "uri": "", - "identifier": "", - "actedOnBehalfOf": [{ - "name": "Contact Point", - "name_translated": { - "en": "Contact Point", - "nl": "Contactpunt", - }, - }] - }] - }] + + provenance_activity = dataset["provenance_activity"] + assert len(provenance_activity) == 1 + + activity = provenance_activity[0] + assert activity["uri"] == "internalURI:wasGeneratedBy0" + assert activity["label"] == "http://dbpedia.org/resource/Record_linkage" + assert activity["seeAlso"] == ( + "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp" + ) + assert activity["dct_type"] == "http://dbpedia.org/resource/Record_linkage" + assert activity["startedAtTime"] == "2021-01-01T00:00:00+00:00" + + associated = activity["wasAssociatedWith"] + assert len(associated) == 1 + + agent = associated[0] + assert agent["name"] == "Dr. Joris van Loenhout" + if agent.get("name_translated"): + assert agent["name_translated"].get("en") == "Dr. 
Joris van Loenhout" + assert agent["url"] == "https://www.sciensano.be/fr/people/joris-van-loenhout" + assert agent["email"] == "Joris.VanLoenhout@sciensano.be" + + acted_on_behalf = agent.get("actedOnBehalfOf", []) + assert len(acted_on_behalf) == 1 + acted_agent = acted_on_behalf[0] + assert acted_agent["name"] == "Contact Point" + if acted_agent.get("name_translated"): + assert acted_agent["name_translated"].get("en") == "Contact Point" assert dataset["qualified_attribution"][0]["role"] == "https://inspire.ec.europa.eu/metadata-codelist/ResponsiblePartyRole/processor" agent = dataset["qualified_attribution"][0]["agent"][0] assert agent["name"] == "Contact Point" - assert agent["name_translated"]["en"] == "Contact Point" - assert agent["name_translated"]["nl"] == "Contactpunt" assert agent["email"] == "healthdata@sciensano.be" assert agent["url"] == "https://healthdata.be" assert agent["type"] == ""