Skip to content
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
e79099b
feat(rdf): support serialization of DatasetSeries from RDF and link c…
Jun 5, 2025
22face7
remove fallback. not support by dataseries extension
Jun 19, 2025
84ef929
Remove enter
Jun 19, 2025
1f5110b
fix: skip fluent for 2.9
Jul 1, 2025
2d70ee8
fix import
Jul 1, 2025
b061455
check if this works
Jul 1, 2025
c9e4626
Disable support for 2.9 in tests
Jul 1, 2025
01991b4
check if this works
Jul 1, 2025
ed6b696
Merge pull request #1 from GenomicDataInfrastructure/support-RDF-data…
hcvdwerf Jul 1, 2025
806e248
feat(missing field) add missing fields
Jul 2, 2025
b1c8193
Add homepage
Jul 7, 2025
a6e1e4b
Also serialize homepage when available
Jul 7, 2025
217da9a
Added retention period to healthDCAT
Jul 8, 2025
cd8661b
Fix retention period UT
Jul 10, 2025
90dac79
fix test
Jul 10, 2025
1294f4a
feat(missing field) add missing fields
Jul 2, 2025
501a8de
Added DCAT AP 3 has version
Jul 10, 2025
c0efdfc
Merge branch 'add-missing-fields' of https://github.com/hcvdwerf/ckan…
Jul 10, 2025
c4ad649
Added has version to DCAT 3 and added missing dataservice fields
Jul 10, 2025
163d284
fix import
Jul 14, 2025
d8c04df
fix unit tests
Jul 14, 2025
63a2749
Added has version to DCAT 3 and added missing dataservice fields
Jul 10, 2025
fdbd8bc
update schema
Jul 14, 2025
572acbe
Merge branch 'add-missing-fields' of https://github.com/hcvdwerf/ckan…
Jul 14, 2025
1985f6e
fix mapping documentation
Jul 14, 2025
3cc8905
Updated documentation for retention period
Jul 14, 2025
c03bdd2
fix(dataseries) cardinality for dataseries
Jul 15, 2025
cccd727
fix(UT-cardinality) fix UT for cardinality
Jul 16, 2025
6d23062
add applicable_legislation to Dataservice
Jul 16, 2025
fbbd48e
fix(dataservice (contact & creator)) fix mapping for creator and cont…
Jul 16, 2025
4d1e3a0
add mapping + UT for description within dataservice
Jul 16, 2025
2abc07c
Add if check by contactpoint
Jul 16, 2025
7759e07
Add modified, publisher, license and theme to dataservice
Jul 16, 2025
5762c35
fix(dataseries) Remove dataseries from pull request
Aug 27, 2025
5cf6942
Remove fluent extension tag
Aug 27, 2025
6eec310
Merge branch 'master' into add-missing-fields
hcvdwerf Aug 27, 2025
4485715
Update health_dcat_ap.yaml
hcvdwerf Sep 3, 2025
ba081d7
fix: Always store as list when complex object
Sep 11, 2025
419c364
fix: parse of creator and contact within access service
Sep 11, 2025
03a5f88
fix(croissant) point to mlcroissant version 1.0.22
Sep 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ jobs:
pip install -r ckanext-harvest/requirements.txt
git clone https://github.com/ckan/ckanext-scheming
pip install -e ckanext-scheming
git clone https://github.com/ckan/ckanext-fluent
pip install -e ckanext-fluent
pip install git+https://github.com/ckan/ckanext-fluent.git@4e9340a#egg=ckanext-fluent

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've removed support for CKAN 2.9 in be3c8d6, so this should no longer be necessary

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I revert the test.yml

git clone https://github.com/ckan/ckanext-dataset-series
pip install -e ckanext-dataset-series
- name: Setup extension
Expand Down
112 changes: 79 additions & 33 deletions ckanext/dcat/harvesters/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,39 +210,18 @@ def gather_stage(self, harvest_job):
return []

try:

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hcvdwerf Can you remove the changes from #350 from this PR?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes done

source_dataset = model.Package.get(harvest_job.source.id)

for dataset in parser.datasets():
if not dataset.get('name'):
dataset['name'] = self._gen_new_name(dataset['title'])
if dataset['name'] in self._names_taken:
suffix = len([i for i in self._names_taken if i.startswith(dataset['name'] + '-')]) + 1
dataset['name'] = '{}-{}'.format(dataset['name'], suffix)
self._names_taken.append(dataset['name'])

# Unless already set by the parser, get the owner organization (if any)
# from the harvest source dataset
if not dataset.get('owner_org'):
if source_dataset.owner_org:
dataset['owner_org'] = source_dataset.owner_org

# Try to get a unique identifier for the harvested dataset
guid = self._get_guid(dataset, source_url=source_dataset.url)

if not guid:
self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset),
harvest_job)
continue

dataset['extras'].append({'key': 'guid', 'value': guid})
guids_in_source.append(guid)

obj = HarvestObject(guid=guid, job=harvest_job,
content=json.dumps(dataset))

obj.save()
object_ids.append(obj.id)
source_dataset = model.Package.get(harvest_job.source.id)

series_ids, series_mapping = self._parse_and_collect(
parser.dataset_series(),
source_dataset,
harvest_job,
guids_in_source,
is_series=True,
collect_series_mapping=True
)
object_ids += series_ids
object_ids += self._parse_and_collect(parser.datasets(series_mapping), source_dataset, harvest_job, guids_in_source, is_series=False)
except Exception as e:
self._save_gather_error('Error when processsing dataset: %r / %s' % (e, traceback.format_exc()),
harvest_job)
Expand Down Expand Up @@ -422,3 +401,70 @@ def import_stage(self, harvest_object):
model.Session.commit()

return True

def _parse_and_collect(
self,
items,
source_dataset,
harvest_job,
guids_in_source,
is_series=False,
collect_series_mapping=False
):
object_ids = []
label = "dataset series" if is_series else "dataset"
series_mapping = {} if collect_series_mapping else None

for item in items:
original_title = item.get("title", label)
if not item.get("name"):
item["name"] = self._gen_new_name(original_title)

if item["name"] in self._names_taken:
suffix = len([i for i in self._names_taken if i.startswith(item["name"] + "-")]) + 1
item["name"] = f"{item['name']}-{suffix}"

self._names_taken.append(item["name"])

if not item.get("owner_org") and source_dataset.owner_org:
item["owner_org"] = source_dataset.owner_org

guid = self._get_guid(item, source_url=source_dataset.url)
if not guid:
self._save_gather_error(f"Could not get a unique identifier for {label}: {item}", harvest_job)
continue

item.setdefault("extras", []).append({"key": "guid", "value": guid})
guids_in_source.append(guid)

obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(item))
obj.save()
object_ids.append(obj.id)

# Store mapping of RDF URI to dataset name if requested
if collect_series_mapping:
series_uri = item.get("uri") or item.get("identifier")
if series_uri:
# Try to find an existing active dataset series by 'guid' match
existing = model.Session.query(model.Package).\
join(model.PackageExtra).\
filter(model.PackageExtra.key == 'guid').\
filter(model.PackageExtra.value == series_uri).\
filter(model.Package.type == 'dataset_series').\
filter(model.Package.state == 'active').\
first()

if existing:
item["name"] = existing.name

series_mapping[str(series_uri)] = {
"id": existing.id if existing else item.get("id"),
"name": item["name"]
}


if collect_series_mapping:
return object_ids, series_mapping

return object_ids

45 changes: 44 additions & 1 deletion ckanext/dcat/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,16 @@ def _datasets(self):
for dataset in self.g.subjects(RDF.type, DCAT.Dataset):
yield dataset

def _dataset_series(self):
'''
Generator that returns all DCAT dataset series on the graph

Yields rdflib.term.URIRef objects that can be used on graph lookups
and queries
'''
for dataset_series in self.g.subjects(RDF.type, DCAT.DatasetSeries):
yield dataset_series

def next_page(self):
'''
Returns the URL of the next page or None if there is no next page
Expand Down Expand Up @@ -173,7 +183,7 @@ def supported_formats(self):
for plugin
in rdflib.plugin.plugins(kind=rdflib.parser.Parser)])

def datasets(self):
def datasets(self, series_mapping=None):
'''
Generator that returns CKAN datasets parsed from the RDF graph

Expand All @@ -193,6 +203,39 @@ def datasets(self):
)
profile.parse_dataset(dataset_dict, dataset_ref)

# Add in_series if present in RDF and mapped
in_series = []
for series_ref in self.g.objects(dataset_ref, DCAT.inSeries):
key = str(series_ref)
if series_mapping and key in series_mapping:
in_series.append(series_mapping[key]["id"])

if in_series:
dataset_dict["in_series"] = in_series

yield dataset_dict


def dataset_series(self):
    '''
    Generator that returns CKAN dataset series parsed from the RDF graph

    For each reference produced by `_dataset_series()` an empty dict is
    handed, in order, to a fresh instance of every loaded profile via
    `parse_dataset`, so each profile can further modify it before it is
    yielded.

    Yields a dataset series dict that can be passed to eg `package_create`
    or `package_update`
    '''
    for series_ref in self._dataset_series():
        series_dict = {}
        # Lazy on purpose: each profile is instantiated right before it
        # parses, exactly one at a time.
        profiles = (
            profile_class(
                self.g,
                dataset_type=self.dataset_type,
                compatibility_mode=self.compatibility_mode
            )
            for profile_class in self._profiles
        )
        for profile in profiles:
            profile.parse_dataset(series_dict, series_ref)

        yield series_dict


Expand Down
88 changes: 84 additions & 4 deletions ckanext/dcat/profiles/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ckantoolkit import ObjectNotFound, asbool, aslist, config, get_action, url_for
from dateutil.parser import parse as parse_date
from geomet import InvalidGeoJSONException, wkt
from rdflib import BNode, Literal, URIRef, term
from rdflib import BNode, Literal, URIRef, term, PROV
from rdflib.namespace import ORG, RDF, RDFS, SKOS, XSD, Namespace

from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS
Expand Down Expand Up @@ -95,7 +95,6 @@ def __new__(cls, value, lang=None):
# In case something goes wrong: use Literal
return Literal(value, lang=lang)


class CleanedURIRef(object):
"""Performs some basic URL encoding on value before creating an URIRef object.

Expand Down Expand Up @@ -547,9 +546,13 @@ def _agents_details(self, subject, predicate):
)
agent_details["url"] = self._object_value(agent, FOAF.homepage)
agent_details["type"] = self._object_value(agent, DCT.type)
agent_details['identifier'] = self._object_value(agent, DCT.identifier)
agents.append(agent_details)
agent_details["identifier"] = self._object_value(agent, DCT.identifier)

acted_orgs = self._agents_details(agent, PROV.actedOnBehalfOf)
if acted_orgs:
agent_details["actedOnBehalfOf"] = acted_orgs

agents.append(agent_details)
return agents

def _contact_details(self, subject, predicate):
Expand Down Expand Up @@ -819,6 +822,83 @@ def _read_list_value(self, value):

return items

def _add_agent_to_graph(self, subject_ref, predicate, agent_dict):
    """
    Serializes an agent dict into the RDF graph and links it to a subject.

    The agent node is typed as both foaf:Organization and foaf:Agent.
    It is identified by the (stripped) "uri" value when one is given,
    otherwise by a blank node.

    Parameters:
    - subject_ref: The RDF subject (dataset, activity, etc.)
    - predicate: The RDF predicate (e.g., dct:publisher, prov:wasAssociatedWith, dcat:agent)
    - agent_dict: A dict with agent metadata. Keys read here: uri, name,
      email, url, homepage, type, identifier, actedOnBehalfOf. Missing
      keys, empty values and None values are all skipped.

    Returns the node (URI reference or blank node) created for the agent.
    """
    # `.get("uri", "")` only covers a missing key; an explicit None value
    # would crash on `.strip()`, so normalise falsy values first.
    uri = (agent_dict.get("uri") or "").strip()

    agent_ref = URIRefOrLiteral(uri) if uri else BNode()

    self.g.add((subject_ref, predicate, agent_ref))
    self.g.add((agent_ref, RDF.type, FOAF.Organization))
    self.g.add((agent_ref, RDF.type, FOAF.Agent))

    if agent_dict.get("name"):
        self.g.add((agent_ref, FOAF.name, Literal(agent_dict["name"])))
    if agent_dict.get("email"):
        email = agent_dict["email"]
        # foaf:mbox is emitted as a mailto: URI
        if not email.startswith("mailto:"):
            email = f"mailto:{email}"
        self.g.add((agent_ref, FOAF.mbox, URIRef(email)))
    # Both "url" and "homepage" are serialized as foaf:homepage
    if agent_dict.get("url"):
        self.g.add((agent_ref, FOAF.homepage, URIRef(agent_dict["url"])))
    if agent_dict.get("homepage"):
        self.g.add((agent_ref, FOAF.homepage, URIRef(agent_dict["homepage"])))
    if agent_dict.get("type"):
        self.g.add((agent_ref, DCT.type, URIRef(agent_dict["type"])))
    if agent_dict.get("identifier"):
        self.g.add((agent_ref, DCT.identifier, Literal(agent_dict["identifier"])))

    # An explicit None for "actedOnBehalfOf" is treated like an empty list.
    # Only the name of each organization is serialized, on a blank node.
    for sub_org in agent_dict.get("actedOnBehalfOf") or []:
        if sub_org.get("name"):
            org_ref = BNode()
            self.g.add((agent_ref, PROV.actedOnBehalfOf, org_ref))
            self.g.add((org_ref, RDF.type, PROV.Organization))
            self.g.add((org_ref, FOAF.name, Literal(sub_org["name"])))

    return agent_ref

def _add_contact_to_graph(self, subject, predicate, contact):
    """
    Adds a vcard:Kind node for ``contact`` to the graph and links it to
    ``subject`` through ``predicate``.

    The node is a CleanedURIRef built from the contact's "uri" when that
    value is truthy, otherwise a blank node. The "name", "email",
    "identifier" and "url" keys are handed to ``_add_triple_from_dict``
    with the vcard:fn, vcard:hasEmail, vcard:hasUID and vcard:hasURL
    predicates respectively; emails go through ``self._add_mailto``.
    """
    uri = contact.get("uri")
    contact_ref = CleanedURIRef(uri) if uri else BNode()

    for triple in (
        (contact_ref, RDF.type, VCARD.Kind),
        (subject, predicate, contact_ref),
    ):
        self.g.add(triple)

    # (predicate, key in the contact dict, extra keyword arguments)
    field_specs = (
        (VCARD.fn, "name", {}),
        (
            VCARD.hasEmail,
            "email",
            {"_type": URIRef, "value_modifier": self._add_mailto},
        ),
        (VCARD.hasUID, "identifier", {"_type": URIRefOrLiteral}),
        (VCARD.hasURL, "url", {"_type": URIRef}),
    )
    for vcard_predicate, key, options in field_specs:
        self._add_triple_from_dict(
            contact, contact_ref, vcard_predicate, key, **options
        )


def _add_spatial_value_to_graph(self, spatial_ref, predicate, value):
"""
Adds spatial triples to the graph. Assumes that value is a GeoJSON string
Expand Down
Loading