Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions ckanext/dcat/profiles/euro_dcat_ap_3.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,16 @@ def parse_dataset(self, dataset_dict, dataset_ref):
# DCAT AP v2 scheming fields
dataset_dict = self._parse_dataset_v2_scheming(dataset_dict, dataset_ref)

sample_uris = []
for sample in self.g.objects(dataset_ref, ADMS.sample):
if (sample, RDF.type, DCAT.Distribution) in self.g:
resource_dict = self._parse_distribution(sample)
dataset_dict["resources"].append(resource_dict)
sample_uris.append(str(sample))

if sample_uris:
dataset_dict["sample"] = sample_uris

# DCAT AP v3: hasVersion
values = self._object_value_list(dataset_ref, DCAT.hasVersion)
if values:
Expand Down Expand Up @@ -63,6 +73,11 @@ def graph_from_catalog(self, catalog_dict, catalog_ref):

def _graph_from_dataset_v3(self, dataset_dict, dataset_ref):

items = [
("sample", ADMS.sample, None, URIRefOrLiteral),
]
self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

dataset_series = False

# TODO: support custom type names (ckan/ckanext-dataset-series#6)
Expand Down
195 changes: 102 additions & 93 deletions ckanext/dcat/profiles/euro_dcat_ap_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):
("has_version", DCT.hasVersion),
("is_version_of", DCT.isVersionOf),
("source", DCT.source),
("sample", ADMS.sample),
):
values = self._object_value_list(dataset_ref, predicate)
if values:
Expand Down Expand Up @@ -218,99 +217,14 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):
)

# Resources
distribution_uris = []
for distribution in self._distributions(dataset_ref):

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know the method is long but I'd like to keep this in the _parse_dataset_base() method for now to not break other profiles

resource_dict = {}

multilingual_fields = self._multilingual_resource_fields()

# Simple values
for key, predicate in (
("access_url", DCAT.accessURL),
("download_url", DCAT.downloadURL),
("issued", DCT.issued),
("modified", DCT.modified),
("status", ADMS.status),
("license", DCT.license),
("rights", DCT.rights),
):
multilingual = key in multilingual_fields
value = self._object_value(
distribution, predicate, multilingual=multilingual
)
if value:
resource_dict[key] = value

# Multilingual core fields
for key, predicate in (
("name", DCT.title),
("description", DCT.description)
):
if f"{key}_translated" in multilingual_fields:
value = self._object_value(
distribution, predicate, multilingual=True
)
resource_dict[f"{key}_translated"] = value
resource_dict[f"{key}"] = value.get(self._default_lang)
else:
value = self._object_value(distribution, predicate)
if value:
resource_dict[key] = value

# URL

resource_dict["url"] = self._object_value(
distribution, DCAT.downloadURL
) or self._object_value(distribution, DCAT.accessURL)

# Lists
for key, predicate in (
("language", DCT.language),
("documentation", FOAF.page),
("conforms_to", DCT.conformsTo),
):
values = self._object_value_list(distribution, predicate)
if values:
resource_dict[key] = json.dumps(values)

# Format and media type
normalize_ckan_format = toolkit.asbool(
config.get("ckanext.dcat.normalize_ckan_format", True)
)
imt, label = self._distribution_format(distribution, normalize_ckan_format)

if imt:
resource_dict["mimetype"] = imt

if label:
resource_dict["format"] = label
elif imt:
resource_dict["format"] = imt

# Size
size = self._object_value_int(distribution, DCAT.byteSize)
if size is not None:
resource_dict["size"] = size

# Checksum
for checksum in self.g.objects(distribution, SPDX.checksum):
algorithm = self._object_value(checksum, SPDX.algorithm)
checksum_value = self._object_value(checksum, SPDX.checksumValue)
if algorithm:
resource_dict["hash_algorithm"] = algorithm
if checksum_value:
resource_dict["hash"] = checksum_value

# Distribution URI (explicitly show the missing ones)
resource_dict["uri"] = (
str(distribution) if isinstance(distribution, term.URIRef) else ""
)

# Remember the (internal) distribution reference for referencing in
# further profiles, e.g. for adding more properties
resource_dict["distribution_ref"] = str(distribution)

resource_dict = self._parse_distribution(distribution)
dataset_dict["resources"].append(resource_dict)
distribution_uris.append(str(distribution))

if distribution_uris:
dataset_dict["distribution"] = distribution_uris
Comment on lines +224 to +227

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm confused about this. Why do we need it?


if self.compatibility_mode:
# Tweak the resulting dict to make it compatible with previous
Expand All @@ -329,6 +243,98 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):

return dataset_dict

def _parse_distribution(self, distribution):
    """
    Build a CKAN resource dict from a single distribution node of the graph.

    `distribution` is the graph node (it is used as the subject of every
    lookup below). The returned dict always carries the `url`, `uri` and
    `distribution_ref` keys; every other key is only set when the graph
    yields a value for it.
    """
    translated_fields = self._multilingual_resource_fields()
    resource = {}

    # Single-valued properties, stored under the CKAN field name
    simple_properties = (
        ("access_url", DCAT.accessURL),
        ("download_url", DCAT.downloadURL),
        ("issued", DCT.issued),
        ("modified", DCT.modified),
        ("status", ADMS.status),
        ("license", DCT.license),
        ("rights", DCT.rights),
    )
    for field_name, rdf_property in simple_properties:
        found = self._object_value(
            distribution,
            rdf_property,
            multilingual=field_name in translated_fields,
        )
        if found:
            resource[field_name] = found

    # Core fields that may have a `*_translated` counterpart
    core_properties = (
        ("name", DCT.title),
        ("description", DCT.description),
    )
    for field_name, rdf_property in core_properties:
        translated_name = f"{field_name}_translated"
        if translated_name not in translated_fields:
            # Plain field: only store it when there is something to store
            found = self._object_value(distribution, rdf_property)
            if found:
                resource[field_name] = found
            continue

        # Multilingual field: keep the full set of translations and expose
        # the default-language one under the plain field name
        translations = self._object_value(
            distribution, rdf_property, multilingual=True
        )
        resource[translated_name] = translations
        resource[field_name] = translations.get(self._default_lang)

    # URL: the download URL wins, the access URL is the fallback
    download_url = self._object_value(distribution, DCAT.downloadURL)
    resource["url"] = download_url or self._object_value(
        distribution, DCAT.accessURL
    )

    # Multi-valued properties are stored as JSON-encoded lists
    list_properties = (
        ("language", DCT.language),
        ("documentation", FOAF.page),
        ("conforms_to", DCT.conformsTo),
    )
    for field_name, rdf_property in list_properties:
        found_values = self._object_value_list(distribution, rdf_property)
        if found_values:
            resource[field_name] = json.dumps(found_values)

    # Format and media type
    should_normalize = toolkit.asbool(
        config.get("ckanext.dcat.normalize_ckan_format", True)
    )
    media_type, format_label = self._distribution_format(
        distribution, should_normalize
    )
    if media_type:
        resource["mimetype"] = media_type
    # Prefer the label, fall back to the media type when there is no label
    resource_format = format_label or media_type
    if resource_format:
        resource["format"] = resource_format

    # Size (an explicit None check, so a size of 0 is still stored)
    byte_size = self._object_value_int(distribution, DCAT.byteSize)
    if byte_size is not None:
        resource["size"] = byte_size

    # Checksum: if several checksum nodes exist, later ones overwrite
    # the values set by earlier ones
    for checksum_node in self.g.objects(distribution, SPDX.checksum):
        hash_algorithm = self._object_value(checksum_node, SPDX.algorithm)
        hash_value = self._object_value(checksum_node, SPDX.checksumValue)
        if hash_algorithm:
            resource["hash_algorithm"] = hash_algorithm
        if hash_value:
            resource["hash"] = hash_value

    # Distribution URI (explicitly show the missing ones)
    distribution_id = str(distribution)
    if isinstance(distribution, term.URIRef):
        resource["uri"] = distribution_id
    else:
        resource["uri"] = ""

    # Keep the (internal) distribution reference so that further profiles
    # can look the node up again, e.g. for adding more properties
    resource["distribution_ref"] = distribution_id

    return resource

def _graph_from_dataset_base(self, dataset_dict, dataset_ref):

g = self.g
Expand Down Expand Up @@ -387,7 +393,6 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref):
("has_version", DCT.hasVersion, None, URIRefOrLiteral),
("is_version_of", DCT.isVersionOf, None, URIRefOrLiteral),
("source", DCT.source, None, URIRefOrLiteral),
("sample", ADMS.sample, None, URIRefOrLiteral, DCAT.Distribution),
]
self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

Expand Down Expand Up @@ -751,6 +756,10 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref):

g.add((distribution, SPDX.checksum, checksum))

for dist_uri in dataset_dict.get("distribution", []):
if dist_uri:
g.add((dataset_ref, DCAT.distribution, URIRef(dist_uri)))

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this done? IIUC, dataset["distribution"] is an internal property used during parsing that should not be part of the output dataset_dict, so it shouldn't be available when serializing (or at least we shouldn't rely on it being present).
Besides, the reference between the dataset and each distribution is already added on line 608.

def _graph_from_catalog_base(self, catalog_dict, catalog_ref):

g = self.g
Expand Down
Loading
Loading