From 8e6b39c6c0e81d305e49601fdd48cd2b920da793 Mon Sep 17 00:00:00 2001 From: seitenbau-govdata Date: Fri, 14 Dec 2018 14:29:15 +0100 Subject: [PATCH 1/4] Introduce option to prefer default language for Literal values --- ckanext/dcat/profiles.py | 15 ++++++++-- ckanext/dcat/tests/test_base_profile.py | 40 +++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index f3934c4c..fe9cc306 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -156,7 +156,7 @@ def _object(self, subject, predicate): return _object return None - def _object_value(self, subject, predicate): + def _object_value(self, subject, predicate, use_default_lang=False): ''' Given a subject and a predicate, returns the value of the object @@ -164,9 +164,18 @@ def _object_value(self, subject, predicate): If found, the unicode representation is returned, else an empty string ''' + default_lang = config.get('ckan.locale_default', 'en') + fallback = '' for o in self.g.objects(subject, predicate): - return unicode(o) - return '' + if use_default_lang and isinstance(o, Literal): + if o.language and o.language == default_lang: + return unicode(o) + # Use first object as fallback if no object with the default language is available + elif fallback == '': + fallback = unicode(o) + else: + return unicode(o) + return fallback def _object_value_int(self, subject, predicate): ''' diff --git a/ckanext/dcat/tests/test_base_profile.py b/ckanext/dcat/tests/test_base_profile.py index 3c1cb48e..337d4a99 100644 --- a/ckanext/dcat/tests/test_base_profile.py +++ b/ckanext/dcat/tests/test_base_profile.py @@ -3,6 +3,8 @@ from rdflib import Graph, URIRef, Literal from rdflib.namespace import Namespace +from ckantoolkit.tests import helpers + from ckanext.dcat.profiles import RDFProfile, CleanedURIRef from ckanext.dcat.tests.test_base_parser import _default_graph @@ -113,6 +115,44 @@ def test_object_value_not_found(self): eq_(value, '') + @helpers.change_config('ckan.locale_default', 'de') + def test_object_value_default_lang(self): + p = RDFProfile(_default_graph()) + + p.g.add((URIRef('http://example.org/datasets/1'), + DCT.title, Literal('Test Datensatz 1', lang='de'))) + p.g.add((URIRef('http://example.org/datasets/1'), + DCT.title, Literal('Test Dataset 1 (EN)', lang='en'))) + + value = p._object_value(URIRef('http://example.org/datasets/1'), + DCT.title, use_default_lang=True) + + assert isinstance(value, unicode) + eq_(value, 'Test Datensatz 1') + + def test_object_value_default_lang_fallback(self): + p = RDFProfile(_default_graph()) + + p.g.add((URIRef('http://example.org/datasets/1'), + DCT.title, Literal('Test Datensatz 1', lang='de'))) + p.g.add((URIRef('http://example.org/datasets/1'), + DCT.title, Literal('Test Dataset 1 (EN)', lang='en'))) + + value = p._object_value(URIRef('http://example.org/datasets/1'), + DCT.title, use_default_lang=True) + + assert isinstance(value, unicode) + eq_(value, 'Test Dataset 1 (EN)') + + def test_object_value_default_lang_missing_lang_param(self): + p = RDFProfile(_default_graph()) + + value = p._object_value(URIRef('http://example.org/datasets/1'), + DCT.title, use_default_lang=True) + + assert isinstance(value, unicode) + eq_(value, 'Test Dataset 1') + def test_object_int(self): p = RDFProfile(_default_graph()) From e9e1468216bd295f5ef5fab1280087a9fbfd131f Mon Sep 17 00:00:00 2001 From: seitenbau-govdata Date: Fri, 14 Dec 2018 14:34:16 +0100 Subject: [PATCH 2/4] Prefer default language for some Literal nodes * dct:title (dataset and distribution) * dct:description (dataset and distribution) * foaf:name in dct:publisher * vcard:fn in dcat:contactPoint --- ckanext/dcat/profiles.py | 26 ++++-- .../tests/test_euro_dcatap_profile_parse.py | 82 ++++++++++++++++++- 2 files changed, 101 insertions(+), 7 deletions(-) diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index fe9cc306..e8bc8b68 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -287,7 +287,7 @@ def _publisher(self, subject, predicate): publisher['uri'] = (unicode(agent) if isinstance(agent, rdflib.term.URIRef) else '') - publisher['name'] = self._object_value(agent, FOAF.name) + publisher['name'] = self._object_value(agent, FOAF.name, use_default_lang=True) publisher['email'] = self._object_value(agent, FOAF.mbox) @@ -314,7 +314,7 @@ def _contact_details(self, subject, predicate): contact['uri'] = (unicode(agent) if isinstance(agent, rdflib.term.URIRef) else '') - contact['name'] = self._object_value(agent, VCARD.fn) + contact['name'] = self._object_value(agent, VCARD.fn, use_default_lang=True) contact['email'] = self._without_mailto( self._object_value(agent, VCARD.hasEmail) @@ -778,8 +778,6 @@ def parse_dataset(self, dataset_dict, dataset_ref): # Basic fields for key, predicate in ( - ('title', DCT.title), - ('notes', DCT.description), ('url', DCAT.landingPage), ('version', OWL.versionInfo), ): @@ -787,6 +785,15 @@ def parse_dataset(self, dataset_dict, dataset_ref): if value: dataset_dict[key] = value + # Multilingual basic fields + for key, predicate in ( + ('title', DCT.title), + ('notes', DCT.description), + ): + value = self._object_value(dataset_ref, predicate, use_default_lang=True) + if value: + dataset_dict[key] = value + if not dataset_dict.get('version'): # adms:version was supported on the first version of the DCAT-AP value = self._object_value(dataset_ref, ADMS.version) @@ -904,8 +911,6 @@ def parse_dataset(self, dataset_dict, dataset_ref): # Simple values for key, predicate in ( - ('name', DCT.title), - ('description', DCT.description), ('access_url', DCAT.accessURL), ('download_url', DCAT.downloadURL), ('issued', DCT.issued), @@ -922,6 +927,15 @@ def parse_dataset(self, dataset_dict, dataset_ref): DCAT.downloadURL) or self._object_value(distribution, DCAT.accessURL)) + # Multilingual simple values + for key, predicate in ( + ('name', DCT.title), + ('description', DCT.description), + ): + value = self._object_value(distribution, predicate, use_default_lang=True) + if value: + resource_dict[key] = value + # Lists for key, predicate in ( ('language', DCT.language), diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py b/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py index 8c018ac8..af9f1afe 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py +++ b/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py @@ -14,7 +14,7 @@ from ckanext.dcat.processors import RDFParser, RDFSerializer from ckanext.dcat.profiles import (DCAT, DCT, ADMS, LOCN, SKOS, GSP, RDFS, - GEOJSON_IMT) + GEOJSON_IMT, FOAF, VCARD) from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS eq_ = nose.tools.eq_ @@ -352,6 +352,86 @@ def test_distribution_format_imt_only(self): else: eq_(resource['format'], u'text/csv') + @staticmethod + def _prepare_default_lang_graph(): + def _add_node_de_en(g, item_ref, predicate, literal_base_value): + g.add((item_ref, predicate, Literal(literal_base_value + '(DE)', lang='de'))) + g.add((item_ref, predicate, Literal(literal_base_value + '(EN)', lang='en'))) + + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + _add_node_de_en(g, dataset1, DCT.title, 'Test dataset') + _add_node_de_en(g, dataset1, DCT.description, 'some description') + + publisher_node = BNode() + g.add((publisher_node, RDF.type, FOAF.Organization)) + _add_node_de_en(g, publisher_node, FOAF.name, 'a publisher') + g.add((dataset1, DCT.publisher, publisher_node)) + + contact_node = BNode() + g.add((contact_node, RDF.type, VCARD.Organization)) + _add_node_de_en(g, contact_node, VCARD.fn, 'a contact') + g.add((dataset1, DCAT.contactPoint, contact_node)) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + _add_node_de_en(g, distribution1_1, DCT.title, 'Test resource') + _add_node_de_en(g, distribution1_1, DCT.description, 'some res description') + + g.add((dataset1, DCAT.distribution, distribution1_1)) + + return g + + def _assert_lang_graph(self, dataset, expected_lang): + extras = self._extras(dataset) + resource = dataset['resources'][0] + eq_(dataset['title'], u'Test dataset' + expected_lang) + eq_(dataset['notes'], u'some description' + expected_lang) + eq_(extras['publisher_name'], u'a publisher' + expected_lang) + eq_(extras['contact_name'], u'a contact' + expected_lang) + eq_(resource['name'], u'Test resource' + expected_lang) + eq_(resource['description'], u'some res description' + expected_lang) + + @helpers.change_config('ckan.locale_default', 'en') + def test_default_lang_en(self): + g = self._prepare_default_lang_graph() + + p = RDFParser(profiles=['euro_dcat_ap']) + p.g = g + + dataset = [d for d in p.datasets()][0] + self._assert_lang_graph(dataset, '(EN)') + + @helpers.change_config('ckan.locale_default', 'de') + def test_default_lang_de(self): + g = self._prepare_default_lang_graph() + + p = RDFParser(profiles=['euro_dcat_ap']) + p.g = g + + dataset = [d for d in p.datasets()][0] + self._assert_lang_graph(dataset, '(DE)') + + @helpers.change_config('ckan.locale_default', 'fr') + def test_default_lang_not_in_graph(self): + g = self._prepare_default_lang_graph() + + p = RDFParser(profiles=['euro_dcat_ap']) + p.g = g + + dataset = [d for d in p.datasets()][0] + extras = self._extras(dataset) + resource = dataset['resources'][0] + + # default lang is not present in graph, so only check for correct base values + assert_true(u'Test dataset' in dataset['title']) + assert_true(u'some description' in dataset['notes']) + assert_true(u'a publisher' in extras['publisher_name']) + assert_true(u'a contact' in extras['contact_name']) + assert_true(u'Test resource' in resource['name']) + assert_true(u'some res description' in resource['description']) + @helpers.change_config('ckanext.dcat.normalize_ckan_format', False) def test_distribution_format_imt_only_normalize_false(self): g = Graph() From 2f1f7dc4c452ad56b56bbca27bbafa7404b25eff Mon Sep 17 00:00:00 2001 From: seitenbau-govdata Date: Wed, 19 Dec 2018 13:59:59 +0100 Subject: [PATCH 3/4] Remove use_default_lang parameter from _object_value --- ckanext/dcat/profiles.py | 30 +++++++------------------ ckanext/dcat/tests/test_base_profile.py | 6 ++--- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index e8bc8b68..b91e4dba 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -156,7 +156,7 @@ def _object(self, subject, predicate): return _object return None - def _object_value(self, subject, predicate, use_default_lang=False): + def _object_value(self, subject, predicate): ''' Given a subject and a predicate, returns the value of the object @@ -167,7 +167,7 @@ def _object_value(self, subject, predicate, use_default_lang=False): default_lang = config.get('ckan.locale_default', 'en') fallback = '' for o in self.g.objects(subject, predicate): - if use_default_lang and isinstance(o, Literal): + if isinstance(o, Literal): if o.language and o.language == default_lang: return unicode(o) # Use first object as fallback if no object with the default language is available @@ -287,7 +287,7 @@ def _publisher(self, subject, predicate): publisher['uri'] = (unicode(agent) if isinstance(agent, rdflib.term.URIRef) else '') - publisher['name'] = self._object_value(agent, FOAF.name, use_default_lang=True) + publisher['name'] = self._object_value(agent, FOAF.name) publisher['email'] = self._object_value(agent, FOAF.mbox) @@ -314,7 +314,7 @@ def _contact_details(self, subject, predicate): contact['uri'] = (unicode(agent) if isinstance(agent, rdflib.term.URIRef) else '') - contact['name'] = self._object_value(agent, VCARD.fn, use_default_lang=True) + contact['name'] = self._object_value(agent, VCARD.fn) contact['email'] = self._without_mailto( self._object_value(agent, VCARD.hasEmail) @@ -778,6 +778,8 @@ def parse_dataset(self, dataset_dict, dataset_ref): # Basic fields for key, predicate in ( + ('title', DCT.title), + ('notes', DCT.description), ('url', DCAT.landingPage), ('version', OWL.versionInfo), ): @@ -785,15 +787,6 @@ def parse_dataset(self, dataset_dict, dataset_ref): if value: dataset_dict[key] = value - # Multilingual basic fields - for key, predicate in ( - ('title', DCT.title), - ('notes', DCT.description), - ): - value = self._object_value(dataset_ref, predicate, use_default_lang=True) - if value: - dataset_dict[key] = value - if not dataset_dict.get('version'): # adms:version was supported on the first version of the DCAT-AP value = self._object_value(dataset_ref, ADMS.version) @@ -911,6 +904,8 @@ def parse_dataset(self, dataset_dict, dataset_ref): # Simple values for key, predicate in ( + ('name', DCT.title), + ('description', DCT.description), ('access_url', DCAT.accessURL), ('download_url', DCAT.downloadURL), ('issued', DCT.issued), @@ -927,15 +922,6 @@ def parse_dataset(self, dataset_dict, dataset_ref): DCAT.downloadURL) or self._object_value(distribution, DCAT.accessURL)) - # Multilingual simple values - for key, predicate in ( - ('name', DCT.title), - ('description', DCT.description), - ): - value = self._object_value(distribution, predicate, use_default_lang=True) - if value: - resource_dict[key] = value - # Lists for key, predicate in ( ('language', DCT.language), diff --git a/ckanext/dcat/tests/test_base_profile.py b/ckanext/dcat/tests/test_base_profile.py index 337d4a99..e02102fc 100644 --- a/ckanext/dcat/tests/test_base_profile.py +++ b/ckanext/dcat/tests/test_base_profile.py @@ -125,7 +125,7 @@ def test_object_value_default_lang(self): DCT.title, Literal('Test Dataset 1 (EN)', lang='en'))) value = p._object_value(URIRef('http://example.org/datasets/1'), - DCT.title, use_default_lang=True) + DCT.title) assert isinstance(value, unicode) eq_(value, 'Test Datensatz 1') @@ -139,7 +139,7 @@ def test_object_value_default_lang_fallback(self): DCT.title, Literal('Test Dataset 1 (EN)', lang='en'))) value = p._object_value(URIRef('http://example.org/datasets/1'), - DCT.title, use_default_lang=True) + DCT.title) assert isinstance(value, unicode) eq_(value, 'Test Dataset 1 (EN)') @@ -148,7 +148,7 @@ def test_object_value_default_lang_missing_lang_param(self): p = RDFProfile(_default_graph()) value = p._object_value(URIRef('http://example.org/datasets/1'), - DCT.title, use_default_lang=True) + DCT.title) assert isinstance(value, unicode) eq_(value, 'Test Dataset 1') From 53e28ea477bf2aaa882123551d23c001e947b4ba Mon Sep 17 00:00:00 2001 From: seitenbau-govdata Date: Wed, 19 Dec 2018 14:00:56 +0100 Subject: [PATCH 4/4] Test default language handling in base profile instead of checking specific values for DCAT-AP --- ckanext/dcat/tests/test_base_profile.py | 16 ++++ .../tests/test_euro_dcatap_profile_parse.py | 82 +------------------ 2 files changed, 17 insertions(+), 81 deletions(-) diff --git a/ckanext/dcat/tests/test_base_profile.py b/ckanext/dcat/tests/test_base_profile.py index e02102fc..0266c0a8 100644 --- a/ckanext/dcat/tests/test_base_profile.py +++ b/ckanext/dcat/tests/test_base_profile.py @@ -130,6 +130,21 @@ def test_object_value_default_lang(self): assert isinstance(value, unicode) eq_(value, 'Test Datensatz 1') + @helpers.change_config('ckan.locale_default', 'fr') + def test_object_value_default_lang_not_in_graph(self): + p = RDFProfile(_default_graph()) + + p.g.add((URIRef('http://example.org/datasets/1'), + DCT.title, Literal('Test Datensatz 1', lang='de'))) + + value = p._object_value(URIRef('http://example.org/datasets/1'), + DCT.title) + + assert isinstance(value, unicode) + # FR is not in graph, so either node may be used + assert value.startswith('Test D') + assert value.endswith(' 1') + def test_object_value_default_lang_fallback(self): p = RDFProfile(_default_graph()) @@ -142,6 +157,7 @@ def test_object_value_default_lang_fallback(self): DCT.title) assert isinstance(value, unicode) + # without config parameter, EN is used as default eq_(value, 'Test Dataset 1 (EN)') def test_object_value_default_lang_missing_lang_param(self): diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py b/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py index af9f1afe..8c018ac8 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py +++ b/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py @@ -14,7 +14,7 @@ from ckanext.dcat.processors import RDFParser, RDFSerializer from ckanext.dcat.profiles import (DCAT, DCT, ADMS, LOCN, SKOS, GSP, RDFS, - GEOJSON_IMT, FOAF, VCARD) + GEOJSON_IMT) from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS eq_ = nose.tools.eq_ @@ -352,86 +352,6 @@ def test_distribution_format_imt_only(self): else: eq_(resource['format'], u'text/csv') - @staticmethod - def _prepare_default_lang_graph(): - def _add_node_de_en(g, item_ref, predicate, literal_base_value): - g.add((item_ref, predicate, Literal(literal_base_value + '(DE)', lang='de'))) - g.add((item_ref, predicate, Literal(literal_base_value + '(EN)', lang='en'))) - - g = Graph() - - dataset1 = URIRef("http://example.org/datasets/1") - g.add((dataset1, RDF.type, DCAT.Dataset)) - _add_node_de_en(g, dataset1, DCT.title, 'Test dataset') - _add_node_de_en(g, dataset1, DCT.description, 'some description') - - publisher_node = BNode() - g.add((publisher_node, RDF.type, FOAF.Organization)) - _add_node_de_en(g, publisher_node, FOAF.name, 'a publisher') - g.add((dataset1, DCT.publisher, publisher_node)) - - contact_node = BNode() - g.add((contact_node, RDF.type, VCARD.Organization)) - _add_node_de_en(g, contact_node, VCARD.fn, 'a contact') - g.add((dataset1, DCAT.contactPoint, contact_node)) - - distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") - _add_node_de_en(g, distribution1_1, DCT.title, 'Test resource') - _add_node_de_en(g, distribution1_1, DCT.description, 'some res description') - - g.add((dataset1, DCAT.distribution, distribution1_1)) - - return g - - def _assert_lang_graph(self, dataset, expected_lang): - extras = self._extras(dataset) - resource = dataset['resources'][0] - eq_(dataset['title'], u'Test dataset' + expected_lang) - eq_(dataset['notes'], u'some description' + expected_lang) - eq_(extras['publisher_name'], u'a publisher' + expected_lang) - eq_(extras['contact_name'], u'a contact' + expected_lang) - eq_(resource['name'], u'Test resource' + expected_lang) - eq_(resource['description'], u'some res description' + expected_lang) - - @helpers.change_config('ckan.locale_default', 'en') - def test_default_lang_en(self): - g = self._prepare_default_lang_graph() - - p = RDFParser(profiles=['euro_dcat_ap']) - p.g = g - - dataset = [d for d in p.datasets()][0] - self._assert_lang_graph(dataset, '(EN)') - - @helpers.change_config('ckan.locale_default', 'de') - def test_default_lang_de(self): - g = self._prepare_default_lang_graph() - - p = RDFParser(profiles=['euro_dcat_ap']) - p.g = g - - dataset = [d for d in p.datasets()][0] - self._assert_lang_graph(dataset, '(DE)') - - @helpers.change_config('ckan.locale_default', 'fr') - def test_default_lang_not_in_graph(self): - g = self._prepare_default_lang_graph() - - p = RDFParser(profiles=['euro_dcat_ap']) - p.g = g - - dataset = [d for d in p.datasets()][0] - extras = self._extras(dataset) - resource = dataset['resources'][0] - - # default lang is not present in graph, so only check for correct base values - assert_true(u'Test dataset' in dataset['title']) - assert_true(u'some description' in dataset['notes']) - assert_true(u'a publisher' in extras['publisher_name']) - assert_true(u'a contact' in extras['contact_name']) - assert_true(u'Test resource' in resource['name']) - assert_true(u'some res description' in resource['description']) - @helpers.change_config('ckanext.dcat.normalize_ckan_format', False) def test_distribution_format_imt_only_normalize_false(self): g = Graph()