From 0254ee630d283f6edd326b8d75e2d1b24f06314e Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 5 Aug 2024 13:22:56 +0200 Subject: [PATCH 1/3] implement design --- pyproject.toml | 4 +++- src/fundus/parser/base_parser.py | 16 ++++++++++++++++ src/fundus/publishers/base_objects.py | 2 +- src/fundus/scraping/scraper.py | 3 ++- 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c0aa6068d..b98028afc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,8 @@ dependencies = [ "tqdm>=4.66, <5", "fastwarc>=0.14, <1", "chardet>=5.2, <6", - "dill>=0.3, <1" + "dill>=0.3, <1", + "deprecated>=1.2.14, <2" ] [project.urls] @@ -53,6 +54,7 @@ dev = [ "types-python-dateutil>=2.8, <3", "types-requests>=2.28, <3", "types-colorama>=0.4, <1", + "types-deprecated>=1.2.9, <2" ] [tool.mypy] diff --git a/src/fundus/parser/base_parser.py b/src/fundus/parser/base_parser.py index 773a47593..4cd9f464f 100644 --- a/src/fundus/parser/base_parser.py +++ b/src/fundus/parser/base_parser.py @@ -24,6 +24,7 @@ import lxml.html import more_itertools +from deprecated import deprecated from lxml.etree import XPath from fundus.logging import create_logger @@ -289,7 +290,14 @@ def __init__(self): mapping[validation_date] = _ParserCache(versioned_parser) self._parser_mapping = mapping + @deprecated( + version="0.4.1", + reason=f"Calling is legacy behaviour. Use of instead", + ) def __call__(self, crawl_date: Optional[Union[datetime, date]] = None) -> BaseParser: + return self._get_parser(crawl_date) + + def _get_parser(self, crawl_date: Optional[Union[datetime, date]] = None) -> BaseParser: if crawl_date is None: return self._get_latest_cache()() @@ -343,3 +351,11 @@ def _get_latest_cache(self) -> _ParserCache: @property def latest_version(self) -> Type[BaseParser]: return self._get_latest_cache().factory + + def parse( + self, + html: str, + crawl_date: Union[datetime, date], + error_handling: Literal["suppress", "catch", "raise"] = "raise", + ) -> Dict[str, Any]: + return self._get_parser(crawl_date).parse(html, error_handling) diff --git a/src/fundus/publishers/base_objects.py b/src/fundus/publishers/base_objects.py index 6c36ff39c..572f25cc3 100644 --- a/src/fundus/publishers/base_objects.py +++ b/src/fundus/publishers/base_objects.py @@ -162,7 +162,7 @@ def search( unique_attributes = set(attributes) spec: Publisher for publisher in cls: - if unique_attributes.issubset(publisher.parser().attributes().names) and ( + if unique_attributes.issubset(publisher.parser.latest_version.attributes().names) and ( publisher.supports(source_types) if source_types else True ): matched.append(publisher) diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index bc694e3a7..52af863e2 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -1,3 +1,4 @@ +import warnings from typing import Dict, Iterator, List, Literal, Optional, Type import more_itertools @@ -34,7 +35,7 @@ def scrape( parser = self.parser_mapping[html.source_info.publisher] try: - extraction = parser(html.crawl_date).parse(html.content, error_handling) + extraction = parser.parse(html.content, html.crawl_date, error_handling) except Exception as error: if error_handling == "raise": From e9ee28ef4ad80c5dc0e700df8cff961934a63495 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 5 Aug 2024 16:50:15 +0200 Subject: [PATCH 2/3] make `_get_parser` public --- src/fundus/parser/base_parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fundus/parser/base_parser.py b/src/fundus/parser/base_parser.py index 4cd9f464f..e2b4960d6 100644 --- a/src/fundus/parser/base_parser.py +++ b/src/fundus/parser/base_parser.py @@ -295,9 +295,9 @@ def __init__(self): reason=f"Calling is legacy behaviour. Use of instead", ) def __call__(self, crawl_date: Optional[Union[datetime, date]] = None) -> BaseParser: - return self._get_parser(crawl_date) + return self.get_parser(crawl_date) - def _get_parser(self, crawl_date: Optional[Union[datetime, date]] = None) -> BaseParser: + def get_parser(self, crawl_date: Optional[Union[datetime, date]] = None) -> BaseParser: if crawl_date is None: return self._get_latest_cache()() @@ -358,4 +358,4 @@ def parse( crawl_date: Union[datetime, date], error_handling: Literal["suppress", "catch", "raise"] = "raise", ) -> Dict[str, Any]: - return self._get_parser(crawl_date).parse(html, error_handling) + return self.get_parser(crawl_date).parse(html, error_handling) From f49439c78f770c0fc49ffd77b463caa004a5075c Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 5 Aug 2024 16:50:24 +0200 Subject: [PATCH 3/3] fix test cases --- tests/test_parser.py | 8 ++++---- tests/utility.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index c34baec79..38a3480cf 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -108,13 +108,13 @@ def test_latest(self, proxy_with_two_versions_and_different_attrs): assert parser_proxy.latest_version == parser_proxy.Later def test_call(self, proxy_with_two_versions_and_different_attrs): - parser_proxy = proxy_with_two_versions_and_different_attrs() - assert type(parser_proxy()) == parser_proxy.latest_version + parser_proxy: ParserProxy = proxy_with_two_versions_and_different_attrs() + assert type(parser_proxy.get_parser()) == parser_proxy.latest_version for versioned_parser in parser_proxy: - from_proxy = parser_proxy(versioned_parser.VALID_UNTIL) + from_proxy = parser_proxy.get_parser(versioned_parser.VALID_UNTIL) assert isinstance(from_proxy, versioned_parser) - assert from_proxy == parser_proxy(versioned_parser.VALID_UNTIL) + assert from_proxy == parser_proxy.get_parser(versioned_parser.VALID_UNTIL) def test_mapping(self, proxy_with_two_versions_and_different_attrs): parser_proxy = proxy_with_two_versions_and_different_attrs() diff --git a/tests/utility.py b/tests/utility.py index 137b5ab95..f9dd738e0 100644 --- a/tests/utility.py +++ b/tests/utility.py @@ -263,7 +263,7 @@ def load_html_test_file_mapping(publisher: Publisher) -> Dict[Type[BaseParser], html_files = [HTMLTestFile.load(path) for path in html_paths] html_mapping: Dict[Type[BaseParser], HTMLTestFile] = {} for html_file in html_files: - versioned_parser = publisher.parser(html_file.crawl_date) + versioned_parser = publisher.parser.get_parser(html_file.crawl_date) if html_mapping.get(type(versioned_parser)): raise KeyError(f"Duplicate html files for {publisher.name!r} and version {type(versioned_parser).__name__}") html_mapping[type(versioned_parser)] = html_file