diff --git a/pyproject.toml b/pyproject.toml
index c0aa6068d..b98028afc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,8 @@ dependencies = [
     "tqdm>=4.66, <5",
     "fastwarc>=0.14, <1",
     "chardet>=5.2, <6",
-    "dill>=0.3, <1"
+    "dill>=0.3, <1",
+    "deprecated>=1.2.14, <2"
 ]
 
 [project.urls]
@@ -53,6 +54,7 @@ dev = [
     "types-python-dateutil>=2.8, <3",
     "types-requests>=2.28, <3",
     "types-colorama>=0.4, <1",
+    "types-deprecated>=1.2.9, <2"
 ]
 
 [tool.mypy]
diff --git a/src/fundus/parser/base_parser.py b/src/fundus/parser/base_parser.py
index 773a47593..e2b4960d6 100644
--- a/src/fundus/parser/base_parser.py
+++ b/src/fundus/parser/base_parser.py
@@ -24,6 +24,7 @@
 
 import lxml.html
 import more_itertools
+from deprecated import deprecated
 from lxml.etree import XPath
 
 from fundus.logging import create_logger
@@ -289,7 +290,14 @@ def __init__(self):
             mapping[validation_date] = _ParserCache(versioned_parser)
         self._parser_mapping = mapping
 
+    @deprecated(
+        version="0.4.1",
+        reason="Calling ParserProxy is legacy behaviour. Use ParserProxy.get_parser() instead",
+    )
     def __call__(self, crawl_date: Optional[Union[datetime, date]] = None) -> BaseParser:
+        return self.get_parser(crawl_date)
+
+    def get_parser(self, crawl_date: Optional[Union[datetime, date]] = None) -> BaseParser:
         if crawl_date is None:
             return self._get_latest_cache()()
 
@@ -343,3 +351,12 @@ def _get_latest_cache(self) -> _ParserCache:
     @property
     def latest_version(self) -> Type[BaseParser]:
         return self._get_latest_cache().factory
+
+    def parse(
+        self,
+        html: str,
+        crawl_date: Union[datetime, date],
+        error_handling: Literal["suppress", "catch", "raise"] = "raise",
+    ) -> Dict[str, Any]:
+        """Parse <html> with the parser returned by get_parser() for <crawl_date>."""
+        return self.get_parser(crawl_date).parse(html, error_handling)
diff --git a/src/fundus/publishers/base_objects.py b/src/fundus/publishers/base_objects.py
index 6c36ff39c..572f25cc3 100644
--- a/src/fundus/publishers/base_objects.py
+++ b/src/fundus/publishers/base_objects.py
@@ -162,7 +162,7 @@ def search(
         unique_attributes = set(attributes)
         spec: Publisher
         for publisher in cls:
-            if unique_attributes.issubset(publisher.parser().attributes().names) and (
+            if unique_attributes.issubset(publisher.parser.latest_version.attributes().names) and (
                 publisher.supports(source_types) if source_types else True
             ):
                 matched.append(publisher)
diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py
index bc694e3a7..52af863e2 100644
--- a/src/fundus/scraping/scraper.py
+++ b/src/fundus/scraping/scraper.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Dict, Iterator, List, Literal, Optional, Type
 
 import more_itertools
@@ -34,7 +35,7 @@ def scrape(
             parser = self.parser_mapping[html.source_info.publisher]
 
             try:
-                extraction = parser(html.crawl_date).parse(html.content, error_handling)
+                extraction = parser.parse(html.content, html.crawl_date, error_handling)
 
             except Exception as error:
                 if error_handling == "raise":
diff --git a/tests/test_parser.py b/tests/test_parser.py
index c34baec79..38a3480cf 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -108,13 +108,13 @@ def test_latest(self, proxy_with_two_versions_and_different_attrs):
         assert parser_proxy.latest_version == parser_proxy.Later
 
     def test_call(self, proxy_with_two_versions_and_different_attrs):
-        parser_proxy = proxy_with_two_versions_and_different_attrs()
-        assert type(parser_proxy()) == parser_proxy.latest_version
+        parser_proxy: ParserProxy = proxy_with_two_versions_and_different_attrs()
+        assert type(parser_proxy.get_parser()) is parser_proxy.latest_version
 
         for versioned_parser in parser_proxy:
-            from_proxy = parser_proxy(versioned_parser.VALID_UNTIL)
+            from_proxy = parser_proxy.get_parser(versioned_parser.VALID_UNTIL)
             assert isinstance(from_proxy, versioned_parser)
-            assert from_proxy == parser_proxy(versioned_parser.VALID_UNTIL)
+            assert from_proxy == parser_proxy.get_parser(versioned_parser.VALID_UNTIL)
 
     def test_mapping(self, proxy_with_two_versions_and_different_attrs):
         parser_proxy = proxy_with_two_versions_and_different_attrs()
diff --git a/tests/utility.py b/tests/utility.py
index 137b5ab95..f9dd738e0 100644
--- a/tests/utility.py
+++ b/tests/utility.py
@@ -263,7 +263,7 @@ def load_html_test_file_mapping(publisher: Publisher) -> Dict[Type[BaseParser],
     html_files = [HTMLTestFile.load(path) for path in html_paths]
     html_mapping: Dict[Type[BaseParser], HTMLTestFile] = {}
     for html_file in html_files:
-        versioned_parser = publisher.parser(html_file.crawl_date)
+        versioned_parser = publisher.parser.get_parser(html_file.crawl_date)
         if html_mapping.get(type(versioned_parser)):
             raise KeyError(f"Duplicate html files for {publisher.name!r} and version {type(versioned_parser).__name__}")
         html_mapping[type(versioned_parser)] = html_file