Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
- name: Run pytest
run: python -m pytest -vv

mypy:
pyright:
# Containers must run in Linux-based operating systems
runs-on: ubuntu-latest
steps:
Expand All @@ -53,5 +53,5 @@ jobs:
run: |
pip install -e .[dev]

- name: Run mypy
run: python -m mypy .
- name: Run pyright
run: pyright
22 changes: 8 additions & 14 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,28 +49,22 @@ dependencies = [
[project.optional-dependencies]
dev = [
"pytest~=7.2.2",
"mypy==1.9.0",
"pyright==1.1.408",
"ruff==0.15.6",
# type stubs
"types-lxml",
"types-python-dateutil>=2.8, <3",
"types-requests>=2.28, <3",
"types-colorama>=0.4, <1",
"types-dateparser>=1.2.0, <2"
"types-dateparser>=1.2.0, <2",
"types-xmltodict>=0.13.0, <1",
"types-tqdm>=4.66, <5"
]

[tool.mypy]
check_untyped_defs = true
disallow_any_generics = true
ignore_missing_imports = true
no_implicit_optional = true
show_error_codes = true
strict_equality = true
warn_redundant_casts = true
warn_return_any = true
warn_unreachable = true
warn_unused_configs = true
no_implicit_reexport = true
[tool.pyright]
pythonVersion = "3.8"
typeCheckingMode = "standard"
reportMissingImports = false

[tool.ruff]
line-length = 120
Expand Down
2 changes: 1 addition & 1 deletion scripts/check_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ def main() -> None:
if (parsed := parse_coverage_file(txt)) is None:
raise RuntimeError(f"Couldn't parse latest coverage file for run {latest_run.id}")

failed_publishers = [publisher for publisher, status in parsed.items() if not status] # type: ignore[union-attr]
failed_publishers = [publisher for publisher, status in parsed.items() if not status]

print(f"Latest run on '{run_time}' with {len(failed_publishers)} failed publishers.")
print(failed_publishers)
Expand Down
2 changes: 1 addition & 1 deletion scripts/generate_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def align_tables(tables: Sequence[lxml.html.HtmlElement]) -> None:

for column_index, colum_heads in enumerate(
more_itertools.transpose(table_heads),
start=1, # type: ignore[attr-defined]
start=1,
):
column_texts: List[str] = [
text for table in tables for text in table.xpath(f"/table/tbody/tr/td[{column_index}]//text()")
Expand Down
2 changes: 1 addition & 1 deletion scripts/publisher_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def main() -> None:
# skip publishers providing no sources for forward crawling
print(f"⏩ SKIPPED: {publisher_name!r} - No sources defined")
continue
if publisher.deprecated: # type: ignore[attr-defined]
if publisher.deprecated:
print(f"⏩ SKIPPED: {publisher_name!r} - Deprecated")
continue
if publisher.__name__ in parsed_arguments.skip:
Expand Down
2 changes: 1 addition & 1 deletion src/fundus/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def add_handler(handler: logging.Handler):
logger.addHandler(handler)


def get_current_config() -> JSONVal:
def get_current_config() -> Dict[str, JSONVal]:
"""Get the current logging configuration as JSON.

Returns:
Expand Down
39 changes: 36 additions & 3 deletions src/fundus/parser/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
Union,
get_args,
get_origin,
overload,
)

import lxml.html
Expand Down Expand Up @@ -131,6 +132,30 @@ def wrapper(func):
return wrapper(cls)


@overload
def attribute(
cls: Callable[..., Any],
/,
*,
priority: Optional[int] = ...,
validate: bool = ...,
deprecated: Optional[date] = ...,
default_factory: Optional[Callable[[], Any]] = ...,
) -> Any: ...


@overload
def attribute(
cls: None = ...,
/,
*,
priority: Optional[int] = ...,
validate: bool = ...,
deprecated: Optional[date] = ...,
default_factory: Optional[Callable[[], Any]] = ...,
) -> Callable[[Any], Any]: ...


def attribute(
cls=None,
/,
Expand All @@ -139,7 +164,7 @@ def attribute(
validate: bool = True,
deprecated: Optional[date] = None,
default_factory: Optional[Callable[[], Any]] = None,
):
) -> Any:
return _register(
cls,
factory=Attribute,
Expand All @@ -150,7 +175,15 @@ def attribute(
)


def function(cls=None, /, *, priority: Optional[int] = None):
@overload
def function(cls: Callable[..., Any], /, *, priority: Optional[int] = ...) -> Any: ...


@overload
def function(cls: None = ..., /, *, priority: Optional[int] = ...) -> Callable[[Any], Any]: ...


def function(cls=None, /, *, priority: Optional[int] = None) -> Any:
return _register(cls, factory=Function, priority=priority)


Expand Down Expand Up @@ -375,7 +408,7 @@ def predicate(x: object) -> bool:
mapping: Dict[date, _ParserCache] = {}
for versioned_parser in sorted(included_parsers, key=lambda parser: parser.VALID_UNTIL):
validation_date: date
if prev := mapping.get(validation_date := versioned_parser.VALID_UNTIL): # type: ignore
if prev := mapping.get(validation_date := versioned_parser.VALID_UNTIL):
raise ValueError(
f"Found versions {prev.factory.__name__!r} and {versioned_parser.__name__!r} of "
f"{str(self)!r} with same validation date.\nMake sure you use class attribute VALID_UNTIL "
Expand Down
16 changes: 8 additions & 8 deletions src/fundus/parser/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def __init__(self, lds: Iterable[Dict[str, Any]] = ()):
self.add_ld(nested)
else:
self.add_ld(ld)
self.__xml: Optional[lxml.etree.Element] = None
self.__xml: Optional[lxml.etree._Element] = None

def __getstate__(self):
state = self.__dict__.copy()
Expand Down Expand Up @@ -128,7 +128,7 @@ def get_value_by_key_path(self, key_path: List[str], default: Any = None) -> Opt
tmp = nxt
return tmp

def __as_xml__(self) -> lxml.etree.Element:
def __as_xml__(self) -> lxml.etree._Element:
pattern = re.compile("|".join(map(re.escape, self.__xml_transformation_table__.keys())))

def to_unicode_characters(text: str) -> str:
Expand Down Expand Up @@ -189,7 +189,7 @@ def xpath_search(self, query: Union[XPath, str], scalar: bool = False):

pattern = re.compile("|".join(map(re.escape, self.__xml_transformation_table__.values())))

def node2string(n: lxml.etree.Element) -> str:
def node2string(n: lxml.etree._Element) -> str:
node_value = lxml.etree.tostring(n, encoding="unicode").strip()
if match := self.__value_regex__.match(node_value):
return match.group("value")
Expand Down Expand Up @@ -299,9 +299,9 @@ def __init__(self, texts: Iterable[str]):
def __getitem__(self, i: int) -> str: ...

@overload
def __getitem__(self, s: slice) -> "TextSequence": ...
def __getitem__(self, i: slice) -> "TextSequence": ...

def __getitem__(self, i):
def __getitem__(self, i: Union[int, slice]) -> Union[str, "TextSequence"]:
return self._data[i] if isinstance(i, int) else type(self)(self._data[i])

def __len__(self) -> int:
Expand Down Expand Up @@ -334,14 +334,14 @@ def text(self, join_on: str = "\n\n") -> str:
return join_on.join(self.as_text_sequence())

def df_traversal(self) -> Iterable[TextSequence]:
def recursion(o: object):
def recursion(o: object) -> Iterator[TextSequence]:
if isinstance(o, TextSequence):
yield o
elif isinstance(o, Collection):
for el in o:
yield from el
yield from recursion(el)
else:
yield o
return

for value in self:
yield from recursion(value)
Expand Down
3 changes: 2 additions & 1 deletion src/fundus/parser/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
)
from urllib.parse import urljoin

import lxml.etree
import lxml.html
import more_itertools
import validators
Expand Down Expand Up @@ -578,7 +579,7 @@ class CustomParserInfo(parser.parserinfo):
("Oct", "October", "Oktober", "Okt"),
("Nov", "November"),
("Dec", "December", "Dezember", "Dez"),
] # type: ignore[assignment]
]
# NOTE: an "ignore[assignment]" type-ignore comment was previously required here due to types-python-dateutil==2.9.0.20251008, see https://github.com/flairNLP/fundus/issues/806


Expand Down
4 changes: 2 additions & 2 deletions src/fundus/publishers/base_objects.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import defaultdict
from textwrap import indent
from typing import Dict, Iterable, Iterator, List, Optional, Set, Type, Union
from typing import Dict, Iterable, Iterator, List, Optional, Sequence, Set, Type, Union
from urllib.robotparser import RobotFileParser
from warnings import warn

Expand Down Expand Up @@ -127,7 +127,7 @@ def __init__(
name: str,
domain: str,
parser: Type[ParserProxy],
sources: List[URLSource],
sources: Sequence[URLSource],
query_parameter: Optional[Dict[str, str]] = None,
url_filter: Optional[URLFilter] = None,
request_header: Optional[Dict[str, str]] = _default_header,
Expand Down
2 changes: 1 addition & 1 deletion src/fundus/publishers/de/winfuture.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def body(self) -> Optional[ArticleBody]:
html_as_string = re.sub(r"(?<=<br>)\n(?!([<\W]))", "\n<p>", html_as_string)
html_as_string = re.sub(r"(?<=(ipt|div)>)\n(?![\W<])", "\n<p>", html_as_string)
html_as_string = re.sub(r"(?<![\W>])\n(?=<[a-z0-9=_'\"]*>)", "</p>\n", html_as_string)
doc: HtmlElement = fromstring(html_as_string) # type: ignore
doc: HtmlElement = fromstring(html_as_string)
return extract_article_body_with_selector(
doc=doc,
paragraph_selector=self._paragraph_selector,
Expand Down
2 changes: 1 addition & 1 deletion src/fundus/publishers/fr/le_monde.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def title(self) -> Optional[str]:

@attribute
def topics(self) -> List[str]:
return self.precomputed.ld.bf_search("keywords") # type: ignore
return self.precomputed.ld.bf_search("keywords")

@attribute
def publishing_date(self) -> Optional[datetime.datetime]:
Expand Down
2 changes: 1 addition & 1 deletion src/fundus/publishers/ind/times_of_india.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def body(self) -> Optional[ArticleBody]:
r"<div class=\"_s30J clearfix \">", "<div class=\"_s30J clearfix \"><p class='intro'>", html_as_string
)
return extract_article_body_with_selector(
fromstring(html_as_string), # type: ignore
fromstring(html_as_string),
summary_selector=self._summary_selector,
paragraph_selector=self._paragraph_selector,
subheadline_selector=self._subheadline_selector,
Expand Down
4 changes: 2 additions & 2 deletions src/fundus/scraping/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,12 @@ def to_json(self, *attributes: str) -> Dict[str, JSONVal]:

def serialize(v: Any) -> JSONVal:
if hasattr(v, "serialize"):
return v.serialize() # type: ignore[no-any-return]
return v.serialize()
elif isinstance(v, datetime):
return str(v)
elif not is_jsonable(v):
raise TypeError(f"Attribute {attribute!r} of type {type(v)!r} is not JSON serializable")
return v # type: ignore[no-any-return]
return v

serialization: Dict[str, JSONVal] = {}
for attribute in attributes:
Expand Down
Loading
Loading