From 68ca4f7feb8178c795be69b2157c66470e8eb763 Mon Sep 17 00:00:00 2001
From: Max Dallabetta <max.dallabetta@googlemail.com>
Date: Mon, 16 Mar 2026 15:50:42 +0100
Subject: [PATCH 1/3] replace `mypy` with `pyright`

---
 .github/workflows/tests.yml |  6 +++---
 pyproject.toml              | 18 +++++-------------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 235729297..055540cbb 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -31,7 +31,7 @@ jobs:
       - name: Run pytest
         run: python -m pytest -vv
 
-  mypy:
+  pyright:
     # Containers must run in Linux based operating systems
     runs-on: ubuntu-latest
     steps:
@@ -53,5 +53,5 @@ jobs:
         run: |
           pip install -e .[dev]
 
-      - name: Run mypy
-        run: python -m mypy .
+      - name: Run pyright
+        run: pyright
diff --git a/pyproject.toml b/pyproject.toml
index 496f47061..b03e573d5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,7 +49,7 @@ dependencies = [
 [project.optional-dependencies]
 dev = [
     "pytest~=7.2.2",
-    "mypy==1.9.0",
+    "pyright==1.1.408",
     "ruff==0.15.6",
     # type stubs
     "types-lxml",
@@ -59,18 +59,10 @@ dev = [
     "types-dateparser>=1.2.0, <2"
 ]
 
-[tool.mypy]
-check_untyped_defs = true
-disallow_any_generics = true
-ignore_missing_imports = true
-no_implicit_optional = true
-show_error_codes = true
-strict_equality = true
-warn_redundant_casts = true
-warn_return_any = true
-warn_unreachable = true
-warn_unused_configs = true
-no_implicit_reexport = true
+[tool.pyright]
+pythonVersion = "3.8"
+typeCheckingMode = "standard"
+reportMissingImports = false
 
 [tool.ruff]
 line-length = 120

From ce0767f10877b1033a23d0e75fc09b4457649935 Mon Sep 17 00:00:00 2001
From: Max Dallabetta <max.dallabetta@googlemail.com>
Date: Mon, 16 Mar 2026 18:08:32 +0100
Subject: [PATCH 2/3] fix pyright errors

---
 pyproject.toml                        |  4 +-
 src/fundus/logging.py                 |  2 +-
 src/fundus/parser/base_parser.py      | 37 +++++++++++++++++-
 src/fundus/parser/data.py             | 16 ++++----
 src/fundus/parser/utility.py          |  1 +
 src/fundus/publishers/base_objects.py |  4 +-
 src/fundus/scraping/crawler.py        | 55 +++++++++++----------------
 src/fundus/scraping/html.py           | 10 +++--
 src/fundus/scraping/scraper.py        |  4 ++
 src/fundus/scraping/url.py            |  7 ++--
 src/fundus/utils/timeout.py           |  8 ++--
 tests/utility.py                      | 41 +++++++++++---------
 12 files changed, 115 insertions(+), 74 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b03e573d5..b3abaf0b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,7 +56,9 @@ dev = [
     "types-python-dateutil>=2.8, <3",
     "types-requests>=2.28, <3",
     "types-colorama>=0.4, <1",
-    "types-dateparser>=1.2.0, <2"
+    "types-dateparser>=1.2.0, <2",
+    "types-xmltodict>=0.13.0, <1",
+    "types-tqdm>=4.66, <5"
 ]
 
 [tool.pyright]
diff --git a/src/fundus/logging.py b/src/fundus/logging.py
index dd6be7eab..7ed01883e 100644
--- a/src/fundus/logging.py
+++ b/src/fundus/logging.py
@@ -67,7 +67,7 @@ def add_handler(handler: logging.Handler):
         logger.addHandler(handler)
 
 
-def get_current_config() -> JSONVal:
+def get_current_config() -> Dict[str, JSONVal]:
     """Get the current logging configuration as JSON.
 
     Returns:
diff --git a/src/fundus/parser/base_parser.py b/src/fundus/parser/base_parser.py
index 30f3ab2cf..7d2e8cbd4 100644
--- a/src/fundus/parser/base_parser.py
+++ b/src/fundus/parser/base_parser.py
@@ -21,6 +21,7 @@
     Union,
     get_args,
     get_origin,
+    overload,
 )
 
 import lxml.html
@@ -131,6 +132,30 @@ def wrapper(func):
     return wrapper(cls)
 
 
+@overload
+def attribute(
+    cls: Callable[..., Any],
+    /,
+    *,
+    priority: Optional[int] = ...,
+    validate: bool = ...,
+    deprecated: Optional[date] = ...,
+    default_factory: Optional[Callable[[], Any]] = ...,
+) -> Any: ...
+
+
+@overload
+def attribute(
+    cls: None = ...,
+    /,
+    *,
+    priority: Optional[int] = ...,
+    validate: bool = ...,
+    deprecated: Optional[date] = ...,
+    default_factory: Optional[Callable[[], Any]] = ...,
+) -> Callable[[Any], Any]: ...
+
+
 def attribute(
     cls=None,
     /,
@@ -139,7 +164,7 @@ def attribute(
     validate: bool = True,
     deprecated: Optional[date] = None,
     default_factory: Optional[Callable[[], Any]] = None,
-):
+) -> Any:
     return _register(
         cls,
         factory=Attribute,
@@ -150,7 +175,15 @@ def attribute(
     )
 
 
-def function(cls=None, /, *, priority: Optional[int] = None):
+@overload
+def function(cls: Callable[..., Any], /, *, priority: Optional[int] = ...) -> Any: ...
+
+
+@overload
+def function(cls: None = ..., /, *, priority: Optional[int] = ...) -> Callable[[Any], Any]: ...
+
+
+def function(cls=None, /, *, priority: Optional[int] = None) -> Any:
     return _register(cls, factory=Function, priority=priority)
 
 
diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py
index ec43bff80..516e6d5bf 100644
--- a/src/fundus/parser/data.py
+++ b/src/fundus/parser/data.py
@@ -70,7 +70,7 @@ def __init__(self, lds: Iterable[Dict[str, Any]] = ()):
                     self.add_ld(nested)
             else:
                 self.add_ld(ld)
-        self.__xml: Optional[lxml.etree.Element] = None
+        self.__xml: Optional[lxml.etree._Element] = None
 
     def __getstate__(self):
         state = self.__dict__.copy()
@@ -128,7 +128,7 @@ def get_value_by_key_path(self, key_path: List[str], default: Any = None) -> Opt
             tmp = nxt
         return tmp
 
-    def __as_xml__(self) -> lxml.etree.Element:
+    def __as_xml__(self) -> lxml.etree._Element:
         pattern = re.compile("|".join(map(re.escape, self.__xml_transformation_table__.keys())))
 
         def to_unicode_characters(text: str) -> str:
@@ -189,7 +189,7 @@ def xpath_search(self, query: Union[XPath, str], scalar: bool = False):
 
         pattern = re.compile("|".join(map(re.escape, self.__xml_transformation_table__.values())))
 
-        def node2string(n: lxml.etree.Element) -> str:
+        def node2string(n: lxml.etree._Element) -> str:
             node_value = lxml.etree.tostring(n, encoding="unicode").strip()
             if match := self.__value_regex__.match(node_value):
                 return match.group("value")
@@ -299,9 +299,9 @@ def __init__(self, texts: Iterable[str]):
     def __getitem__(self, i: int) -> str: ...
 
     @overload
-    def __getitem__(self, s: slice) -> "TextSequence": ...
+    def __getitem__(self, i: slice) -> "TextSequence": ...
 
-    def __getitem__(self, i):
+    def __getitem__(self, i: Union[int, slice]) -> Union[str, "TextSequence"]:
         return self._data[i] if isinstance(i, int) else type(self)(self._data[i])
 
     def __len__(self) -> int:
@@ -334,14 +334,14 @@ def text(self, join_on: str = "\n\n") -> str:
         return join_on.join(self.as_text_sequence())
 
     def df_traversal(self) -> Iterable[TextSequence]:
-        def recursion(o: object):
+        def recursion(o: object) -> Iterator[TextSequence]:
             if isinstance(o, TextSequence):
                 yield o
-            elif isinstance(o, Collection):
+            elif isinstance(o, Iterable) and not isinstance(o, (str, bytes)):
+                # Descend into every iterable element, as the previous `yield from el` did unconditionally.
+                # Strings are excluded: a 1-char str iterates to itself, which would recurse without end.
                 for el in o:
-                    yield from el
-            else:
-                yield o
+                    yield from recursion(el)
 
         for value in self:
             yield from recursion(value)
diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index 5bb861b83..ae6f5a09b 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -28,6 +28,7 @@
 )
 from urllib.parse import urljoin
 
+import lxml.etree
 import lxml.html
 import more_itertools
 import validators
diff --git a/src/fundus/publishers/base_objects.py b/src/fundus/publishers/base_objects.py
index 7741a6619..82921ce15 100644
--- a/src/fundus/publishers/base_objects.py
+++ b/src/fundus/publishers/base_objects.py
@@ -1,6 +1,6 @@
 from collections import defaultdict
 from textwrap import indent
-from typing import Dict, Iterable, Iterator, List, Optional, Set, Type, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Sequence, Set, Type, Union
 from urllib.robotparser import RobotFileParser
 from warnings import warn
 
@@ -127,7 +127,7 @@ def __init__(
         name: str,
         domain: str,
         parser: Type[ParserProxy],
-        sources: List[URLSource],
+        sources: Sequence[URLSource],
         query_parameter: Optional[Dict[str, str]] = None,
         url_filter: Optional[URLFilter] = None,
         request_header: Optional[Dict[str, str]] = _default_header,
diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py
index 1b022619c..0121bcf89 100644
--- a/src/fundus/scraping/crawler.py
+++ b/src/fundus/scraping/crawler.py
@@ -85,7 +85,7 @@ def tqdm(self, *args, **kwargs) -> tqdm:
 
 
 @contextlib.contextmanager
-def get_proxy_tqdm(*args, **kwargs) -> tqdm:
+def get_proxy_tqdm(*args, **kwargs) -> Iterator[tqdm]:
     """
     This functions returns a proxy to a tqdm instance. Init args are the same as for any other tqdm instance.
     :param args: tqdm args
@@ -120,7 +120,7 @@ def __call__(self, *args: _P.args, **kwargs: _P.kwargs) -> _T:
         return self._deserialize()(*args, **kwargs)
 
 
-def get_execution_context():
+def get_execution_context() -> Tuple[str, int]:
     """
     Determines whether the current execution context is in a thread or process.
     Returns:
@@ -129,10 +129,10 @@ def get_execution_context():
     """
     if multiprocessing.current_process().name != "MainProcess":
         process = multiprocessing.current_process()
-        return process.name, process.ident
+        return process.name, process.ident or 0
     else:
         thread = current_thread()
-        return thread.name, thread.ident
+        return thread.name, thread.ident or 0
 
 
 def publisher_context_wrapper(func: Callable[[Publisher], None]) -> Callable[[Publisher], None]:
@@ -414,9 +414,10 @@ def build_extraction_filter() -> Optional[ExtractionFilter]:
         callback: Optional[Callable[[], None]]
         if isinstance(self, CCNewsCrawler) and self.processes > 0:
 
-            def callback() -> None:
+            def _stop_callback() -> None:
                 __EVENTS__.set_event("stop", "main-thread")
 
+            callback = _stop_callback
         else:
             callback = None
 
@@ -579,6 +580,7 @@ def _single_crawl(
     def _threaded_crawl(
         self, publishers: Tuple[Publisher, ...], article_task: Callable[[Publisher], Iterator[Article]]
     ) -> Iterator[Article]:
+
         @contextlib.contextmanager
         def _manage_pool(*args, **kwargs) -> Iterator[ThreadPool]:
             managed_pool = ThreadPool(*args, **kwargs)
@@ -731,32 +733,24 @@ def _parallel_crawl(
         # As one could think, because we're downloading a bunch of files, this task is IO-bound, but it is actually
         # process-bound. The reason is that we stream the data and process it on the fly rather than downloading all
         # files and processing them afterward. Therefore, we utilize multiprocessing here instead of multithreading.
-        try:
-            with Manager() as manager, Pool(
-                processes=min(self.processes, len(warc_paths)),
-                initializer=initializer,
-            ) as pool:
-                result_queue: Queue[Union[Article, Exception]] = manager.Queue(maxsize=1000)
+        with Manager() as manager, Pool(
+            processes=min(self.processes, len(warc_paths)),
+            initializer=initializer,
+        ) as pool:
+            result_queue: Queue[Union[Article, Exception]] = manager.Queue(maxsize=1000)
 
-                # Because multiprocessing.Pool does not support iterators as targets,
-                # we wrap the article_task to write the articles to a queue instead of returning them directly.
-                wrapped_article_task: Callable[[str], None] = queue_wrapper(result_queue, article_task)
+            # Because multiprocessing.Pool does not support iterators as targets,
+            # we wrap the article_task to write the articles to a queue instead of returning them directly.
+            wrapped_article_task: Callable[[str], None] = queue_wrapper(result_queue, article_task)
 
-                # To avoid 503 errors we spread tasks to not start all at once
-                spread_article_task = random_sleep(wrapped_article_task, (0, 3))
+            # To avoid 503 errors we spread tasks to not start all at once
+            spread_article_task = random_sleep(wrapped_article_task, (0, 3))
 
-                # To avoid restricting the article_task to use only pickleable objects, we serialize it using dill.
-                serialized_article_task = dill_wrapper(spread_article_task)
+            # To avoid restricting the article_task to use only pickleable objects, we serialize it using dill.
+            serialized_article_task = dill_wrapper(spread_article_task)
 
-                # Finally, we build an iterator around the queue, exhausting the queue until the pool is finished.
-                yield from pool_queue_iter(pool.map_async(serialized_article_task, warc_paths), result_queue)
-        finally:
-            logger.debug(f"Shutting down {type(self).__name__!r} ...")
-            logger.debug("Joining manager ...")
-            manager.join()
-            logger.debug("Joining pool ...")
-            pool.join()
-            logger.debug("Shutdown done")
+            # Finally, we build an iterator around the queue, exhausting the queue until the pool is finished.
+            yield from pool_queue_iter(pool.map_async(serialized_article_task, warc_paths), result_queue)
 
     def _get_warc_paths(self) -> List[str]:
         # Date regex examples: https://regex101.com/r/yDX3G6/1
@@ -790,11 +784,8 @@ def load_paths(url: str) -> List[str]:
                 # use two threads per process, default two threads per core
                 max_number_of_threads = self.processes * 2
 
-                try:
-                    with ThreadPool(processes=min(len(urls), max_number_of_threads)) as pool:
-                        nested_warc_paths = pool.map(random_sleep(load_paths, (0, 3)), urls)
-                finally:
-                    pool.join()
+                with ThreadPoolExecutor(max_workers=min(len(urls), max_number_of_threads)) as pool:
+                    nested_warc_paths = pool.map(random_sleep(load_paths, (0, 3)), urls)
 
         warc_paths: Iterator[str] = more_itertools.flatten(nested_warc_paths)
 
diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py
index 2afa23918..94a248efd 100644
--- a/src/fundus/scraping/html.py
+++ b/src/fundus/scraping/html.py
@@ -1,7 +1,7 @@
 import time
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Callable, Dict, Iterable, Iterator, List, Optional, Protocol
+from typing import BinaryIO, Callable, Dict, Iterable, Iterator, List, Optional, Protocol, cast
 from urllib.parse import urlparse
 
 import chardet
@@ -171,9 +171,11 @@ def __init__(
                         f"Overwriting existing delay."
                     )
 
-                    def delay() -> float:
+                    def _crawl_delay() -> float:
                         return robots_delay
 
+                    delay = _crawl_delay
+
         self.clock = _Clock(delay=delay, sleep=self._sleep)
 
     @property
@@ -333,7 +335,9 @@ def extract_content(record: WarcRecord) -> Optional[str]:
             response = session.get(self.warc_path, stream=True, headers=self.headers)
             response.raise_for_status()
 
-            for warc_record in ArchiveIterator(response.raw, record_types=WarcRecordType.response, verify_digests=True):
+            for warc_record in ArchiveIterator(
+                cast(BinaryIO, response.raw), record_types=WarcRecordType.response, verify_digests=True
+            ):
                 if not warc_record.record_date:
                     continue
 
diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py
index 822354e48..047a45fc4 100644
--- a/src/fundus/scraping/url.py
+++ b/src/fundus/scraping/url.py
@@ -19,6 +19,7 @@
 from urllib.parse import unquote
 
 import feedparser
+import lxml.etree
 import lxml.html
 import validators
 from lxml.etree import XMLParser, XPath
@@ -159,9 +160,9 @@ def __iter__(self) -> Iterator[str]:
             logger.warning(f"Warning! Couldn't parse rss feed {self.url!r} because of {exception}")
             return
         else:
-            urls = filter(bool, (entry.get("link") for entry in rss_feed["entries"]))
-            for url in urls:
-                yield clean_url(url)
+            for entry in rss_feed["entries"]:
+                if isinstance(url := entry.get("link"), str):
+                    yield clean_url(url)
 
 
 @dataclass
diff --git a/src/fundus/utils/timeout.py b/src/fundus/utils/timeout.py
index 92b6e7d48..2194f19c8 100644
--- a/src/fundus/utils/timeout.py
+++ b/src/fundus/utils/timeout.py
@@ -27,8 +27,8 @@ def __init__(
         seconds: float,
         func: Callable[P, None],
         interval: float = 0.1,
-        args: P.args = tuple(),
-        kwargs: P.kwargs = None,
+        *args: P.args,
+        **kwargs: P.kwargs,
     ) -> None:
         """Resettable timer executing <func> after <time> seconds, checking every <interval>.
 
@@ -51,8 +51,8 @@ def run(self) -> None:
             time.sleep(self.interval)
             if self._canceled.is_set():
                 return
-        # noinspection PyUnresolvedReferences
-        self._target(*self._args, **self._kwargs)  # type: ignore[attr-defined]
+
+        super().run()
 
     def reset(self) -> None:
         self.watch.reset()
diff --git a/tests/utility.py b/tests/utility.py
index d30128021..605aa2e9e 100644
--- a/tests/utility.py
+++ b/tests/utility.py
@@ -38,7 +38,6 @@ def get_test_articles(publisher: Publisher) -> List[Article]:
     return articles
 
 
-@dataclass
 class JSONFile(Generic[_T]):
     """Generic file class representing a JSON file.
 
@@ -55,10 +54,17 @@ class JSONFile(Generic[_T]):
         >>> json_file.write(content)
     """
 
-    path: Path
-    encoder: Optional[Type[json.JSONEncoder]] = None
-    decoder: Optional[Type[json.JSONDecoder]] = None
-    encoding: str = "utf-8"
+    def __init__(
+        self,
+        path: Path,
+        encoder: Optional[Type[json.JSONEncoder]] = None,
+        decoder: Optional[Type[json.JSONDecoder]] = None,
+        encoding: str = "utf-8",
+    ):
+        self.path = path
+        self.encoder = encoder
+        self.decoder = decoder
+        self.encoding = encoding
 
     def load(self, **kwargs) -> Optional[_T]:
         """Load file content using json.load().
@@ -109,15 +115,15 @@ def write(self, content: _T, **kwargs) -> None:
 
 
 class ExtractionEncoder(json.JSONEncoder):
-    def default(self, obj: object):
-        if isinstance(obj, datetime.datetime):
-            return str(obj)
-        elif isinstance(obj, TextSequenceTree):
-            return obj.serialize()
-        elif isinstance(obj, Image):
-            return obj.serialize()
+    def default(self, o: object):
+        if isinstance(o, datetime.datetime):
+            return str(o)
+        elif isinstance(o, TextSequenceTree):
+            return o.serialize()
+        elif isinstance(o, Image):
+            return o.serialize()
         else:
-            return json.JSONEncoder.default(self, obj)
+            return json.JSONEncoder.default(self, o)
 
 
 class ExtractionDecoder(json.JSONDecoder):
@@ -129,21 +135,20 @@ class ExtractionDecoder(json.JSONDecoder):
     }
 
     def __init__(self, *args, **kwargs):
-        json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs)
+        json.JSONDecoder.__init__(self, object_hook=self._object_hook, *args, **kwargs)
 
-    def object_hook(self, obj_dict):
+    def _object_hook(self, obj_dict):
         for key, deserialization_function in self.deserialization_functions.items():
             if (serialized_value := obj_dict.get(key)) is not None:
                 obj_dict[key] = deserialization_function(serialized_value)
         return obj_dict
 
 
-@dataclass
 class JSONFileWithExtractionDecoderEncoder(JSONFile[_T]):
     """Custom JSONFile using default ExtractionEncoder/ExtractionDecoder"""
 
-    encoder: Type[json.JSONEncoder] = ExtractionEncoder
-    decoder: Type[json.JSONDecoder] = ExtractionDecoder
+    def __init__(self, path: Path, encoding: str = "utf-8"):
+        super().__init__(path, encoder=ExtractionEncoder, decoder=ExtractionDecoder, encoding=encoding)
 
 
 @dataclass

From fb46ab08bfa6016796609db12f36602d977c7adc Mon Sep 17 00:00:00 2001
From: Max Dallabetta <max.dallabetta@googlemail.com>
Date: Mon, 16 Mar 2026 18:24:14 +0100
Subject: [PATCH 3/3] remove type ignores and fix issues

---
 scripts/check_coverage.py                   | 2 +-
 scripts/generate_tables.py                  | 2 +-
 scripts/publisher_coverage.py               | 2 +-
 src/fundus/parser/base_parser.py            | 2 +-
 src/fundus/parser/utility.py                | 2 +-
 src/fundus/publishers/de/winfuture.py       | 2 +-
 src/fundus/publishers/fr/le_monde.py        | 2 +-
 src/fundus/publishers/ind/times_of_india.py | 2 +-
 src/fundus/scraping/article.py              | 4 ++--
 src/fundus/scraping/crawler.py              | 2 +-
 src/fundus/scraping/filter.py               | 2 +-
 src/fundus/scraping/html.py                 | 6 ++++--
 src/fundus/scraping/url.py                  | 2 +-
 src/fundus/utils/regex.py                   | 8 ++++----
 src/fundus/utils/serialization.py           | 4 ++--
 tests/test_article.py                       | 8 ++------
 tests/test_parser.py                        | 4 ++--
 17 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/scripts/check_coverage.py b/scripts/check_coverage.py
index 1bf7f5f65..4fde2ef90 100644
--- a/scripts/check_coverage.py
+++ b/scripts/check_coverage.py
@@ -308,7 +308,7 @@ def main() -> None:
     if (parsed := parse_coverage_file(txt)) is None:
         raise RuntimeError(f"Couldn't parse latest coverage file for run {latest_run.id}")
 
-    failed_publishers = [publisher for publisher, status in parsed.items() if not status]  # type: ignore[union-attr]
+    failed_publishers = [publisher for publisher, status in parsed.items() if not status]
 
     print(f"Latest run on '{run_time}' with {len(failed_publishers)} failed publishers.")
     print(failed_publishers)
diff --git a/scripts/generate_tables.py b/scripts/generate_tables.py
index 41b6e595c..bbf9f8a1e 100644
--- a/scripts/generate_tables.py
+++ b/scripts/generate_tables.py
@@ -91,7 +91,7 @@ def align_tables(tables: Sequence[lxml.html.HtmlElement]) -> None:
 
     for column_index, colum_heads in enumerate(
         more_itertools.transpose(table_heads),
-        start=1,  # type: ignore[attr-defined]
+        start=1,
     ):
         column_texts: List[str] = [
             text for table in tables for text in table.xpath(f"/table/tbody/tr/td[{column_index}]//text()")
diff --git a/scripts/publisher_coverage.py b/scripts/publisher_coverage.py
index 03d71c234..d4a1f7484 100644
--- a/scripts/publisher_coverage.py
+++ b/scripts/publisher_coverage.py
@@ -47,7 +47,7 @@ def main() -> None:
                     # skip publishers providing no sources for forward crawling
                     print(f"⏩  SKIPPED: {publisher_name!r} - No sources defined")
                     continue
-                if publisher.deprecated:  # type: ignore[attr-defined]
+                if publisher.deprecated:
                     print(f"⏩  SKIPPED: {publisher_name!r} - Deprecated")
                     continue
                 if publisher.__name__ in parsed_arguments.skip:
diff --git a/src/fundus/parser/base_parser.py b/src/fundus/parser/base_parser.py
index 7d2e8cbd4..a82c81e4e 100644
--- a/src/fundus/parser/base_parser.py
+++ b/src/fundus/parser/base_parser.py
@@ -408,7 +408,7 @@ def predicate(x: object) -> bool:
         mapping: Dict[date, _ParserCache] = {}
         for versioned_parser in sorted(included_parsers, key=lambda parser: parser.VALID_UNTIL):
             validation_date: date
-            if prev := mapping.get(validation_date := versioned_parser.VALID_UNTIL):  # type: ignore
+            if prev := mapping.get(validation_date := versioned_parser.VALID_UNTIL):
                 raise ValueError(
                     f"Found versions {prev.factory.__name__!r} and {versioned_parser.__name__!r} of "
                     f"{str(self)!r} with same validation date.\nMake sure you use class attribute VALID_UNTIL "
diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index ae6f5a09b..efde80739 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -579,7 +579,6 @@ class CustomParserInfo(parser.parserinfo):
         ("Oct", "October", "Oktober", "Okt"),
         ("Nov", "November"),
         ("Dec", "December", "Dezember", "Dez"),
-    ]  # type: ignore[assignment]
-    # type ignore due to types-python-dateutil==2.9.0.20251008, see https://github.com/flairNLP/fundus/issues/806
+    ]
 
 
diff --git a/src/fundus/publishers/de/winfuture.py b/src/fundus/publishers/de/winfuture.py
index 72dd70f07..9b22cadc7 100644
--- a/src/fundus/publishers/de/winfuture.py
+++ b/src/fundus/publishers/de/winfuture.py
@@ -41,7 +41,7 @@ def body(self) -> Optional[ArticleBody]:
             html_as_string = re.sub(r"(?<=<br>)\n(?!([<\W]))", "\n<p>", html_as_string)
             html_as_string = re.sub(r"(?<=(ipt|div)>)\n(?![\W<])", "\n<p>", html_as_string)
             html_as_string = re.sub(r"(?<![\W>])\n(?=<[a-z0-9=_'\"]*>)", "</p>\n", html_as_string)
-            doc: HtmlElement = fromstring(html_as_string)  # type: ignore
+            doc: HtmlElement = fromstring(html_as_string)
             return extract_article_body_with_selector(
                 doc=doc,
                 paragraph_selector=self._paragraph_selector,
diff --git a/src/fundus/publishers/fr/le_monde.py b/src/fundus/publishers/fr/le_monde.py
index fd8f3a99d..a22eb7df0 100644
--- a/src/fundus/publishers/fr/le_monde.py
+++ b/src/fundus/publishers/fr/le_monde.py
@@ -35,7 +35,7 @@ def title(self) -> Optional[str]:
 
         @attribute
         def topics(self) -> List[str]:
-            return self.precomputed.ld.bf_search("keywords")  # type: ignore
+            return self.precomputed.ld.bf_search("keywords")
 
         @attribute
         def publishing_date(self) -> Optional[datetime.datetime]:
diff --git a/src/fundus/publishers/ind/times_of_india.py b/src/fundus/publishers/ind/times_of_india.py
index f0343eaa7..3d6ebc75c 100644
--- a/src/fundus/publishers/ind/times_of_india.py
+++ b/src/fundus/publishers/ind/times_of_india.py
@@ -41,7 +41,7 @@ def body(self) -> Optional[ArticleBody]:
                 r"<div class=\"_s30J clearfix  \">", "<div class=\"_s30J clearfix  \"><p class='intro'>", html_as_string
             )
             return extract_article_body_with_selector(
-                fromstring(html_as_string),  # type: ignore
+                fromstring(html_as_string),
                 summary_selector=self._summary_selector,
                 paragraph_selector=self._paragraph_selector,
                 subheadline_selector=self._subheadline_selector,
diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/article.py
index a64502bc0..afdc5548a 100644
--- a/src/fundus/scraping/article.py
+++ b/src/fundus/scraping/article.py
@@ -130,12 +130,12 @@ def to_json(self, *attributes: str) -> Dict[str, JSONVal]:
 
         def serialize(v: Any) -> JSONVal:
             if hasattr(v, "serialize"):
-                return v.serialize()  # type: ignore[no-any-return]
+                return v.serialize()
             elif isinstance(v, datetime):
                 return str(v)
             elif not is_jsonable(v):
                 raise TypeError(f"Attribute {attribute!r} of type {type(v)!r} is not JSON serializable")
-            return v  # type: ignore[no-any-return]
+            return v
 
         serialization: Dict[str, JSONVal] = {}
         for attribute in attributes:
diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py
index 0121bcf89..b94276b80 100644
--- a/src/fundus/scraping/crawler.py
+++ b/src/fundus/scraping/crawler.py
@@ -785,7 +785,7 @@ def load_paths(url: str) -> List[str]:
                 max_number_of_threads = self.processes * 2
 
                 with ThreadPoolExecutor(max_workers=min(len(urls), max_number_of_threads)) as pool:
-                    nested_warc_paths = pool.map(random_sleep(load_paths, (0, 3)), urls)
+                    nested_warc_paths = list(pool.map(random_sleep(load_paths, (0, 3)), urls))
 
         warc_paths: Iterator[str] = more_itertools.flatten(nested_warc_paths)
 
diff --git a/src/fundus/scraping/filter.py b/src/fundus/scraping/filter.py
index 35c6f22e2..742ca44d8 100644
--- a/src/fundus/scraping/filter.py
+++ b/src/fundus/scraping/filter.py
@@ -150,7 +150,6 @@ def __init__(self, *required_attributes: str, eval_booleans: bool = True) -> Non
         """
         self.required_attributes = set(required_attributes)
-        # somehow mypy does not recognize bool as callable :(
-        self._eval: Callable[[Any], bool] = bool if eval_booleans else _guarded_bool  # type: ignore[assignment]
+        self._eval: Callable[[Any], bool] = bool if eval_booleans else _guarded_bool
 
     def __call__(self, extraction: Dict[str, Any]) -> FilterResultWithMissingAttributes:
         missing_attributes = [
diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py
index 94a248efd..a743318cc 100644
--- a/src/fundus/scraping/html.py
+++ b/src/fundus/scraping/html.py
@@ -306,8 +306,10 @@ def extract_content(record: WarcRecord) -> Optional[str]:
             warc_body: bytes = record.reader.read()
 
             try:
-                return str(warc_body, encoding=record.http_charset)  # type: ignore[arg-type]
-            except (UnicodeDecodeError, TypeError):
+                if record.http_charset is None:
+                    raise UnicodeDecodeError("unknown", warc_body, 0, 1, "no charset")
+                return str(warc_body, encoding=record.http_charset)
+            except UnicodeDecodeError:
                 encoding: Optional[str] = chardet.detect(warc_body)["encoding"]
 
                 if encoding is not None:
diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py
index 047a45fc4..0786ec7d2 100644
--- a/src/fundus/scraping/url.py
+++ b/src/fundus/scraping/url.py
@@ -207,7 +207,7 @@ def yield_recursive(sitemap_url: str) -> Iterator[str]:
             tree = lxml.etree.fromstring(content, parser=self._parser)
             if tree is None:
                 # in case we somehow end up with non xml content
-                logger.warning(f"Warning! Couldn't parse sitemap {sitemap_url!r}")  # type: ignore[unreachable]
+                logger.warning(f"Warning! Couldn't parse sitemap {sitemap_url!r}")
                 return
             urls = [node.text for node in self._url_selector(tree)]
             if urls:
diff --git a/src/fundus/utils/regex.py b/src/fundus/utils/regex.py
index c3d563238..a79894852 100644
--- a/src/fundus/utils/regex.py
+++ b/src/fundus/utils/regex.py
@@ -1,5 +1,5 @@
 import re
-from typing import Callable, Dict, Literal, Optional, Pattern, TypeVar, Union, overload
+from typing import Any, Callable, Dict, Literal, Optional, Pattern, TypeVar, overload
 
 _T = TypeVar("_T")
 
@@ -19,12 +19,12 @@ def _get_match_dict(pattern: Pattern[str], string: str) -> Dict[str, str]: ...
 
 
 @overload
-def _get_match_dict(pattern: Pattern[str], string: str, keep_none: Literal[True]) -> Dict[str, Optional[str]]: ...
+def _get_match_dict(pattern: Pattern[str], string: str, *, keep_none: Literal[True]) -> Dict[str, Optional[str]]: ...
 
 
-def _get_match_dict(  # type: ignore[misc]
+def _get_match_dict(
     pattern: Pattern[str], string: str, conversion: Optional[Callable[[str], _T]] = None, keep_none: bool = False
-) -> Dict[str, Union[str, _T, None]]:
+) -> Any:
     matches = {}
     for match in re.finditer(pattern, string):
         match_dict = match.groupdict()
diff --git a/src/fundus/utils/serialization.py b/src/fundus/utils/serialization.py
index 0b15da0a4..b954e5328 100644
--- a/src/fundus/utils/serialization.py
+++ b/src/fundus/utils/serialization.py
@@ -58,7 +58,7 @@ class DataclassSerializationMixin:
     def serialize(self) -> Dict[str, JSONVal]:
         if not is_dataclass(self):
             raise TypeError(f"{type(self).__name__!r} is not a dataclass")
-        return asdict(self)  # type: ignore[arg-type]
+        return asdict(self)
 
     @classmethod
     def deserialize(cls: Type[_M], serialized: Dict[str, JSONVal]) -> _M:
@@ -72,7 +72,7 @@ def deserialize(cls: Type[_M], serialized: Dict[str, JSONVal]) -> _M:
         for field in fields(cls):
             serialized[field.name] = _inner_deserialize(serialized[field.name], annotations[field.name])
 
-        return cls(**serialized)  # type: ignore[return-value]
+        return cls(**serialized)
 
 
 def _inner_deserialize(data, cls):
diff --git a/tests/test_article.py b/tests/test_article.py
index 49d7d1f33..d1c49be07 100644
--- a/tests/test_article.py
+++ b/tests/test_article.py
@@ -15,10 +15,10 @@ def test_constructor(self):
         extraction = {"authors": ["Author"], "title": "title"}
 
         with pytest.raises(TypeError):
-            Article(extraction, html=html)  # type: ignore[arg-type, misc]
+            Article(extraction, html=html)  # type: ignore[call-overload]
 
         with pytest.raises(TypeError):
-            Article(**extraction)  # type: ignore[arg-type]
+            Article(**extraction)  # type: ignore[call-overload]
 
         Article(**{}, html=html)
         Article(**extraction, html=html, exception=None)
@@ -56,10 +56,6 @@ def test_extraction_view_getter(self):
         assert article.test_attribute
         assert article.test_attribute == "test_value"
 
-        article.__extraction__["test_attribute"] = "very_secret_stuff"  # type: ignore[index]
-
-        assert article.test_attribute == "very_secret_stuff"
-
     def test_extraction_view_setter(self):
         extraction = {"test_attribute": "test_value"}
 
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 174aec728..d7d2a5220 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -305,9 +305,9 @@ def test_generic_author_parsing(self):
 
         # type list[dict]
         assert generic_author_parsing(
-            [{"name": "Peter Funny"}, {"name": "Funny Peter"}, {"this": "is not a pipe"}, {}]  # type: ignore
+            [{"name": "Peter Funny"}, {"name": "Funny Peter"}, {"this": "is not a pipe"}, {}]
         ) == ["Peter Funny", "Funny Peter"]
-        assert generic_author_parsing([{}]) == generic_author_parsing([{}, {"wrong": "key"}]) == []  # type: ignore
+        assert generic_author_parsing([{}]) == generic_author_parsing([{}, {"wrong": "key"}]) == []
 
 
 class TestMetaInfo: