From ed57e179c5f24321560661179e208b248ab47cce Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Thu, 16 Oct 2025 23:33:06 +0200 Subject: [PATCH 1/9] draft --- src/fundus/parser/data.py | 32 +++++++ src/fundus/parser/utility.py | 116 ++++++++++++++++++++++++- src/fundus/publishers/de/tagesschau.py | 32 +++++-- src/fundus/scraping/article.py | 41 ++++++++- 4 files changed, 209 insertions(+), 12 deletions(-) diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index c8d0dd362..f97f598a8 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -413,6 +413,38 @@ def __bool__(self): return any(bool(section) for section in self.sections) +@dataclass +class LiveTickerBody(TextSequenceTree): + summary: TextSequence + entries: List[ArticleBody] + entry_meta_information: List[Dict[str, Any]] + + def serialize(self) -> Dict[str, Any]: + return { + "summary": list(self.summary), + "entries": [entry.serialize() for entry in self.entries], + "entry_metas": self.entry_meta_information, + } + + @classmethod + def deserialize(cls, serialized: Dict[str, Any]) -> Self: + return cls( + summary=TextSequence(serialized["summary"]), + entries=[ArticleBody.deserialize(entry) for entry in serialized["entries"]], + entry_meta_information=serialized["entry_meta_information"], + ) + + def __bool__(self): + return any(bool(entry) for entry in self.entries) + + def __iter__(self) -> Iterator[Any]: + field_values = [ + getattr(self, f.name) for f in fields(self) if f.name not in ("entry_meta_information", "entries") + ] + field_values.extend([entry.sections for entry in self.entries]) + yield from field_values + + @total_ordering @dataclass class Dimension(DataclassSerializationMixin): diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py index b0a600cd4..f297b11f0 100644 --- a/src/fundus/parser/utility.py +++ b/src/fundus/parser/utility.py @@ -10,6 +10,7 @@ from datetime import datetime from functools import total_ordering from typing import ( + Any, Callable, ClassVar, Dict, @@ -44,6 +45,7 @@ Image, ImageVersion, LinkedDataMapping, + LiveTickerBody, TextSequence, ) from fundus.utils.regex import _get_match_dict @@ -69,7 +71,7 @@ def normalize_whitespace(text: str) -> str: @total_ordering @dataclass(eq=False) class Node: - position: int + position: float node: lxml.html.HtmlElement = field(compare=False) _break_selector: ClassVar[XPath] = XPath("*//br") @@ -124,10 +126,27 @@ class SummaryNode(Node): pass +class BoundaryNode(Node): + def __post_init__(self): + self.position -= 0.5 # in case a content node is also a boundary node, we want the boundary to come first + + class SubheadNode(Node): pass +class DateNode(Node): + pass + + +class AuthorNode(Node): + pass + + +class ImageNode(Node): + pass + + class ParagraphNode(Node): pass @@ -190,6 +209,101 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]: return ArticleBody(summary=summary, sections=sections) +def extract_live_ticker_body_with_selector( + doc: lxml.html.HtmlElement, + paragraph_selector: XPath, + summary_selector: Optional[XPath] = None, + subheadline_selector: Optional[XPath] = None, + entry_boundary_selector: Optional[XPath] = None, + tag_filter: Optional[XPath] = None, + date_selector: Optional[XPath] = None, + author_selector: Optional[XPath] = None, + image_selector: Optional[XPath] = None, + image_selection_helper: Optional[Callable[[lxml.html.HtmlElement], List[Image]]] = None, +) -> LiveTickerBody: + # depth first index for each element in tree + df_idx_by_ref = {element: i for i, element in enumerate(doc.iter())} + + def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlElement = doc) -> List[Node]: + if not selector and node_type: + raise ValueError("Both a selector and node type are required") + + return [node for element in selector(root) if (node := node_type(df_idx_by_ref[element], element))] + + summary_nodes = extract_nodes(summary_selector, SummaryNode) if summary_selector else [] + boundary_nodes = extract_nodes(entry_boundary_selector, BoundaryNode) if entry_boundary_selector else [] + paragraph_nodes = extract_nodes(paragraph_selector, ParagraphNode) + subhead_nodes = extract_nodes(subheadline_selector, SubheadNode) if subheadline_selector else [] + date_nodes = extract_nodes(date_selector, DateNode) if date_selector else [] + author_nodes = extract_nodes(author_selector, AuthorNode) if author_selector else [] + image_nodes = extract_nodes(image_selector, ImageNode) if image_selector else [] + nodes = sorted( + summary_nodes + boundary_nodes + subhead_nodes + paragraph_nodes + date_nodes + author_nodes + image_nodes + ) + + if not nodes[: len(summary_nodes)] == summary_nodes: + raise ValueError(f"All summary nodes should be at the beginning of the article") + + summary = TextSequence( + map( + lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"], tag_filter=tag_filter)), + summary_nodes, + ) + ) + + entries: List[ArticleBody] = [] + entries_meta_information: List[Dict[str, Any]] = [] + entry_nodes = more_itertools.split_at(nodes[len(summary_nodes) :], pred=lambda x: isinstance(x, BoundaryNode)) + + for entry in entry_nodes: + if not entry: + continue + content_nodes = filter(lambda x: isinstance(x, ParagraphNode) or isinstance(x, SubheadNode), entry) + instructions = more_itertools.split_when(content_nodes, pred=lambda x, y: type(x) != type(y)) + subhead_nodes = [] + paragraph_nodes = [] + entry_date = None + entry_authors = [] + entry_images: List[Image] = [] + for node in entry: + if isinstance(node, SubheadNode): + subhead_nodes.append(node) + elif isinstance(node, ParagraphNode): + paragraph_nodes.append(node) + elif isinstance(node, DateNode): + entry_date = generic_date_parsing("".join(generic_nodes_to_text([node.node]))) + elif isinstance(node, AuthorNode): + entry_authors = generic_author_parsing(generic_nodes_to_text([node.node])) + elif isinstance(node, ImageNode): + entry_images = image_selection_helper(node.node) if image_selection_helper else [] + else: + raise ValueError(f"Unsupported node type: {type(node)}") + + if not subhead_nodes or (paragraph_nodes and subhead_nodes[0] > paragraph_nodes[0]): + first = next(instructions) + instructions = itertools.chain([first, []], instructions) + + sections: List[ArticleSection] = [] + + for chunk in more_itertools.chunked(instructions, 2): + if len(chunk) == 1: + chunk.append([]) + texts = [ + list( + map( + lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"], tag_filter=tag_filter)), + c, + ) + ) + for c in chunk + ] + sections.append(ArticleSection(*map(TextSequence, texts))) + + entries.append(ArticleBody(summary=TextSequence([]), sections=sections)) + entries_meta_information.append({"date": entry_date, "authors": entry_authors, "images": entry_images}) + return LiveTickerBody(summary=summary, entries=entries, entry_meta_information=entries_meta_information) + + _ld_node_selector = XPath("//script[@type='application/ld+json']") _json_pattern = re.compile(r"(?P{[\s\S]*}|\[\s*{[\s\S]*}\s*](?!\s*}))") _json_undefined = re.compile(r'(?P"[^"]*?"):\s*undefined') diff --git a/src/fundus/publishers/de/tagesschau.py b/src/fundus/publishers/de/tagesschau.py index 75716a914..fde4678cd 100644 --- a/src/fundus/publishers/de/tagesschau.py +++ b/src/fundus/publishers/de/tagesschau.py @@ -1,13 +1,15 @@ import datetime import re -from typing import List, Optional +from typing import List, Optional, Union from lxml.cssselect import CSSSelector from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute +from fundus.parser.data import LiveTickerBody from fundus.parser.utility import ( extract_article_body_with_selector, + extract_live_ticker_body_with_selector, generic_author_parsing, generic_date_parsing, image_extraction, @@ -22,14 +24,28 @@ class V1(BaseParser): _author_selector = XPath('string(//div[contains(@class, "authorline__author")])') _topic_selector = CSSSelector("div.meldungsfooter .taglist a") + _live_ticker_boundary_selector = XPath("//div[contains(@class, 'liveblog--anchor')]") + _live_ticker_paragraph_selector = XPath("//p[contains(@class,'textabsatz ') and not(strong)]") + _live_ticker_subheadline_selector = XPath("//h2[@class='meldung__subhead']") + _live_ticker_date_selector = XPath("//div[@class='liveblog__datetime']") + @attribute - def body(self) -> Optional[ArticleBody]: - return extract_article_body_with_selector( - self.precomputed.doc, - summary_selector=self._summary_selector, - subheadline_selector=self._subheadline_selector, - paragraph_selector=self._paragraph_selector, - ) + def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]: + if not self._live_ticker_boundary_selector(self.precomputed.doc): + return extract_article_body_with_selector( + self.precomputed.doc, + summary_selector=self._summary_selector, + subheadline_selector=self._subheadline_selector, + paragraph_selector=self._paragraph_selector, + ) + else: + return extract_live_ticker_body_with_selector( + doc=self.precomputed.doc, + entry_boundary_selector=self._live_ticker_boundary_selector, + paragraph_selector=self._live_ticker_paragraph_selector, + subheadline_selector=self._live_ticker_subheadline_selector, + date_selector=self._live_ticker_date_selector, + ) @attribute def authors(self) -> List[str]: diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/article.py index b06c40359..3e84462fb 100644 --- a/src/fundus/scraping/article.py +++ b/src/fundus/scraping/article.py @@ -1,6 +1,6 @@ from datetime import datetime from textwrap import TextWrapper, dedent -from typing import Any, Dict, List, Mapping, Optional +from typing import Any, Dict, List, Mapping, Optional, Union import langdetect import lxml.html @@ -8,6 +8,7 @@ from fundus.logging import create_logger from fundus.parser import ArticleBody, Image +from fundus.parser.data import LiveTickerBody from fundus.scraping.html import HTML from fundus.utils.serialization import JSONVal, is_jsonable @@ -27,7 +28,7 @@ def __set__(self, obj, value): raise AttributeError("attribute is read only") -class Article: +class Publication: __extraction__: Mapping[str, Any] = {} def __init__(self, *, html: HTML, exception: Optional[Exception] = None, **extraction: Any) -> None: @@ -45,7 +46,7 @@ def title(self) -> Optional[str]: return self.__extraction__.get("title") @property - def body(self) -> Optional[ArticleBody]: + def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]: return self.__extraction__.get("body") @property @@ -150,6 +151,8 @@ def serialize(v: Any) -> JSONVal: return serialization + +class Article(Publication): def __str__(self): # the subsequent indent here is a bit wacky, but textwrapper.dedent won't work with tabs, so we have to use # whitespaces instead. @@ -176,3 +179,35 @@ def __str__(self): ) return dedent(text) + + +class LiveTicker(Publication): + def __str__(self): + # the subsequent indent here is a bit wacky, but textwrapper.dedent won't work with tabs, so we have to use + # whitespaces instead. + title_wrapper = TextWrapper(width=80, max_lines=1, initial_indent="") + text_wrapper = TextWrapper(width=80, max_lines=2, initial_indent="", subsequent_indent=" ") + wrapped_title = title_wrapper.fill( + f"{Fore.RED}--missing title--{Style.RESET_ALL}" if self.title is None else self.title.strip() + ) + wrapped_plaintext = text_wrapper.fill( + f"{Fore.RED}--missing plaintext--{Style.RESET_ALL}" if self.plaintext is None else self.plaintext.strip() + ) + + summary_text = ( + f" including {len(self.body.entries if hasattr(self.body, 'entries') and self.body is not None else [])} entries" + f" and {len(self.images)} image(s)" + if self.images and not isinstance(self.images, Exception) + else "" + ) + + text = ( + f"Fundus-LiveTicker{summary_text}:" + f'\n- Title: "{wrapped_title}"' + f'\n- Text: "{wrapped_plaintext}"' + f"\n- URL: {self.html.requested_url}" + f"\n- From: {self.publisher}" + f'{" (Newest Entry from: " + self.publishing_date.strftime("%Y-%m-%d %H:%M") + ")" if self.publishing_date else ""}' + ) + + return dedent(text) From 01507dc5f241ada5cf7a278aad03e5c9462f8b15 Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Sun, 26 Oct 2025 23:59:03 +0100 Subject: [PATCH 2/9] detect `LiveTicker` in scraper --- src/fundus/publishers/de/tagesschau.py | 2 ++ src/fundus/scraping/scraper.py | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/fundus/publishers/de/tagesschau.py b/src/fundus/publishers/de/tagesschau.py index fde4678cd..7fea43625 100644 --- a/src/fundus/publishers/de/tagesschau.py +++ b/src/fundus/publishers/de/tagesschau.py @@ -28,6 +28,7 @@ class V1(BaseParser): _live_ticker_paragraph_selector = XPath("//p[contains(@class,'textabsatz ') and not(strong)]") _live_ticker_subheadline_selector = XPath("//h2[@class='meldung__subhead']") _live_ticker_date_selector = XPath("//div[@class='liveblog__datetime']") + _live_ticker_summary_selector = XPath("//article/p[strong]|//article/div/ul/li[not(@class)]") @attribute def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]: @@ -42,6 +43,7 @@ def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]: return extract_live_ticker_body_with_selector( doc=self.precomputed.doc, entry_boundary_selector=self._live_ticker_boundary_selector, + summary_selector=self._live_ticker_summary_selector, paragraph_selector=self._live_ticker_paragraph_selector, subheadline_selector=self._live_ticker_subheadline_selector, date_selector=self._live_ticker_date_selector, diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index e903b0c29..01455416a 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -4,8 +4,9 @@ from fundus.logging import create_logger from fundus.parser import ParserProxy +from fundus.parser.data import LiveTickerBody from fundus.publishers.base_objects import Publisher -from fundus.scraping.article import Article +from fundus.scraping.article import Article, LiveTicker from fundus.scraping.delay import Delay from fundus.scraping.filter import ( ExtractionFilter, @@ -61,7 +62,10 @@ def scrape( else: logger.debug(f"Skipped article at {html.requested_url!r} because of extraction filter") else: - article = Article(html=html, **extraction) + if "body" in extraction.keys() and isinstance(extraction["body"], LiveTickerBody): + article = LiveTicker(html=html, **extraction) + else: + article = Article(html=html, **extraction) if language_filter and article.lang not in language_filter: logger.debug( f"Skipped article at {html.requested_url!r} because article language: " From 3a7d2423dc299296af014500f67854badb0e8086 Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Tue, 28 Oct 2025 16:06:39 +0100 Subject: [PATCH 3/9] fix images and typing --- src/fundus/parser/data.py | 2 +- src/fundus/parser/utility.py | 6 +++--- src/fundus/scraping/article.py | 23 +++++++++++++++++--- src/fundus/scraping/crawler.py | 38 +++++++++++++++++----------------- src/fundus/scraping/scraper.py | 6 +++--- 5 files changed, 46 insertions(+), 29 deletions(-) diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index f97f598a8..ee21c1f14 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -423,7 +423,7 @@ def serialize(self) -> Dict[str, Any]: return { "summary": list(self.summary), "entries": [entry.serialize() for entry in self.entries], - "entry_metas": self.entry_meta_information, + "entry_meta_information": self.entry_meta_information, } @classmethod diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py index f297b11f0..bd88b2253 100644 --- a/src/fundus/parser/utility.py +++ b/src/fundus/parser/utility.py @@ -616,10 +616,10 @@ class CustomParserInfo(parser.parserinfo): ("Jul", "July", "Juli"), ("Aug", "August"), ("Sep", "Sept", "September"), - ("Oct", "October", "Oktober", "Okt"), + ("Oct", "October", "Oktober", "Okt"), # type: ignore[list-item] ("Nov", "November"), - ("Dec", "December", "Dezember", "Dez"), - ] + ("Dec", "December", "Dezember", "Dez"), # type: ignore[list-item] + ] # type ignore due to types-python-dateutil==2.9.0.20251008, see https://github.com/flairNLP/fundus/issues/806 def generic_date_parsing(date_str: Optional[str]) -> Optional[datetime]: diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/article.py index 3e84462fb..7fbc1202a 100644 --- a/src/fundus/scraping/article.py +++ b/src/fundus/scraping/article.py @@ -8,7 +8,7 @@ from fundus.logging import create_logger from fundus.parser import ArticleBody, Image -from fundus.parser.data import LiveTickerBody +from fundus.parser.data import LiveTickerBody, TextSequenceTree from fundus.scraping.html import HTML from fundus.utils.serialization import JSONVal, is_jsonable @@ -46,8 +46,8 @@ def title(self) -> Optional[str]: return self.__extraction__.get("title") @property - def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]: - return self.__extraction__.get("body") + def body(self) -> Optional[TextSequenceTree]: + raise NotImplementedError @property def authors(self) -> List[str]: @@ -180,6 +180,10 @@ def __str__(self): return dedent(text) + @property + def body(self) -> Optional[ArticleBody]: + return self.__extraction__.get("body") + class LiveTicker(Publication): def __str__(self): @@ -211,3 +215,16 @@ def __str__(self): ) return dedent(text) + + @property + def body(self) -> Optional[LiveTickerBody]: + return self.__extraction__.get("body") + + @property + def images(self) -> List[Image]: + images: List[Image] = self.__extraction__.get("images", []) + if self.body is None: + return images + for entry_meta in self.body.entry_meta_information: + images.extend(entry_meta.get("images", [])) + return images diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 03783940b..ae3adb386 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -52,7 +52,7 @@ from fundus.logging import create_logger, get_current_config from fundus.parser.data import remove_query_parameters_from_url from fundus.publishers.base_objects import FilteredPublisher, Publisher, PublisherGroup -from fundus.scraping.article import Article +from fundus.scraping.article import Article, Publication from fundus.scraping.delay import Delay from fundus.scraping.filter import ExtractionFilter, Requires, RequiresAll, URLFilter from fundus.scraping.html import CCNewsSource @@ -222,7 +222,7 @@ def _build_article_iterator( extraction_filter: Optional[ExtractionFilter], url_filter: Optional[URLFilter], language_filter: Optional[List[str]], - ) -> Iterator[Article]: + ) -> Iterator[Publication]: raise NotImplementedError def crawl( @@ -236,7 +236,7 @@ def crawl( language_filter: Optional[List[str]] = None, only_unique: bool = True, save_to_file: Union[None, str, Path] = None, - ) -> Iterator[Article]: + ) -> Iterator[Publication]: """Yields articles from initialized scrapers Args: @@ -269,7 +269,7 @@ def crawl( specified file as a JSON list. Returns: - Iterator[Article]: An iterator yielding objects of type Article. + Iterator[Publication]: An iterator yielding objects of type Article. """ if max_articles == 0: @@ -343,7 +343,7 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: logger.info(f"Publisher language filter: {publisher_language_filter} will be used as the language filter. ") article_count: Dict[str, int] = defaultdict(int) - crawled_articles: Dict[str, List[Article]] = defaultdict(list) + crawled_articles: Dict[str, List[Publication]] = defaultdict(list) # Unfortunately we relly on this little workaround here to terminate the 'Pool' used within # the 'CCNewsCrawler'. The 'Timeout' contextmanager utilizes '_thread.interrupt_main', @@ -465,7 +465,7 @@ def _fetch_articles( extraction_filter: Optional[ExtractionFilter] = None, url_filter: Optional[URLFilter] = None, language_filter: Optional[List[str]] = None, - ) -> Iterator[Article]: + ) -> Iterator[Publication]: def build_delay() -> Optional[Delay]: if isinstance(self.delay, float): delay = self.delay @@ -498,15 +498,15 @@ def constant_delay() -> float: @staticmethod def _single_crawl( - publishers: Tuple[Publisher, ...], article_task: Callable[[Publisher], Iterator[Article]] - ) -> Iterator[Article]: + publishers: Tuple[Publisher, ...], article_task: Callable[[Publisher], Iterator[Publication]] + ) -> Iterator[Publication]: article_iterators = [article_task(publisher) for publisher in publishers] yield from roundrobin(*article_iterators) def _threaded_crawl( - self, publishers: Tuple[Publisher, ...], article_task: Callable[[Publisher], Iterator[Article]] - ) -> Iterator[Article]: - result_queue: Queue[Union[Article, Exception]] = Queue(len(publishers)) + self, publishers: Tuple[Publisher, ...], article_task: Callable[[Publisher], Iterator[Publication]] + ) -> Iterator[Publication]: + result_queue: Queue[Union[Publication, Exception]] = Queue(len(publishers)) wrapped_article_task = queue_wrapper(result_queue, article_task) pool = ThreadPool(processes=len(publishers) or None) try: @@ -529,7 +529,7 @@ def _build_article_iterator( extraction_filter: Optional[ExtractionFilter], url_filter: Optional[URLFilter], language_filter: Optional[List[str]], - ) -> Iterator[Article]: + ) -> Iterator[Publication]: article_task = partial( self._fetch_articles, error_handling=error_handling, @@ -604,7 +604,7 @@ def _fetch_articles( url_filter: Optional[URLFilter] = None, language_filter: Optional[List[str]] = None, bar: Optional[tqdm] = None, - ) -> Iterator[Article]: + ) -> Iterator[Publication]: retries: int = 0 while True: source = CCNewsSource(*publishers, warc_path=warc_path) @@ -630,14 +630,14 @@ def _fetch_articles( @staticmethod def _single_crawl( - warc_paths: Tuple[str, ...], article_task: Callable[[str], Iterator[Article]] - ) -> Iterator[Article]: + warc_paths: Tuple[str, ...], article_task: Callable[[str], Iterator[Publication]] + ) -> Iterator[Publication]: for warc_path in warc_paths: yield from article_task(warc_path) def _parallel_crawl( - self, warc_paths: Tuple[str, ...], article_task: Callable[[str], Iterator[Article]] - ) -> Iterator[Article]: + self, warc_paths: Tuple[str, ...], article_task: Callable[[str], Iterator[Publication]] + ) -> Iterator[Publication]: # because logging configurations are overwritten when using 'spawn' as start method, # we have to get current logging configurations and initialize them in the new process if multiprocessing.get_start_method() == "spawn": @@ -654,7 +654,7 @@ def _parallel_crawl( processes=min(self.processes, len(warc_paths)), initializer=initializer, ) as pool: - result_queue: Queue[Union[Article, Exception]] = manager.Queue(maxsize=1000) + result_queue: Queue[Union[Publication, Exception]] = manager.Queue(maxsize=1000) # Because multiprocessing.Pool does not support iterators as targets, # we wrap the article_task to write the articles to a queue instead of returning them directly. @@ -738,7 +738,7 @@ def _build_article_iterator( url_filter: Optional[URLFilter], language_filter: Optional[List[str]], **kwargs, - ) -> Iterator[Article]: + ) -> Iterator[Publication]: warc_paths = tuple(self._get_warc_paths()) with get_proxy_tqdm(total=len(warc_paths), desc="Process WARC files", disable=self.disable_tqdm) as bar: diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index 01455416a..28412aca1 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -6,7 +6,7 @@ from fundus.parser import ParserProxy from fundus.parser.data import LiveTickerBody from fundus.publishers.base_objects import Publisher -from fundus.scraping.article import Article, LiveTicker +from fundus.scraping.article import Article, LiveTicker, Publication from fundus.scraping.delay import Delay from fundus.scraping.filter import ( ExtractionFilter, @@ -31,7 +31,7 @@ def scrape( extraction_filter: Optional[ExtractionFilter] = None, url_filter: Optional[URLFilter] = None, language_filter: Optional[List[str]] = None, - ) -> Iterator[Article]: + ) -> Iterator[Publication]: for source in self.sources: for html in source.fetch(url_filter=url_filter): parser = self.parser_mapping[html.source_info.publisher] @@ -63,7 +63,7 @@ def scrape( logger.debug(f"Skipped article at {html.requested_url!r} because of extraction filter") else: if "body" in extraction.keys() and isinstance(extraction["body"], LiveTickerBody): - article = LiveTicker(html=html, **extraction) + article: Publication = LiveTicker(html=html, **extraction) else: article = Article(html=html, **extraction) if language_filter and article.lang not in language_filter: From 4a15f5ae87290b2c79c6fda3558e026f98c3745c Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Tue, 28 Oct 2025 23:30:26 +0100 Subject: [PATCH 4/9] add LiveTicker support for SZ --- src/fundus/publishers/de/sz.py | 37 +++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/src/fundus/publishers/de/sz.py b/src/fundus/publishers/de/sz.py index a677a22c5..15bd0691e 100644 --- a/src/fundus/publishers/de/sz.py +++ b/src/fundus/publishers/de/sz.py @@ -1,12 +1,14 @@ import datetime -from typing import List, Optional +from typing import List, Optional, Union from lxml.cssselect import CSSSelector from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute +from fundus.parser.data import LiveTickerBody from fundus.parser.utility import ( extract_article_body_with_selector, + extract_live_ticker_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, @@ -65,3 +67,36 @@ class V1_1(V1): "//div[@itemprop='articleBody']//h3[@data-manual='subheadline'] |" "//div[@itemprop='articleBody']//h2[@data-manual='subheadline']" ) + + _live_ticker_boundary_selector = XPath("//div[contains(@class, 'event__body')]") + _live_ticker_paragraph_selector = XPath( + "//article//div[contains(@class, 'event__body')]//li|//article//div[contains(@class, 'event__body')]//div[@class='tik4-rich-text tik4-rich-text--de']/div" + ) + _live_ticker_subheadline_selector = XPath( + "//article//div[contains(@class, 'event__body')]//h2|//article//div[contains(@class, 'event__body')]//h3" + ) + _live_ticker_date_selector = XPath("//article//div[contains(@class, 'event__body')]//time") + _live_ticker_author_selector = XPath( + "//article//div[contains(@class, 'event__body')]//div[@class='tik4-author__name']" + ) + _live_ticker_summary_selector = XPath("//p[@data-manual='teaserText']") + + @attribute + def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]: + if not self._live_ticker_boundary_selector(self.precomputed.doc): + return extract_article_body_with_selector( + self.precomputed.doc, + summary_selector=self._summary_selector, + subheadline_selector=self._subheadline_selector, + paragraph_selector=self._paragraph_selector, + ) + else: + return extract_live_ticker_body_with_selector( + self.precomputed.doc, + summary_selector=self._live_ticker_summary_selector, + subheadline_selector=self._live_ticker_subheadline_selector, + paragraph_selector=self._live_ticker_paragraph_selector, + entry_boundary_selector=self._live_ticker_boundary_selector, + author_selector=self._live_ticker_author_selector, + date_selector=self._live_ticker_date_selector, + ) From f369498f12d8fbb07193bca20571ac4d07c28fd6 Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Tue, 28 Oct 2025 23:31:17 +0100 Subject: [PATCH 5/9] fix author attribute, entry date parsing --- src/fundus/parser/utility.py | 16 +++++++++++++--- src/fundus/scraping/article.py | 11 ++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py index bd88b2253..767b216b5 100644 --- a/src/fundus/parser/utility.py +++ b/src/fundus/parser/utility.py @@ -126,6 +126,7 @@ class SummaryNode(Node): pass +@dataclass(eq=False) class BoundaryNode(Node): def __post_init__(self): self.position -= 0.5 # in case a content node is also a boundary node, we want the boundary to come first @@ -135,8 +136,17 @@ class SubheadNode(Node): pass +@dataclass(eq=False) class DateNode(Node): - pass + _datetime_selector = XPath("./@datetime") + _timestamp: Optional[str] = None + + def __post_init__(self): + if (timestamp := self._datetime_selector(self.node)) is not None: + self._timestamp = " ".join(generic_nodes_to_text(timestamp)) + + def text_content(self, excluded_tags: Optional[List[str]] = None, tag_filter: Optional[XPath] = None) -> str: + return self._timestamp if self._timestamp else super().text_content(excluded_tags, tag_filter) class AuthorNode(Node): @@ -271,9 +281,9 @@ def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlEl elif isinstance(node, ParagraphNode): paragraph_nodes.append(node) elif isinstance(node, DateNode): - entry_date = generic_date_parsing("".join(generic_nodes_to_text([node.node]))) + entry_date = generic_date_parsing("".join(node.text_content())) elif isinstance(node, AuthorNode): - entry_authors = generic_author_parsing(generic_nodes_to_text([node.node])) + entry_authors = generic_author_parsing(node.text_content()) elif isinstance(node, ImageNode): entry_images = image_selection_helper(node.node) if image_selection_helper else [] else: diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/article.py index 7fbc1202a..ed35ab774 100644 --- a/src/fundus/scraping/article.py +++ b/src/fundus/scraping/article.py @@ -220,9 +220,18 @@ def __str__(self): def body(self) -> Optional[LiveTickerBody]: return self.__extraction__.get("body") + @property + def authors(self) -> List[str]: + authors: List[str] = super().authors + if self.body is None: + return authors + for entry_meta in self.body.entry_meta_information: + authors.extend(entry_meta.get("authors", [])) + return list(set(authors)) + @property def images(self) -> List[Image]: - images: List[Image] = self.__extraction__.get("images", []) + images: List[Image] = super().images if self.body is None: return images for entry_meta in self.body.entry_meta_information: From d3924e146172efa939ea275466a96b66329b9b5b Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Wed, 5 Nov 2025 21:09:45 +0100 Subject: [PATCH 6/9] rename `article.py` to `publication.py` --- scripts/generate_parser_test_files.py | 2 +- scripts/publisher_coverage.py | 2 +- src/fundus/__init__.py | 2 +- src/fundus/scraping/crawler.py | 2 +- src/fundus/scraping/{article.py => publication.py} | 0 src/fundus/scraping/scraper.py | 2 +- tests/utility.py | 2 +- 7 files changed, 6 insertions(+), 6 deletions(-) rename src/fundus/scraping/{article.py => publication.py} (100%) diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py index daf49e187..25f7fc3f5 100644 --- a/scripts/generate_parser_test_files.py +++ b/scripts/generate_parser_test_files.py @@ -9,7 +9,7 @@ from fundus import Crawler, PublisherCollection from fundus.logging import create_logger, set_log_level from fundus.publishers.base_objects import Publisher -from fundus.scraping.article import Article +from fundus.scraping.publication import Article from fundus.scraping.filter import RequiresAll from fundus.scraping.html import WebSource from fundus.scraping.scraper import BaseScraper diff --git a/scripts/publisher_coverage.py b/scripts/publisher_coverage.py index c4783ea3c..aab035ffa 100644 --- a/scripts/publisher_coverage.py +++ b/scripts/publisher_coverage.py @@ -11,7 +11,7 @@ from fundus import Crawler, PublisherCollection from fundus.publishers.base_objects import Publisher, PublisherGroup -from fundus.scraping.article import Article +from fundus.scraping.publication import Article from fundus.scraping.session import socket_timeout diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index cff53f8cf..ed4d81755 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -3,7 +3,7 @@ from langdetect import DetectorFactory from fundus.publishers import PublisherCollection -from fundus.scraping.article import Article +from fundus.scraping.publication import Article from fundus.scraping.crawler import CCNewsCrawler, Crawler, CrawlerBase from fundus.scraping.filter import Requires from fundus.scraping.url import NewsMap, RSSFeed, Sitemap diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index ae3adb386..9110b9c77 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -52,7 +52,7 @@ from fundus.logging import create_logger, get_current_config from fundus.parser.data import remove_query_parameters_from_url from fundus.publishers.base_objects import FilteredPublisher, Publisher, PublisherGroup -from fundus.scraping.article import Article, Publication +from fundus.scraping.publication import Article, Publication from fundus.scraping.delay import Delay from fundus.scraping.filter import ExtractionFilter, Requires, RequiresAll, URLFilter from fundus.scraping.html import CCNewsSource diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/publication.py similarity index 100% rename from src/fundus/scraping/article.py rename to src/fundus/scraping/publication.py diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index 28412aca1..9ee913100 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -6,7 +6,7 @@ from fundus.parser import ParserProxy from fundus.parser.data import LiveTickerBody from fundus.publishers.base_objects import Publisher -from fundus.scraping.article import Article, LiveTicker, Publication +from fundus.scraping.publication import Article, LiveTicker, Publication from fundus.scraping.delay import Delay from fundus.scraping.filter import ( ExtractionFilter, diff --git a/tests/utility.py b/tests/utility.py index b7716ad03..32003a9dc 100644 --- a/tests/utility.py +++ b/tests/utility.py @@ -13,7 +13,7 @@ from fundus.parser import ArticleBody, BaseParser from fundus.parser.data import Image, TextSequenceTree from fundus.publishers.base_objects import Publisher, PublisherGroup -from fundus.scraping.article import Article +from fundus.scraping.publication import Article from fundus.scraping.html import HTML, SourceInfo from scripts.generate_tables import supported_publishers_markdown_path from tests.resources.parser.test_data import __module_path__ as test_resource_path From 090c5eb2f43623c9ed57bd5be3e76fce6f6da54e Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Wed, 5 Nov 2025 21:52:33 +0100 Subject: [PATCH 7/9] add pretty print --- src/fundus/__init__.py | 2 +- src/fundus/parser/data.py | 26 ++++++++++++++++++++------ src/fundus/scraping/crawler.py | 2 +- src/fundus/scraping/scraper.py | 2 +- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index ed4d81755..96e4d160f 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -3,9 +3,9 @@ from langdetect import DetectorFactory from fundus.publishers import PublisherCollection -from fundus.scraping.publication import Article from fundus.scraping.crawler import CCNewsCrawler, Crawler, CrawlerBase from fundus.scraping.filter import Requires +from fundus.scraping.publication import Article from fundus.scraping.url import NewsMap, RSSFeed, Sitemap __module_path__ = pathlib.Path(__file__).parent diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index ee21c1f14..d1a802bab 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -332,14 +332,14 @@ def __eq__(self, other: object) -> bool: class TextSequenceTree(ABC): """Base class to traverse and build trees of TextSequence.""" - def as_text_sequence(self) -> TextSequence: - texts = [text for tl in self.df_traversal() for text in tl] + def as_text_sequence(self, iterator: Optional[Iterator[Any]] = None) -> TextSequence: + texts = [text for tl in self.df_traversal(iterator=iterator) for text in tl] return TextSequence(texts) - def text(self, join_on: str = "\n\n") -> str: - return join_on.join(self.as_text_sequence()) + def text(self, join_on: str = "\n\n", iterator: Optional[Iterator[Any]] = None) -> str: + return join_on.join(self.as_text_sequence(iterator=iterator)) - def df_traversal(self) -> Iterable[TextSequence]: + def df_traversal(self, iterator: Optional[Iterator[Any]] = None) -> Iterable[TextSequence]: def recursion(o: object): if isinstance(o, TextSequence): yield o @@ -349,7 +349,7 @@ def recursion(o: object): else: yield o - for value in self: + for value in iter(self) if not iterator else iterator: yield from recursion(value) @abstractmethod @@ -444,6 +444,20 @@ def __iter__(self) -> Iterator[Any]: field_values.extend([entry.sections for entry in self.entries]) yield from field_values + def __meta_iter__(self) -> Iterator[Any]: + field_values = [ + getattr(self, f.name) for f in fields(self) if f.name not in ("entry_meta_information", "entries") + ] + for entry, meta in zip(self.entries, self.entry_meta_information): + field_values.append( + TextSequence([f"LiveTicker entry from {meta.get('date')} by {', '.join(meta.get('authors', []))}"]) + ) + field_values.extend([entry.sections]) + yield from field_values + + def pretty_print(self): + return self.text(iterator=self.__meta_iter__()) + @total_ordering @dataclass diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 9110b9c77..01b5c33a2 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -52,10 +52,10 @@ from fundus.logging import create_logger, get_current_config from fundus.parser.data import remove_query_parameters_from_url from fundus.publishers.base_objects import FilteredPublisher, Publisher, PublisherGroup -from fundus.scraping.publication import Article, Publication from fundus.scraping.delay import Delay from fundus.scraping.filter import ExtractionFilter, Requires, RequiresAll, URLFilter from fundus.scraping.html import CCNewsSource +from fundus.scraping.publication import Article, Publication from fundus.scraping.scraper import CCNewsScraper, WebScraper from fundus.scraping.session import session_handler from fundus.scraping.url import URLSource diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index 9ee913100..716a16e0d 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -6,7 +6,6 @@ from fundus.parser import ParserProxy from fundus.parser.data import LiveTickerBody from fundus.publishers.base_objects import Publisher -from fundus.scraping.publication import Article, LiveTicker, Publication from fundus.scraping.delay import Delay from fundus.scraping.filter import ( ExtractionFilter, @@ -14,6 +13,7 @@ URLFilter, ) from fundus.scraping.html import CCNewsSource, HTMLSource, WebSource +from fundus.scraping.publication import Article, LiveTicker, Publication from fundus.scraping.url import URLSource from fundus.utils.events import __EVENTS__ From 8c13ce5130243246387e6e05dfd63784bb9b522a Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Wed, 5 Nov 2025 23:17:16 +0100 Subject: [PATCH 8/9] make `LiveTicker` iterable --- src/fundus/parser/data.py | 4 +++- src/fundus/parser/utility.py | 7 ++++++- src/fundus/scraping/publication.py | 24 +++++++++++++++++++++++- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index d1a802bab..8239b435c 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -450,7 +450,9 @@ def __meta_iter__(self) -> Iterator[Any]: ] for entry, meta in zip(self.entries, self.entry_meta_information): field_values.append( - TextSequence([f"LiveTicker entry from {meta.get('date')} by {', '.join(meta.get('authors', []))}"]) + TextSequence( + [f"LiveTicker entry from {meta.get('publishing_date')} by {', '.join(meta.get('authors', []))}"] + ) ) field_values.extend([entry.sections]) yield from field_values diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py index 767b216b5..98aaa606a 100644 --- a/src/fundus/parser/utility.py +++ b/src/fundus/parser/utility.py @@ -35,6 +35,7 @@ from dateutil import parser from lxml.cssselect import CSSSelector from lxml.etree import XPath +from lxml.html import HtmlElement from fundus.logging import create_logger from fundus.parser.data import ( @@ -275,7 +276,9 @@ def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlEl entry_date = None entry_authors = [] entry_images: List[Image] = [] + wrapper = HtmlElement("div") for node in entry: + wrapper.append(node.node) if isinstance(node, SubheadNode): subhead_nodes.append(node) elif isinstance(node, ParagraphNode): @@ -310,7 +313,9 @@ def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlEl sections.append(ArticleSection(*map(TextSequence, texts))) entries.append(ArticleBody(summary=TextSequence([]), sections=sections)) - entries_meta_information.append({"date": entry_date, "authors": entry_authors, "images": entry_images}) + entries_meta_information.append( + {"publishing_date": entry_date, "authors": entry_authors, "images": entry_images, "html": str(wrapper)} + ) return LiveTickerBody(summary=summary, entries=entries, entry_meta_information=entries_meta_information) diff --git a/src/fundus/scraping/publication.py b/src/fundus/scraping/publication.py index ed35ab774..37d51e8eb 100644 --- a/src/fundus/scraping/publication.py +++ b/src/fundus/scraping/publication.py @@ -1,6 +1,6 @@ from datetime import datetime from textwrap import TextWrapper, dedent -from typing import Any, Dict, List, Mapping, Optional, Union +from typing import Any, Dict, Iterator, List, Mapping, Optional, Union import langdetect import lxml.html @@ -216,6 +216,28 @@ def __str__(self): return dedent(text) + def __iter__(self) -> Iterator[Article]: + if not self.body: + return + for idx, (entry, metas) in enumerate(zip(self.body.entries, self.body.entry_meta_information)): + html_entry = HTML( + requested_url=self.html.requested_url, + responded_url=self.html.responded_url, + content=metas.get("html", ""), + crawl_date=self.html.crawl_date, + source_info=self.html.source_info, + ) + if not (title := str(entry.sections[0].headline)).strip(): + title = f"LiveTicker Entry #{idx + 1}" + yield Article( + html=html_entry, + body=entry, + title=title, + authors=metas.get("authors", []), + images=metas.get("images", []), + publishing_date=metas.get("publishing_date", None), + ) + @property def body(self) -> Optional[LiveTickerBody]: return self.__extraction__.get("body") From 0c71b2291133155eceabdec225a1fb10e72c0ad9 Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Wed, 5 Nov 2025 23:34:57 +0100 Subject: [PATCH 9/9] fix html --- src/fundus/parser/utility.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py index 98aaa606a..09b7d5601 100644 --- a/src/fundus/parser/utility.py +++ b/src/fundus/parser/utility.py @@ -34,8 +34,8 @@ import validators from dateutil import parser from lxml.cssselect import CSSSelector -from lxml.etree import XPath -from lxml.html import HtmlElement +from lxml.etree import XPath, tostring +from lxml.html import Element from fundus.logging import create_logger from fundus.parser.data import ( @@ -276,7 +276,7 @@ def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlEl entry_date = None entry_authors = [] entry_images: List[Image] = [] - wrapper = HtmlElement("div") + wrapper = Element("div") for node in entry: wrapper.append(node.node) if isinstance(node, SubheadNode): @@ -314,7 +314,7 @@ def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlEl entries.append(ArticleBody(summary=TextSequence([]), sections=sections)) entries_meta_information.append( - {"publishing_date": entry_date, "authors": entry_authors, "images": entry_images, "html": str(wrapper)} + {"publishing_date": entry_date, "authors": entry_authors, "images": entry_images, "html": tostring(wrapper)} ) return LiveTickerBody(summary=summary, entries=entries, entry_meta_information=entries_meta_information)