From ed57e179c5f24321560661179e208b248ab47cce Mon Sep 17 00:00:00 2001
From: Adrian Breiding <ad123br@gmail.com>
Date: Thu, 16 Oct 2025 23:33:06 +0200
Subject: [PATCH 1/9] draft: add LiveTickerBody extraction and Publication base class

---
 src/fundus/parser/data.py              |  32 +++++++
 src/fundus/parser/utility.py           | 116 ++++++++++++++++++++++++-
 src/fundus/publishers/de/tagesschau.py |  32 +++++--
 src/fundus/scraping/article.py         |  41 ++++++++-
 4 files changed, 209 insertions(+), 12 deletions(-)

diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py
index c8d0dd362..f97f598a8 100644
--- a/src/fundus/parser/data.py
+++ b/src/fundus/parser/data.py
@@ -413,6 +413,38 @@ def __bool__(self):
         return any(bool(section) for section in self.sections)
 
 
+@dataclass
+class LiveTickerBody(TextSequenceTree):
+    summary: TextSequence
+    entries: List[ArticleBody]
+    entry_meta_information: List[Dict[str, Any]]
+
+    def serialize(self) -> Dict[str, Any]:
+        return {
+            "summary": list(self.summary),
+            "entries": [entry.serialize() for entry in self.entries],
+            "entry_metas": self.entry_meta_information,
+        }
+
+    @classmethod
+    def deserialize(cls, serialized: Dict[str, Any]) -> Self:
+        return cls(
+            summary=TextSequence(serialized["summary"]),
+            entries=[ArticleBody.deserialize(entry) for entry in serialized["entries"]],
+            entry_meta_information=serialized["entry_meta_information"],
+        )
+
+    def __bool__(self):
+        return any(bool(entry) for entry in self.entries)
+
+    def __iter__(self) -> Iterator[Any]:
+        field_values = [
+            getattr(self, f.name) for f in fields(self) if f.name not in ("entry_meta_information", "entries")
+        ]
+        field_values.extend([entry.sections for entry in self.entries])
+        yield from field_values
+
+
 @total_ordering
 @dataclass
 class Dimension(DataclassSerializationMixin):
diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index b0a600cd4..f297b11f0 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -10,6 +10,7 @@
 from datetime import datetime
 from functools import total_ordering
 from typing import (
+    Any,
     Callable,
     ClassVar,
     Dict,
@@ -44,6 +45,7 @@
     Image,
     ImageVersion,
     LinkedDataMapping,
+    LiveTickerBody,
     TextSequence,
 )
 from fundus.utils.regex import _get_match_dict
@@ -69,7 +71,7 @@ def normalize_whitespace(text: str) -> str:
 @total_ordering
 @dataclass(eq=False)
 class Node:
-    position: int
+    position: float
     node: lxml.html.HtmlElement = field(compare=False)
     _break_selector: ClassVar[XPath] = XPath("*//br")
 
@@ -124,10 +126,27 @@ class SummaryNode(Node):
     pass
 
 
+class BoundaryNode(Node):
+    def __post_init__(self):
+        self.position -= 0.5  # in case a content node is also a boundary node, we want the boundary to come first
+
+
 class SubheadNode(Node):
     pass
 
 
+class DateNode(Node):
+    pass
+
+
+class AuthorNode(Node):
+    pass
+
+
+class ImageNode(Node):
+    pass
+
+
 class ParagraphNode(Node):
     pass
 
@@ -190,6 +209,101 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
     return ArticleBody(summary=summary, sections=sections)
 
 
+def extract_live_ticker_body_with_selector(
+    doc: lxml.html.HtmlElement,
+    paragraph_selector: XPath,
+    summary_selector: Optional[XPath] = None,
+    subheadline_selector: Optional[XPath] = None,
+    entry_boundary_selector: Optional[XPath] = None,
+    tag_filter: Optional[XPath] = None,
+    date_selector: Optional[XPath] = None,
+    author_selector: Optional[XPath] = None,
+    image_selector: Optional[XPath] = None,
+    image_selection_helper: Optional[Callable[[lxml.html.HtmlElement], List[Image]]] = None,
+) -> LiveTickerBody:
+    # depth first index for each element in tree
+    df_idx_by_ref = {element: i for i, element in enumerate(doc.iter())}
+
+    def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlElement = doc) -> List[Node]:
+        if not selector and node_type:
+            raise ValueError("Both a selector and node type are required")
+
+        return [node for element in selector(root) if (node := node_type(df_idx_by_ref[element], element))]
+
+    summary_nodes = extract_nodes(summary_selector, SummaryNode) if summary_selector else []
+    boundary_nodes = extract_nodes(entry_boundary_selector, BoundaryNode) if entry_boundary_selector else []
+    paragraph_nodes = extract_nodes(paragraph_selector, ParagraphNode)
+    subhead_nodes = extract_nodes(subheadline_selector, SubheadNode) if subheadline_selector else []
+    date_nodes = extract_nodes(date_selector, DateNode) if date_selector else []
+    author_nodes = extract_nodes(author_selector, AuthorNode) if author_selector else []
+    image_nodes = extract_nodes(image_selector, ImageNode) if image_selector else []
+    nodes = sorted(
+        summary_nodes + boundary_nodes + subhead_nodes + paragraph_nodes + date_nodes + author_nodes + image_nodes
+    )
+
+    if not nodes[: len(summary_nodes)] == summary_nodes:
+        raise ValueError(f"All summary nodes should be at the beginning of the article")
+
+    summary = TextSequence(
+        map(
+            lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"], tag_filter=tag_filter)),
+            summary_nodes,
+        )
+    )
+
+    entries: List[ArticleBody] = []
+    entries_meta_information: List[Dict[str, Any]] = []
+    entry_nodes = more_itertools.split_at(nodes[len(summary_nodes) :], pred=lambda x: isinstance(x, BoundaryNode))
+
+    for entry in entry_nodes:
+        if not entry:
+            continue
+        content_nodes = filter(lambda x: isinstance(x, ParagraphNode) or isinstance(x, SubheadNode), entry)
+        instructions = more_itertools.split_when(content_nodes, pred=lambda x, y: type(x) != type(y))
+        subhead_nodes = []
+        paragraph_nodes = []
+        entry_date = None
+        entry_authors = []
+        entry_images: List[Image] = []
+        for node in entry:
+            if isinstance(node, SubheadNode):
+                subhead_nodes.append(node)
+            elif isinstance(node, ParagraphNode):
+                paragraph_nodes.append(node)
+            elif isinstance(node, DateNode):
+                entry_date = generic_date_parsing("".join(generic_nodes_to_text([node.node])))
+            elif isinstance(node, AuthorNode):
+                entry_authors = generic_author_parsing(generic_nodes_to_text([node.node]))
+            elif isinstance(node, ImageNode):
+                entry_images = image_selection_helper(node.node) if image_selection_helper else []
+            else:
+                raise ValueError(f"Unsupported node type: {type(node)}")
+
+        if not subhead_nodes or (paragraph_nodes and subhead_nodes[0] > paragraph_nodes[0]):
+            first = next(instructions)
+            instructions = itertools.chain([first, []], instructions)
+
+        sections: List[ArticleSection] = []
+
+        for chunk in more_itertools.chunked(instructions, 2):
+            if len(chunk) == 1:
+                chunk.append([])
+            texts = [
+                list(
+                    map(
+                        lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"], tag_filter=tag_filter)),
+                        c,
+                    )
+                )
+                for c in chunk
+            ]
+            sections.append(ArticleSection(*map(TextSequence, texts)))
+
+        entries.append(ArticleBody(summary=TextSequence([]), sections=sections))
+        entries_meta_information.append({"date": entry_date, "authors": entry_authors, "images": entry_images})
+    return LiveTickerBody(summary=summary, entries=entries, entry_meta_information=entries_meta_information)
+
+
 _ld_node_selector = XPath("//script[@type='application/ld+json']")
 _json_pattern = re.compile(r"(?P<json>{[\s\S]*}|\[\s*{[\s\S]*}\s*](?!\s*}))")
 _json_undefined = re.compile(r'(?P<key>"[^"]*?"):\s*undefined')
diff --git a/src/fundus/publishers/de/tagesschau.py b/src/fundus/publishers/de/tagesschau.py
index 75716a914..fde4678cd 100644
--- a/src/fundus/publishers/de/tagesschau.py
+++ b/src/fundus/publishers/de/tagesschau.py
@@ -1,13 +1,15 @@
 import datetime
 import re
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from lxml.cssselect import CSSSelector
 from lxml.etree import XPath
 
 from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
+from fundus.parser.data import LiveTickerBody
 from fundus.parser.utility import (
     extract_article_body_with_selector,
+    extract_live_ticker_body_with_selector,
     generic_author_parsing,
     generic_date_parsing,
     image_extraction,
@@ -22,14 +24,28 @@ class V1(BaseParser):
         _author_selector = XPath('string(//div[contains(@class, "authorline__author")])')
         _topic_selector = CSSSelector("div.meldungsfooter .taglist a")
 
+        _live_ticker_boundary_selector = XPath("//div[contains(@class, 'liveblog--anchor')]")
+        _live_ticker_paragraph_selector = XPath("//p[contains(@class,'textabsatz ') and not(strong)]")
+        _live_ticker_subheadline_selector = XPath("//h2[@class='meldung__subhead']")
+        _live_ticker_date_selector = XPath("//div[@class='liveblog__datetime']")
+
         @attribute
-        def body(self) -> Optional[ArticleBody]:
-            return extract_article_body_with_selector(
-                self.precomputed.doc,
-                summary_selector=self._summary_selector,
-                subheadline_selector=self._subheadline_selector,
-                paragraph_selector=self._paragraph_selector,
-            )
+        def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]:
+            if not self._live_ticker_boundary_selector(self.precomputed.doc):
+                return extract_article_body_with_selector(
+                    self.precomputed.doc,
+                    summary_selector=self._summary_selector,
+                    subheadline_selector=self._subheadline_selector,
+                    paragraph_selector=self._paragraph_selector,
+                )
+            else:
+                return extract_live_ticker_body_with_selector(
+                    doc=self.precomputed.doc,
+                    entry_boundary_selector=self._live_ticker_boundary_selector,
+                    paragraph_selector=self._live_ticker_paragraph_selector,
+                    subheadline_selector=self._live_ticker_subheadline_selector,
+                    date_selector=self._live_ticker_date_selector,
+                )
 
         @attribute
         def authors(self) -> List[str]:
diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/article.py
index b06c40359..3e84462fb 100644
--- a/src/fundus/scraping/article.py
+++ b/src/fundus/scraping/article.py
@@ -1,6 +1,6 @@
 from datetime import datetime
 from textwrap import TextWrapper, dedent
-from typing import Any, Dict, List, Mapping, Optional
+from typing import Any, Dict, List, Mapping, Optional, Union
 
 import langdetect
 import lxml.html
@@ -8,6 +8,7 @@
 
 from fundus.logging import create_logger
 from fundus.parser import ArticleBody, Image
+from fundus.parser.data import LiveTickerBody
 from fundus.scraping.html import HTML
 from fundus.utils.serialization import JSONVal, is_jsonable
 
@@ -27,7 +28,7 @@ def __set__(self, obj, value):
         raise AttributeError("attribute is read only")
 
 
-class Article:
+class Publication:
     __extraction__: Mapping[str, Any] = {}
 
     def __init__(self, *, html: HTML, exception: Optional[Exception] = None, **extraction: Any) -> None:
@@ -45,7 +46,7 @@ def title(self) -> Optional[str]:
         return self.__extraction__.get("title")
 
     @property
-    def body(self) -> Optional[ArticleBody]:
+    def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]:
         return self.__extraction__.get("body")
 
     @property
@@ -150,6 +151,8 @@ def serialize(v: Any) -> JSONVal:
 
         return serialization
 
+
+class Article(Publication):
     def __str__(self):
         # the subsequent indent here is a bit wacky, but textwrapper.dedent won't work with tabs, so we have to use
         # whitespaces instead.
@@ -176,3 +179,35 @@ def __str__(self):
         )
 
         return dedent(text)
+
+
+class LiveTicker(Publication):
+    def __str__(self):
+        # the subsequent indent here is a bit wacky, but textwrap.dedent won't work with tabs, so we have to use
+        # spaces instead.
+        title_wrapper = TextWrapper(width=80, max_lines=1, initial_indent="")
+        text_wrapper = TextWrapper(width=80, max_lines=2, initial_indent="", subsequent_indent="          ")
+        wrapped_title = title_wrapper.fill(
+            f"{Fore.RED}--missing title--{Style.RESET_ALL}" if self.title is None else self.title.strip()
+        )
+        wrapped_plaintext = text_wrapper.fill(
+            f"{Fore.RED}--missing plaintext--{Style.RESET_ALL}" if self.plaintext is None else self.plaintext.strip()
+        )
+
+        summary_text = (
+            f" including {len(self.body.entries if hasattr(self.body, 'entries') and self.body is not None else [])} entries"
+            f" and {len(self.images)} image(s)"
+            if self.images and not isinstance(self.images, Exception)
+            else ""
+        )
+
+        text = (
+            f"Fundus-LiveTicker{summary_text}:"
+            f'\n- Title: "{wrapped_title}"'
+            f'\n- Text:  "{wrapped_plaintext}"'
+            f"\n- URL:    {self.html.requested_url}"
+            f"\n- From:   {self.publisher}"
+            f'{" (Newest Entry from: " + self.publishing_date.strftime("%Y-%m-%d %H:%M") + ")" if self.publishing_date else ""}'
+        )
+
+        return dedent(text)

From 01507dc5f241ada5cf7a278aad03e5c9462f8b15 Mon Sep 17 00:00:00 2001
From: Adrian Breiding <ad123br@gmail.com>
Date: Sun, 26 Oct 2025 23:59:03 +0100
Subject: [PATCH 2/9] detect `LiveTicker` in scraper

---
 src/fundus/publishers/de/tagesschau.py | 2 ++
 src/fundus/scraping/scraper.py         | 8 ++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/fundus/publishers/de/tagesschau.py b/src/fundus/publishers/de/tagesschau.py
index fde4678cd..7fea43625 100644
--- a/src/fundus/publishers/de/tagesschau.py
+++ b/src/fundus/publishers/de/tagesschau.py
@@ -28,6 +28,7 @@ class V1(BaseParser):
         _live_ticker_paragraph_selector = XPath("//p[contains(@class,'textabsatz ') and not(strong)]")
         _live_ticker_subheadline_selector = XPath("//h2[@class='meldung__subhead']")
         _live_ticker_date_selector = XPath("//div[@class='liveblog__datetime']")
+        _live_ticker_summary_selector = XPath("//article/p[strong]|//article/div/ul/li[not(@class)]")
 
         @attribute
         def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]:
@@ -42,6 +43,7 @@ def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]:
                 return extract_live_ticker_body_with_selector(
                     doc=self.precomputed.doc,
                     entry_boundary_selector=self._live_ticker_boundary_selector,
+                    summary_selector=self._live_ticker_summary_selector,
                     paragraph_selector=self._live_ticker_paragraph_selector,
                     subheadline_selector=self._live_ticker_subheadline_selector,
                     date_selector=self._live_ticker_date_selector,
diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py
index e903b0c29..01455416a 100644
--- a/src/fundus/scraping/scraper.py
+++ b/src/fundus/scraping/scraper.py
@@ -4,8 +4,9 @@
 
 from fundus.logging import create_logger
 from fundus.parser import ParserProxy
+from fundus.parser.data import LiveTickerBody
 from fundus.publishers.base_objects import Publisher
-from fundus.scraping.article import Article
+from fundus.scraping.article import Article, LiveTicker
 from fundus.scraping.delay import Delay
 from fundus.scraping.filter import (
     ExtractionFilter,
@@ -61,7 +62,10 @@ def scrape(
                         else:
                             logger.debug(f"Skipped article at {html.requested_url!r} because of extraction filter")
                     else:
-                        article = Article(html=html, **extraction)
+                        if "body" in extraction.keys() and isinstance(extraction["body"], LiveTickerBody):
+                            article = LiveTicker(html=html, **extraction)
+                        else:
+                            article = Article(html=html, **extraction)
                         if language_filter and article.lang not in language_filter:
                             logger.debug(
                                 f"Skipped article at {html.requested_url!r} because article language: "

From 3a7d2423dc299296af014500f67854badb0e8086 Mon Sep 17 00:00:00 2001
From: Adrian Breiding <ad123br@gmail.com>
Date: Tue, 28 Oct 2025 16:06:39 +0100
Subject: [PATCH 3/9] fix images and typing

---
 src/fundus/parser/data.py      |  2 +-
 src/fundus/parser/utility.py   |  6 +++---
 src/fundus/scraping/article.py | 23 +++++++++++++++++---
 src/fundus/scraping/crawler.py | 38 +++++++++++++++++-----------------
 src/fundus/scraping/scraper.py |  6 +++---
 5 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py
index f97f598a8..ee21c1f14 100644
--- a/src/fundus/parser/data.py
+++ b/src/fundus/parser/data.py
@@ -423,7 +423,7 @@ def serialize(self) -> Dict[str, Any]:
         return {
             "summary": list(self.summary),
             "entries": [entry.serialize() for entry in self.entries],
-            "entry_metas": self.entry_meta_information,
+            "entry_meta_information": self.entry_meta_information,
         }
 
     @classmethod
diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index f297b11f0..bd88b2253 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -616,10 +616,10 @@ class CustomParserInfo(parser.parserinfo):
         ("Jul", "July", "Juli"),
         ("Aug", "August"),
         ("Sep", "Sept", "September"),
-        ("Oct", "October", "Oktober", "Okt"),
+        ("Oct", "October", "Oktober", "Okt"),  # type: ignore[list-item]
         ("Nov", "November"),
-        ("Dec", "December", "Dezember", "Dez"),
-    ]
+        ("Dec", "December", "Dezember", "Dez"),  # type: ignore[list-item]
+    ]  # type ignore due to types-python-dateutil==2.9.0.20251008, see https://github.com/flairNLP/fundus/issues/806
 
 
 def generic_date_parsing(date_str: Optional[str]) -> Optional[datetime]:
diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/article.py
index 3e84462fb..7fbc1202a 100644
--- a/src/fundus/scraping/article.py
+++ b/src/fundus/scraping/article.py
@@ -8,7 +8,7 @@
 
 from fundus.logging import create_logger
 from fundus.parser import ArticleBody, Image
-from fundus.parser.data import LiveTickerBody
+from fundus.parser.data import LiveTickerBody, TextSequenceTree
 from fundus.scraping.html import HTML
 from fundus.utils.serialization import JSONVal, is_jsonable
 
@@ -46,8 +46,8 @@ def title(self) -> Optional[str]:
         return self.__extraction__.get("title")
 
     @property
-    def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]:
-        return self.__extraction__.get("body")
+    def body(self) -> Optional[TextSequenceTree]:
+        raise NotImplementedError
 
     @property
     def authors(self) -> List[str]:
@@ -180,6 +180,10 @@ def __str__(self):
 
         return dedent(text)
 
+    @property
+    def body(self) -> Optional[ArticleBody]:
+        return self.__extraction__.get("body")
+
 
 class LiveTicker(Publication):
     def __str__(self):
@@ -211,3 +215,16 @@ def __str__(self):
         )
 
         return dedent(text)
+
+    @property
+    def body(self) -> Optional[LiveTickerBody]:
+        return self.__extraction__.get("body")
+
+    @property
+    def images(self) -> List[Image]:
+        images: List[Image] = self.__extraction__.get("images", [])
+        if self.body is None:
+            return images
+        for entry_meta in self.body.entry_meta_information:
+            images.extend(entry_meta.get("images", []))
+        return images
diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py
index 03783940b..ae3adb386 100644
--- a/src/fundus/scraping/crawler.py
+++ b/src/fundus/scraping/crawler.py
@@ -52,7 +52,7 @@
 from fundus.logging import create_logger, get_current_config
 from fundus.parser.data import remove_query_parameters_from_url
 from fundus.publishers.base_objects import FilteredPublisher, Publisher, PublisherGroup
-from fundus.scraping.article import Article
+from fundus.scraping.article import Article, Publication
 from fundus.scraping.delay import Delay
 from fundus.scraping.filter import ExtractionFilter, Requires, RequiresAll, URLFilter
 from fundus.scraping.html import CCNewsSource
@@ -222,7 +222,7 @@ def _build_article_iterator(
         extraction_filter: Optional[ExtractionFilter],
         url_filter: Optional[URLFilter],
         language_filter: Optional[List[str]],
-    ) -> Iterator[Article]:
+    ) -> Iterator[Publication]:
         raise NotImplementedError
 
     def crawl(
@@ -236,7 +236,7 @@ def crawl(
         language_filter: Optional[List[str]] = None,
         only_unique: bool = True,
         save_to_file: Union[None, str, Path] = None,
-    ) -> Iterator[Article]:
+    ) -> Iterator[Publication]:
         """Yields articles from initialized scrapers
 
         Args:
@@ -269,7 +269,7 @@ def crawl(
                 specified file as a JSON list.
 
         Returns:
-            Iterator[Article]: An iterator yielding objects of type Article.
+            Iterator[Publication]: An iterator yielding Publication objects (Article or LiveTicker).
         """
 
         if max_articles == 0:
@@ -343,7 +343,7 @@ def build_extraction_filter() -> Optional[ExtractionFilter]:
             logger.info(f"Publisher language filter: {publisher_language_filter} will be used as the language filter. ")
 
         article_count: Dict[str, int] = defaultdict(int)
-        crawled_articles: Dict[str, List[Article]] = defaultdict(list)
+        crawled_articles: Dict[str, List[Publication]] = defaultdict(list)
 
         # Unfortunately we relly on this little workaround here to terminate the 'Pool' used within
         # the 'CCNewsCrawler'. The 'Timeout' contextmanager utilizes '_thread.interrupt_main',
@@ -465,7 +465,7 @@ def _fetch_articles(
         extraction_filter: Optional[ExtractionFilter] = None,
         url_filter: Optional[URLFilter] = None,
         language_filter: Optional[List[str]] = None,
-    ) -> Iterator[Article]:
+    ) -> Iterator[Publication]:
         def build_delay() -> Optional[Delay]:
             if isinstance(self.delay, float):
                 delay = self.delay
@@ -498,15 +498,15 @@ def constant_delay() -> float:
 
     @staticmethod
     def _single_crawl(
-        publishers: Tuple[Publisher, ...], article_task: Callable[[Publisher], Iterator[Article]]
-    ) -> Iterator[Article]:
+        publishers: Tuple[Publisher, ...], article_task: Callable[[Publisher], Iterator[Publication]]
+    ) -> Iterator[Publication]:
         article_iterators = [article_task(publisher) for publisher in publishers]
         yield from roundrobin(*article_iterators)
 
     def _threaded_crawl(
-        self, publishers: Tuple[Publisher, ...], article_task: Callable[[Publisher], Iterator[Article]]
-    ) -> Iterator[Article]:
-        result_queue: Queue[Union[Article, Exception]] = Queue(len(publishers))
+        self, publishers: Tuple[Publisher, ...], article_task: Callable[[Publisher], Iterator[Publication]]
+    ) -> Iterator[Publication]:
+        result_queue: Queue[Union[Publication, Exception]] = Queue(len(publishers))
         wrapped_article_task = queue_wrapper(result_queue, article_task)
         pool = ThreadPool(processes=len(publishers) or None)
         try:
@@ -529,7 +529,7 @@ def _build_article_iterator(
         extraction_filter: Optional[ExtractionFilter],
         url_filter: Optional[URLFilter],
         language_filter: Optional[List[str]],
-    ) -> Iterator[Article]:
+    ) -> Iterator[Publication]:
         article_task = partial(
             self._fetch_articles,
             error_handling=error_handling,
@@ -604,7 +604,7 @@ def _fetch_articles(
         url_filter: Optional[URLFilter] = None,
         language_filter: Optional[List[str]] = None,
         bar: Optional[tqdm] = None,
-    ) -> Iterator[Article]:
+    ) -> Iterator[Publication]:
         retries: int = 0
         while True:
             source = CCNewsSource(*publishers, warc_path=warc_path)
@@ -630,14 +630,14 @@ def _fetch_articles(
 
     @staticmethod
     def _single_crawl(
-        warc_paths: Tuple[str, ...], article_task: Callable[[str], Iterator[Article]]
-    ) -> Iterator[Article]:
+        warc_paths: Tuple[str, ...], article_task: Callable[[str], Iterator[Publication]]
+    ) -> Iterator[Publication]:
         for warc_path in warc_paths:
             yield from article_task(warc_path)
 
     def _parallel_crawl(
-        self, warc_paths: Tuple[str, ...], article_task: Callable[[str], Iterator[Article]]
-    ) -> Iterator[Article]:
+        self, warc_paths: Tuple[str, ...], article_task: Callable[[str], Iterator[Publication]]
+    ) -> Iterator[Publication]:
         # because logging configurations are overwritten when using 'spawn' as start method,
         # we have to get current logging configurations and initialize them in the new process
         if multiprocessing.get_start_method() == "spawn":
@@ -654,7 +654,7 @@ def _parallel_crawl(
                 processes=min(self.processes, len(warc_paths)),
                 initializer=initializer,
             ) as pool:
-                result_queue: Queue[Union[Article, Exception]] = manager.Queue(maxsize=1000)
+                result_queue: Queue[Union[Publication, Exception]] = manager.Queue(maxsize=1000)
 
                 # Because multiprocessing.Pool does not support iterators as targets,
                 # we wrap the article_task to write the articles to a queue instead of returning them directly.
@@ -738,7 +738,7 @@ def _build_article_iterator(
         url_filter: Optional[URLFilter],
         language_filter: Optional[List[str]],
         **kwargs,
-    ) -> Iterator[Article]:
+    ) -> Iterator[Publication]:
         warc_paths = tuple(self._get_warc_paths())
 
         with get_proxy_tqdm(total=len(warc_paths), desc="Process WARC files", disable=self.disable_tqdm) as bar:
diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py
index 01455416a..28412aca1 100644
--- a/src/fundus/scraping/scraper.py
+++ b/src/fundus/scraping/scraper.py
@@ -6,7 +6,7 @@
 from fundus.parser import ParserProxy
 from fundus.parser.data import LiveTickerBody
 from fundus.publishers.base_objects import Publisher
-from fundus.scraping.article import Article, LiveTicker
+from fundus.scraping.article import Article, LiveTicker, Publication
 from fundus.scraping.delay import Delay
 from fundus.scraping.filter import (
     ExtractionFilter,
@@ -31,7 +31,7 @@ def scrape(
         extraction_filter: Optional[ExtractionFilter] = None,
         url_filter: Optional[URLFilter] = None,
         language_filter: Optional[List[str]] = None,
-    ) -> Iterator[Article]:
+    ) -> Iterator[Publication]:
         for source in self.sources:
             for html in source.fetch(url_filter=url_filter):
                 parser = self.parser_mapping[html.source_info.publisher]
@@ -63,7 +63,7 @@ def scrape(
                             logger.debug(f"Skipped article at {html.requested_url!r} because of extraction filter")
                     else:
                         if "body" in extraction.keys() and isinstance(extraction["body"], LiveTickerBody):
-                            article = LiveTicker(html=html, **extraction)
+                            article: Publication = LiveTicker(html=html, **extraction)
                         else:
                             article = Article(html=html, **extraction)
                         if language_filter and article.lang not in language_filter:

From 4a15f5ae87290b2c79c6fda3558e026f98c3745c Mon Sep 17 00:00:00 2001
From: Adrian Breiding <ad123br@gmail.com>
Date: Tue, 28 Oct 2025 23:30:26 +0100
Subject: [PATCH 4/9] add LiveTicker support for SZ

---
 src/fundus/publishers/de/sz.py | 37 +++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/src/fundus/publishers/de/sz.py b/src/fundus/publishers/de/sz.py
index a677a22c5..15bd0691e 100644
--- a/src/fundus/publishers/de/sz.py
+++ b/src/fundus/publishers/de/sz.py
@@ -1,12 +1,14 @@
 import datetime
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from lxml.cssselect import CSSSelector
 from lxml.etree import XPath
 
 from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
+from fundus.parser.data import LiveTickerBody
 from fundus.parser.utility import (
     extract_article_body_with_selector,
+    extract_live_ticker_body_with_selector,
     generic_author_parsing,
     generic_date_parsing,
     generic_topic_parsing,
@@ -65,3 +67,36 @@ class V1_1(V1):
             "//div[@itemprop='articleBody']//h3[@data-manual='subheadline'] |"
             "//div[@itemprop='articleBody']//h2[@data-manual='subheadline']"
         )
+
+        _live_ticker_boundary_selector = XPath("//div[contains(@class, 'event__body')]")
+        _live_ticker_paragraph_selector = XPath(
+            "//article//div[contains(@class, 'event__body')]//li|//article//div[contains(@class, 'event__body')]//div[@class='tik4-rich-text tik4-rich-text--de']/div"
+        )
+        _live_ticker_subheadline_selector = XPath(
+            "//article//div[contains(@class, 'event__body')]//h2|//article//div[contains(@class, 'event__body')]//h3"
+        )
+        _live_ticker_date_selector = XPath("//article//div[contains(@class, 'event__body')]//time")
+        _live_ticker_author_selector = XPath(
+            "//article//div[contains(@class, 'event__body')]//div[@class='tik4-author__name']"
+        )
+        _live_ticker_summary_selector = XPath("//p[@data-manual='teaserText']")
+
+        @attribute
+        def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]:
+            if not self._live_ticker_boundary_selector(self.precomputed.doc):
+                return extract_article_body_with_selector(
+                    self.precomputed.doc,
+                    summary_selector=self._summary_selector,
+                    subheadline_selector=self._subheadline_selector,
+                    paragraph_selector=self._paragraph_selector,
+                )
+            else:
+                return extract_live_ticker_body_with_selector(
+                    self.precomputed.doc,
+                    summary_selector=self._live_ticker_summary_selector,
+                    subheadline_selector=self._live_ticker_subheadline_selector,
+                    paragraph_selector=self._live_ticker_paragraph_selector,
+                    entry_boundary_selector=self._live_ticker_boundary_selector,
+                    author_selector=self._live_ticker_author_selector,
+                    date_selector=self._live_ticker_date_selector,
+                )

From f369498f12d8fbb07193bca20571ac4d07c28fd6 Mon Sep 17 00:00:00 2001
From: Adrian Breiding <ad123br@gmail.com>
Date: Tue, 28 Oct 2025 23:31:17 +0100
Subject: [PATCH 5/9] fix author attribute, entry date parsing

---
 src/fundus/parser/utility.py   | 16 +++++++++++++---
 src/fundus/scraping/article.py | 11 ++++++++++-
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index bd88b2253..767b216b5 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -126,6 +126,7 @@ class SummaryNode(Node):
     pass
 
 
+@dataclass(eq=False)  # eq=False: no __eq__ is generated here; comparison behaviour is inherited from Node
 class BoundaryNode(Node):
     def __post_init__(self):
         self.position -= 0.5  # in case a content node is also a boundary node, we want the boundary to come first
@@ -135,8 +136,22 @@ class SubheadNode(Node):
     pass
 
 
+@dataclass(eq=False)
 class DateNode(Node):
-    pass
+    """Node wrapping a date element; prefers its machine-readable ``datetime`` attribute."""
+
+    _datetime_selector = XPath("./@datetime")
+    _timestamp: Optional[str] = None
+
+    def __post_init__(self):
+        # An lxml XPath evaluation yields a (possibly empty) list and never None,
+        # so test for truthiness to cache only a timestamp that actually exists.
+        if timestamp := self._datetime_selector(self.node):
+            self._timestamp = " ".join(generic_nodes_to_text(timestamp))
+
+    def text_content(self, excluded_tags: Optional[List[str]] = None, tag_filter: Optional[XPath] = None) -> str:
+        """Return the ``datetime`` attribute value if present, else the element's text content."""
+        return self._timestamp if self._timestamp else super().text_content(excluded_tags, tag_filter)
 
 
 class AuthorNode(Node):
@@ -271,9 +281,9 @@ def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlEl
             elif isinstance(node, ParagraphNode):
                 paragraph_nodes.append(node)
             elif isinstance(node, DateNode):
-                entry_date = generic_date_parsing("".join(generic_nodes_to_text([node.node])))
+                entry_date = generic_date_parsing("".join(node.text_content()))
             elif isinstance(node, AuthorNode):
-                entry_authors = generic_author_parsing(generic_nodes_to_text([node.node]))
+                entry_authors = generic_author_parsing(node.text_content())
             elif isinstance(node, ImageNode):
                 entry_images = image_selection_helper(node.node) if image_selection_helper else []
             else:
diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/article.py
index 7fbc1202a..ed35ab774 100644
--- a/src/fundus/scraping/article.py
+++ b/src/fundus/scraping/article.py
@@ -220,9 +220,24 @@ def __str__(self):
     def body(self) -> Optional[LiveTickerBody]:
         return self.__extraction__.get("body")
 
+    @property
+    def authors(self) -> List[str]:
+        """Return the ticker-level authors followed by the authors of every entry.
+
+        Duplicates are dropped while keeping first-seen order, so the result is deterministic.
+        """
+        # Copy first: the parent property presumably hands back the stored list itself, and
+        # extending that in place would grow the stored authors on every access.
+        authors: List[str] = list(super().authors)
+        if self.body is None:
+            return authors
+        for entry_meta in self.body.entry_meta_information:
+            authors.extend(entry_meta.get("authors", []))
+        return list(dict.fromkeys(authors))
+
     @property
     def images(self) -> List[Image]:
-        images: List[Image] = self.__extraction__.get("images", [])
+        images: List[Image] = super().images
         if self.body is None:
             return images
         for entry_meta in self.body.entry_meta_information:

From d3924e146172efa939ea275466a96b66329b9b5b Mon Sep 17 00:00:00 2001
From: Adrian Breiding <ad123br@gmail.com>
Date: Wed, 5 Nov 2025 21:09:45 +0100
Subject: [PATCH 6/9] rename `article.py` to `publication.py`

---
 scripts/generate_parser_test_files.py              | 2 +-
 scripts/publisher_coverage.py                      | 2 +-
 src/fundus/__init__.py                             | 2 +-
 src/fundus/scraping/crawler.py                     | 2 +-
 src/fundus/scraping/{article.py => publication.py} | 0
 src/fundus/scraping/scraper.py                     | 2 +-
 tests/utility.py                                   | 2 +-
 7 files changed, 6 insertions(+), 6 deletions(-)
 rename src/fundus/scraping/{article.py => publication.py} (100%)

diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py
index daf49e187..25f7fc3f5 100644
--- a/scripts/generate_parser_test_files.py
+++ b/scripts/generate_parser_test_files.py
@@ -9,7 +9,7 @@
 from fundus import Crawler, PublisherCollection
 from fundus.logging import create_logger, set_log_level
 from fundus.publishers.base_objects import Publisher
-from fundus.scraping.article import Article
+from fundus.scraping.publication import Article
 from fundus.scraping.filter import RequiresAll
 from fundus.scraping.html import WebSource
 from fundus.scraping.scraper import BaseScraper
diff --git a/scripts/publisher_coverage.py b/scripts/publisher_coverage.py
index c4783ea3c..aab035ffa 100644
--- a/scripts/publisher_coverage.py
+++ b/scripts/publisher_coverage.py
@@ -11,7 +11,7 @@
 
 from fundus import Crawler, PublisherCollection
 from fundus.publishers.base_objects import Publisher, PublisherGroup
-from fundus.scraping.article import Article
+from fundus.scraping.publication import Article
 from fundus.scraping.session import socket_timeout
 
 
diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py
index cff53f8cf..ed4d81755 100644
--- a/src/fundus/__init__.py
+++ b/src/fundus/__init__.py
@@ -3,7 +3,7 @@
 from langdetect import DetectorFactory
 
 from fundus.publishers import PublisherCollection
-from fundus.scraping.article import Article
+from fundus.scraping.publication import Article
 from fundus.scraping.crawler import CCNewsCrawler, Crawler, CrawlerBase
 from fundus.scraping.filter import Requires
 from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py
index ae3adb386..9110b9c77 100644
--- a/src/fundus/scraping/crawler.py
+++ b/src/fundus/scraping/crawler.py
@@ -52,7 +52,7 @@
 from fundus.logging import create_logger, get_current_config
 from fundus.parser.data import remove_query_parameters_from_url
 from fundus.publishers.base_objects import FilteredPublisher, Publisher, PublisherGroup
-from fundus.scraping.article import Article, Publication
+from fundus.scraping.publication import Article, Publication
 from fundus.scraping.delay import Delay
 from fundus.scraping.filter import ExtractionFilter, Requires, RequiresAll, URLFilter
 from fundus.scraping.html import CCNewsSource
diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/publication.py
similarity index 100%
rename from src/fundus/scraping/article.py
rename to src/fundus/scraping/publication.py
diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py
index 28412aca1..9ee913100 100644
--- a/src/fundus/scraping/scraper.py
+++ b/src/fundus/scraping/scraper.py
@@ -6,7 +6,7 @@
 from fundus.parser import ParserProxy
 from fundus.parser.data import LiveTickerBody
 from fundus.publishers.base_objects import Publisher
-from fundus.scraping.article import Article, LiveTicker, Publication
+from fundus.scraping.publication import Article, LiveTicker, Publication
 from fundus.scraping.delay import Delay
 from fundus.scraping.filter import (
     ExtractionFilter,
diff --git a/tests/utility.py b/tests/utility.py
index b7716ad03..32003a9dc 100644
--- a/tests/utility.py
+++ b/tests/utility.py
@@ -13,7 +13,7 @@
 from fundus.parser import ArticleBody, BaseParser
 from fundus.parser.data import Image, TextSequenceTree
 from fundus.publishers.base_objects import Publisher, PublisherGroup
-from fundus.scraping.article import Article
+from fundus.scraping.publication import Article
 from fundus.scraping.html import HTML, SourceInfo
 from scripts.generate_tables import supported_publishers_markdown_path
 from tests.resources.parser.test_data import __module_path__ as test_resource_path

From 090c5eb2f43623c9ed57bd5be3e76fce6f6da54e Mon Sep 17 00:00:00 2001
From: Adrian Breiding <ad123br@gmail.com>
Date: Wed, 5 Nov 2025 21:52:33 +0100
Subject: [PATCH 7/9] add pretty print

---
 src/fundus/__init__.py         |  2 +-
 src/fundus/parser/data.py      | 26 ++++++++++++++++++++------
 src/fundus/scraping/crawler.py |  2 +-
 src/fundus/scraping/scraper.py |  2 +-
 4 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py
index ed4d81755..96e4d160f 100644
--- a/src/fundus/__init__.py
+++ b/src/fundus/__init__.py
@@ -3,9 +3,9 @@
 from langdetect import DetectorFactory
 
 from fundus.publishers import PublisherCollection
-from fundus.scraping.publication import Article
 from fundus.scraping.crawler import CCNewsCrawler, Crawler, CrawlerBase
 from fundus.scraping.filter import Requires
+from fundus.scraping.publication import Article
 from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
 
 __module_path__ = pathlib.Path(__file__).parent
diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py
index ee21c1f14..d1a802bab 100644
--- a/src/fundus/parser/data.py
+++ b/src/fundus/parser/data.py
@@ -332,14 +332,14 @@ def __eq__(self, other: object) -> bool:
 class TextSequenceTree(ABC):
     """Base class to traverse and build trees of TextSequence."""
 
-    def as_text_sequence(self) -> TextSequence:
-        texts = [text for tl in self.df_traversal() for text in tl]
+    def as_text_sequence(self, iterator: Optional[Iterator[Any]] = None) -> TextSequence:
+        texts = [text for tl in self.df_traversal(iterator=iterator) for text in tl]
         return TextSequence(texts)
 
-    def text(self, join_on: str = "\n\n") -> str:
-        return join_on.join(self.as_text_sequence())
+    def text(self, join_on: str = "\n\n", iterator: Optional[Iterator[Any]] = None) -> str:
+        return join_on.join(self.as_text_sequence(iterator=iterator))
 
-    def df_traversal(self) -> Iterable[TextSequence]:
+    def df_traversal(self, iterator: Optional[Iterator[Any]] = None) -> Iterable[TextSequence]:
         def recursion(o: object):
             if isinstance(o, TextSequence):
                 yield o
@@ -349,7 +349,7 @@ def recursion(o: object):
             else:
                 yield o
 
-        for value in self:
+        for value in iter(self) if not iterator else iterator:
             yield from recursion(value)
 
     @abstractmethod
@@ -444,6 +444,20 @@ def __iter__(self) -> Iterator[Any]:
         field_values.extend([entry.sections for entry in self.entries])
         yield from field_values
 
+    def __meta_iter__(self) -> Iterator[Any]:  # like __iter__, plus one header TextSequence per entry
+        field_values = [  # every field except the entries and their meta information
+            getattr(self, f.name) for f in fields(self) if f.name not in ("entry_meta_information", "entries")
+        ]
+        for entry, meta in zip(self.entries, self.entry_meta_information):
+            field_values.append(
+                TextSequence([f"LiveTicker entry from {meta.get('date')} by {', '.join(meta.get('authors', []))}"])
+            )
+            field_values.extend([entry.sections])
+        yield from field_values
+
+    def pretty_print(self) -> str:  # returns the text() rendering of __meta_iter__(); prints nothing
+        return self.text(iterator=self.__meta_iter__())
+
 
 @total_ordering
 @dataclass
diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py
index 9110b9c77..01b5c33a2 100644
--- a/src/fundus/scraping/crawler.py
+++ b/src/fundus/scraping/crawler.py
@@ -52,10 +52,10 @@
 from fundus.logging import create_logger, get_current_config
 from fundus.parser.data import remove_query_parameters_from_url
 from fundus.publishers.base_objects import FilteredPublisher, Publisher, PublisherGroup
-from fundus.scraping.publication import Article, Publication
 from fundus.scraping.delay import Delay
 from fundus.scraping.filter import ExtractionFilter, Requires, RequiresAll, URLFilter
 from fundus.scraping.html import CCNewsSource
+from fundus.scraping.publication import Article, Publication
 from fundus.scraping.scraper import CCNewsScraper, WebScraper
 from fundus.scraping.session import session_handler
 from fundus.scraping.url import URLSource
diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py
index 9ee913100..716a16e0d 100644
--- a/src/fundus/scraping/scraper.py
+++ b/src/fundus/scraping/scraper.py
@@ -6,7 +6,6 @@
 from fundus.parser import ParserProxy
 from fundus.parser.data import LiveTickerBody
 from fundus.publishers.base_objects import Publisher
-from fundus.scraping.publication import Article, LiveTicker, Publication
 from fundus.scraping.delay import Delay
 from fundus.scraping.filter import (
     ExtractionFilter,
@@ -14,6 +13,7 @@
     URLFilter,
 )
 from fundus.scraping.html import CCNewsSource, HTMLSource, WebSource
+from fundus.scraping.publication import Article, LiveTicker, Publication
 from fundus.scraping.url import URLSource
 from fundus.utils.events import __EVENTS__
 

From 8c13ce5130243246387e6e05dfd63784bb9b522a Mon Sep 17 00:00:00 2001
From: Adrian Breiding <ad123br@gmail.com>
Date: Wed, 5 Nov 2025 23:17:16 +0100
Subject: [PATCH 8/9] make `LiveTicker` iterable

---
 src/fundus/parser/data.py          |  4 +++-
 src/fundus/parser/utility.py       |  7 ++++++-
 src/fundus/scraping/publication.py | 24 +++++++++++++++++++++++-
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py
index d1a802bab..8239b435c 100644
--- a/src/fundus/parser/data.py
+++ b/src/fundus/parser/data.py
@@ -450,7 +450,9 @@ def __meta_iter__(self) -> Iterator[Any]:
         ]
         for entry, meta in zip(self.entries, self.entry_meta_information):
             field_values.append(
-                TextSequence([f"LiveTicker entry from {meta.get('date')} by {', '.join(meta.get('authors', []))}"])
+                TextSequence(
+                    [f"LiveTicker entry from {meta.get('publishing_date')} by {', '.join(meta.get('authors', []))}"]
+                )
             )
             field_values.extend([entry.sections])
         yield from field_values
diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index 767b216b5..98aaa606a 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -35,6 +35,7 @@
 from dateutil import parser
 from lxml.cssselect import CSSSelector
 from lxml.etree import XPath
+from lxml.html import HtmlElement
 
 from fundus.logging import create_logger
 from fundus.parser.data import (
@@ -275,7 +276,9 @@ def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlEl
         entry_date = None
         entry_authors = []
         entry_images: List[Image] = []
+        wrapper = HtmlElement("div")
         for node in entry:
+            wrapper.append(node.node)
             if isinstance(node, SubheadNode):
                 subhead_nodes.append(node)
             elif isinstance(node, ParagraphNode):
@@ -310,7 +313,9 @@ def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlEl
             sections.append(ArticleSection(*map(TextSequence, texts)))
 
         entries.append(ArticleBody(summary=TextSequence([]), sections=sections))
-        entries_meta_information.append({"date": entry_date, "authors": entry_authors, "images": entry_images})
+        entries_meta_information.append(
+            {"publishing_date": entry_date, "authors": entry_authors, "images": entry_images, "html": str(wrapper)}
+        )
     return LiveTickerBody(summary=summary, entries=entries, entry_meta_information=entries_meta_information)
 
 
diff --git a/src/fundus/scraping/publication.py b/src/fundus/scraping/publication.py
index ed35ab774..37d51e8eb 100644
--- a/src/fundus/scraping/publication.py
+++ b/src/fundus/scraping/publication.py
@@ -1,6 +1,6 @@
 from datetime import datetime
 from textwrap import TextWrapper, dedent
-from typing import Any, Dict, List, Mapping, Optional, Union
+from typing import Any, Dict, Iterator, List, Mapping, Optional, Union
 
 import langdetect
 import lxml.html
@@ -216,6 +216,40 @@ def __str__(self):
 
         return dedent(text)
 
+    def __iter__(self) -> Iterator[Article]:
+        """Yield every live ticker entry as a standalone ``Article``.
+
+        Each article reuses the ticker's URLs, crawl date and source info, while its HTML content,
+        authors, images and publishing date come from the meta information of the entry.
+        Entries without a headline are titled ``LiveTicker Entry #<position>``.
+        """
+        if not self.body:
+            return
+        for idx, (entry, metas) in enumerate(zip(self.body.entries, self.body.entry_meta_information)):
+            content = metas.get("html", "")
+            if isinstance(content, bytes):
+                # lxml's ``tostring`` yields bytes unless ``encoding="unicode"`` is passed, so decode defensively
+                content = content.decode("utf-8")
+            html_entry = HTML(
+                requested_url=self.html.requested_url,
+                responded_url=self.html.responded_url,
+                content=content,
+                crawl_date=self.html.crawl_date,
+                source_info=self.html.source_info,
+            )
+            # an entry may carry meta information only and therefore have no sections to take a headline from
+            title = str(entry.sections[0].headline) if entry.sections else ""
+            if not title.strip():
+                title = f"LiveTicker Entry #{idx + 1}"
+            yield Article(
+                html=html_entry,
+                body=entry,
+                title=title,
+                authors=metas.get("authors", []),
+                images=metas.get("images", []),
+                publishing_date=metas.get("publishing_date", None),
+            )
+
     @property
     def body(self) -> Optional[LiveTickerBody]:
         return self.__extraction__.get("body")

From 0c71b2291133155eceabdec225a1fb10e72c0ad9 Mon Sep 17 00:00:00 2001
From: Adrian Breiding <ad123br@gmail.com>
Date: Wed, 5 Nov 2025 23:34:57 +0100
Subject: [PATCH 9/9] fix html

---
 src/fundus/parser/utility.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index 98aaa606a..09b7d5601 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -34,8 +34,8 @@
 import validators
 from dateutil import parser
 from lxml.cssselect import CSSSelector
-from lxml.etree import XPath
-from lxml.html import HtmlElement
+from lxml.etree import XPath, tostring
+from lxml.html import Element
 
 from fundus.logging import create_logger
 from fundus.parser.data import (
@@ -276,7 +276,7 @@ def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlEl
         entry_date = None
         entry_authors = []
         entry_images: List[Image] = []
-        wrapper = HtmlElement("div")
+        wrapper = Element("div")
         for node in entry:
             wrapper.append(node.node)
             if isinstance(node, SubheadNode):
@@ -314,7 +314,7 @@ def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlEl
 
         entries.append(ArticleBody(summary=TextSequence([]), sections=sections))
         entries_meta_information.append(
-            {"publishing_date": entry_date, "authors": entry_authors, "images": entry_images, "html": str(wrapper)}
+            {"publishing_date": entry_date, "authors": entry_authors, "images": entry_images, "html": tostring(wrapper)}
         )
     return LiveTickerBody(summary=summary, entries=entries, entry_meta_information=entries_meta_information)