Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 43 additions & 3 deletions pygexml/page.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pathlib import Path
from re import Pattern, compile
from warnings import warn
from dataclasses import dataclass
from dataclasses import dataclass, field
from dataclasses_json import DataClassJsonMixin
from typing import ClassVar, TypeAlias
from collections.abc import Iterable, Mapping
Expand Down Expand Up @@ -232,9 +232,26 @@ def all_words(self) -> Iterable[str]:
return (w for tl in self.textlines.values() for w in tl.words())


def _parse_reading_order_group(element: Element) -> list[ID]:
    """Flatten one reading-order group element into a flat list of region IDs.

    Children of an ``OrderedGroup`` / ``OrderedGroupIndexed`` are visited in
    ascending order of their integer ``index`` attribute (a missing ``index``
    counts as 0; the sort is stable, so ties keep document order). Children of
    any other group are visited in document order.

    ``RegionRef`` / ``RegionRefIndexed`` children contribute their
    ``regionRef`` attribute; any child whose local name contains ``"Group"``
    is flattened recursively in place.

    Args:
        element: The group element to flatten.

    Returns:
        The referenced region IDs in reading order. IDs are not checked
        against the regions actually present on the page.

    Raises:
        ValueError: If a child of an ordered group has an ``index`` attribute
            that cannot be parsed as an integer.
    """
    # Comments and processing instructions show up as children with a
    # non-string ``tag``; they carry no reading-order information and
    # (assuming ``QName`` is lxml's -- TODO confirm) would make
    # ``QName(child)`` raise, so drop them up front.
    children = [child for child in element if isinstance(child.tag, str)]
    if QName(element).localname in ("OrderedGroup", "OrderedGroupIndexed"):
        children.sort(key=lambda c: int(c.attrib.get("index", 0)))
    result: list[ID] = []
    for child in children:
        name = QName(child).localname
        if name in ("RegionRef", "RegionRefIndexed"):
            # Silently skip malformed entries without a regionRef attribute.
            if "regionRef" in child.attrib:
                result.append(str(child.attrib["regionRef"]))
        elif "Group" in name:
            result.extend(_parse_reading_order_group(child))
    return result


@dataclass
class Page(PageLayout, DataClassJsonMixin):
regions: Mapping[ID, TextRegion] # pyright: ignore[reportIncompatibleVariableOverride] # fmt: skip
reading_order: list[ID] | None = field(default=None)

@classmethod
def from_xml(cls, element: Element) -> "Page":
Expand All @@ -246,6 +263,14 @@ def from_xml(cls, element: Element) -> "Page":

regions = find_children(element, "TextRegion")

reading_order_element = find_child(element, "ReadingOrder")
reading_order: list[ID] | None = None
if reading_order_element is not None:
for child in reading_order_element:
if "Group" in QName(child).localname:
reading_order = _parse_reading_order_group(child)
break

return Page(
image=Image(
filename=str(element.attrib["imageFilename"]),
Expand All @@ -263,6 +288,7 @@ def from_xml(cls, element: Element) -> "Page":
regions={
tr.id: tr for tr in (TextRegion.from_xml(region) for region in regions)
},
reading_order=reading_order,
)

@classmethod
Expand Down Expand Up @@ -344,8 +370,22 @@ def from_alto_file(cls, file: Path | str, encoding: str = "utf-8") -> "Page":
def lookup_region(self, id: ID) -> TextRegion | None:
    """Return the region stored under ``id``, or ``None`` if there is none."""
    region = self.regions.get(id)
    return region

def regions_ordered(self) -> list[TextRegion]:
    """Return every region of the page exactly once, in reading order.

    Without a reading order the regions are returned in mapping order.
    Otherwise the regions named in ``reading_order`` come first, in that
    order, followed by all regions the reading order does not mention (in
    mapping order). IDs that do not match any region are ignored, and an ID
    that is listed more than once only counts at its first occurrence.
    """
    if self.reading_order is None:
        return list(self.regions.values())
    seen: set[ID] = set()
    result: list[TextRegion] = []
    for rid in self.reading_order:
        if rid in seen:
            # A repeated reference must not emit the same region twice.
            continue
        region = self.regions.get(rid)
        if region is not None:
            result.append(region)
            seen.add(rid)
    # Regions the reading order never mentioned are appended at the end.
    result.extend(r for rid, r in self.regions.items() if rid not in seen)
    return result

def all_text(self) -> Iterable[str]:
    """Yield the text lines of all regions, region by region in reading order.

    Regions are taken from ``regions_ordered()``; each region contributes the
    items of its own ``all_text()``. The result is a generator.
    """
    return (line for region in self.regions_ordered() for line in region.all_text())

def all_words(self) -> Iterable[str]:
    """Yield the words of all regions, region by region in reading order.

    Regions are taken from ``regions_ordered()``; each region contributes the
    items of its own ``all_words()``. The result is a generator.
    """
    return (
        word for region in self.regions_ordered() for word in region.all_words()
    )
8 changes: 4 additions & 4 deletions pygexml/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,13 @@ def st_simple_text(**kwargs):
def st_pages(draw):
    """Draw a ``Page`` with arbitrary regions and an optional reading order.

    The reading order is either ``None`` or a permutation of exactly the IDs
    of the drawn regions, so it never contains unknown or duplicate IDs.

    NOTE(review): the tests call this as ``st_pages()``, so it is presumably
    decorated with ``@st.composite`` on the line above this block -- confirm.
    """
    image = draw(st_images)
    # Keying by ID collapses drawn regions that happen to share an ID.
    regions = {tr.id: tr for tr in draw(st.lists(st_text_regions))}
    reading_order = draw(st.one_of(st.none(), st.permutations(list(regions))))
    return Page(image=image, regions=regions, reading_order=reading_order)


@st.composite
def st_pages_with_dimensions(draw):
    """Draw a ``Page`` whose image comes from ``st_images_with_dimensions``.

    Regions are drawn arbitrarily; the reading order is either ``None`` or a
    permutation of exactly the IDs of the drawn regions.
    """
    image = draw(st_images_with_dimensions)
    # Keying by ID collapses drawn regions that happen to share an ID.
    regions = {tr.id: tr for tr in draw(st.lists(st_text_regions))}
    reading_order = draw(st.one_of(st.none(), st.permutations(list(regions))))
    return Page(image=image, regions=regions, reading_order=reading_order)
212 changes: 209 additions & 3 deletions test/test_page.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pathlib import Path

import pytest
from hypothesis import given, assume
from hypothesis import given, assume, settings
import hypothesis.strategies as st

from lxml import etree
Expand Down Expand Up @@ -550,6 +550,106 @@ def test_from_missing_xml_file(tmp_path: Path) -> None:
Page.from_xml_file(missing_file)


# PAGE-XML fixture for the reading-order tests: a page with three text regions
# (tr-1 "foo", tr-2 "bar", tr-3 "baz") whose <ReadingOrder> content is filled
# in through the ``{ro}`` placeholder via ``str.format``.
READING_ORDER_XML_TEMPLATE = """
<Page imageFilename="a.jpg">
<ReadingOrder>{ro}</ReadingOrder>
<TextRegion id="tr-1">
<Coords points="1,2 3,4"/>
<TextLine id="tl-1">
<Coords points="1,2 3,4"/>
<TextEquiv><Unicode>foo</Unicode></TextEquiv>
</TextLine>
</TextRegion>
<TextRegion id="tr-2">
<Coords points="5,6 7,8"/>
<TextLine id="tl-2">
<Coords points="5,6 7,8"/>
<TextEquiv><Unicode>bar</Unicode></TextEquiv>
</TextLine>
</TextRegion>
<TextRegion id="tr-3">
<Coords points="9,10 11,12"/>
<TextLine id="tl-3">
<Coords points="9,10 11,12"/>
<TextEquiv><Unicode>baz</Unicode></TextEquiv>
</TextLine>
</TextRegion>
</Page>
"""


def parse_with_ro(ro_xml: str) -> Page:
    """Parse the fixture page with ``ro_xml`` inserted as its reading order."""
    document = READING_ORDER_XML_TEMPLATE.format(ro=ro_xml)
    root = etree.fromstring(document)
    return Page.from_xml(root)


def test_page_without_reading_order() -> None:
    """A page without a <ReadingOrder> element has ``reading_order`` None."""
    root = etree.fromstring("""
<Page imageFilename="a.jpg">
<TextRegion id="tr-1">
<Coords points="1,2 3,4"/>
<TextLine id="tl-1">
<Coords points="1,2 3,4"/>
<TextEquiv><Unicode>foo</Unicode></TextEquiv>
</TextLine>
</TextRegion>
</Page>
""")
    page = Page.from_xml(root)
    assert page.reading_order is None


def test_page_ordered_group() -> None:
    """An OrderedGroup already in index order yields its refs unchanged."""
    group_xml = """
<OrderedGroup id="ro">
<RegionRefIndexed index="0" regionRef="tr-1"/>
<RegionRefIndexed index="1" regionRef="tr-2"/>
</OrderedGroup>
"""
    page = parse_with_ro(group_xml)
    assert page.reading_order == ["tr-1", "tr-2"]


def test_page_ordered_group_sorts_by_index() -> None:
    """Refs in an OrderedGroup are ordered by ``index``, not document order."""
    group_xml = """
<OrderedGroup id="ro">
<RegionRefIndexed index="1" regionRef="tr-2"/>
<RegionRefIndexed index="0" regionRef="tr-1"/>
</OrderedGroup>
"""
    page = parse_with_ro(group_xml)
    assert page.reading_order == ["tr-1", "tr-2"]


def test_page_unordered_group() -> None:
    """An UnorderedGroup yields its refs in document order."""
    group_xml = """
<UnorderedGroup id="ug">
<RegionRef regionRef="tr-1"/>
<RegionRef regionRef="tr-2"/>
</UnorderedGroup>
"""
    page = parse_with_ro(group_xml)
    assert page.reading_order == ["tr-1", "tr-2"]


def test_page_nested_ordered_group() -> None:
    """A nested indexed group is flattened in place within its parent."""
    group_xml = """
<OrderedGroup id="ro">
<RegionRefIndexed index="0" regionRef="tr-1"/>
<OrderedGroupIndexed index="1" id="sub">
<RegionRefIndexed index="0" regionRef="tr-2"/>
<RegionRefIndexed index="1" regionRef="tr-3"/>
</OrderedGroupIndexed>
</OrderedGroup>
"""
    page = parse_with_ro(group_xml)
    assert page.reading_order == ["tr-1", "tr-2", "tr-3"]


def test_page_unordered_group_indexed() -> None:
    """An UnorderedGroupIndexed at the top level yields its refs as listed."""
    group_xml = """
<UnorderedGroupIndexed id="ug">
<RegionRefIndexed index="0" regionRef="tr-1"/>
<RegionRefIndexed index="1" regionRef="tr-2"/>
</UnorderedGroupIndexed>
"""
    page = parse_with_ro(group_xml)
    assert page.reading_order == ["tr-1", "tr-2"]


def test_page_from_alto_example() -> None:
pa = Page.from_alto(etree.fromstring("""
<alto>
Expand Down Expand Up @@ -795,6 +895,18 @@ def test_page_alto_from_missing_file(tmp_path: Path) -> None:
Page.from_alto_file(missing_file)


def test_page_alto_has_no_reading_order() -> None:
    """Pages built from ALTO input carry no reading order."""
    root = etree.fromstring("""
<alto>
<Description>
<sourceImageInformation><fileName>a.jpg</fileName></sourceImageInformation>
</Description>
<Layout><Page><PrintSpace/></Page></Layout>
</alto>
""")
    page = Page.from_alto(root)
    assert page.reading_order is None


@given(st_text_regions, st_pages())
def test_page_region_lookup(region: TextRegion, page: Page) -> None:
assume(region.id not in page.regions)
Expand All @@ -811,6 +923,65 @@ def test_page_region_lookup_not_found(id: str, page: Page) -> None:
assert page.lookup_region(id) is None


def test_regions_ordered_without_reading_order() -> None:
    """Without a reading order, regions come back in mapping order."""
    first = TextRegion(id="tr-1", coords=Coords.parse("1,2 3,4"), textlines={})
    second = TextRegion(id="tr-2", coords=Coords.parse("1,2 3,4"), textlines={})
    image = Image(filename="a.jpg", width=None, height=None)
    page = Page(image=image, regions={"tr-1": first, "tr-2": second})
    assert page.regions_ordered() == [first, second]


def test_regions_ordered_with_reading_order() -> None:
    """An explicit reading order overrides the mapping order."""
    first = TextRegion(id="tr-1", coords=Coords.parse("1,2 3,4"), textlines={})
    second = TextRegion(id="tr-2", coords=Coords.parse("1,2 3,4"), textlines={})
    image = Image(filename="a.jpg", width=None, height=None)
    page = Page(
        image=image,
        regions={"tr-1": first, "tr-2": second},
        reading_order=["tr-2", "tr-1"],
    )
    assert page.regions_ordered() == [second, first]


def test_regions_ordered_skips_missing_ids() -> None:
    """Reading-order IDs that match no region are ignored."""
    only = TextRegion(id="tr-1", coords=Coords.parse("1,2 3,4"), textlines={})
    image = Image(filename="a.jpg", width=None, height=None)
    page = Page(
        image=image,
        regions={"tr-1": only},
        reading_order=["tr-1", "nonexistent"],
    )
    assert page.regions_ordered() == [only]


def test_regions_ordered_appends_unlisted_regions() -> None:
    """Regions missing from the reading order are appended after listed ones."""
    listed = TextRegion(id="tr-1", coords=Coords.parse("1,2 3,4"), textlines={})
    unlisted = TextRegion(id="tr-2", coords=Coords.parse("1,2 3,4"), textlines={})
    image = Image(filename="a.jpg", width=None, height=None)
    page = Page(
        image=image,
        regions={"tr-1": listed, "tr-2": unlisted},
        reading_order=["tr-1"],
    )
    assert page.regions_ordered() == [listed, unlisted]


@given(st_pages())
def test_regions_ordered_covers_all_regions(page: Page) -> None:
    """``regions_ordered`` returns each region of the page exactly once."""
    ordered = page.regions_ordered()
    ordered_ids = {region.id for region in ordered}
    assert ordered_ids == set(page.regions.keys())
    assert len(ordered) == len(page.regions)


@given(st_pages())
@settings(max_examples=25)
def test_regions_ordered_preserves_reading_order(page: Page) -> None:
    """The listed, existing regions form the prefix of the ordered result."""
    assume(page.reading_order is not None)
    assert page.reading_order is not None  # type narrowing for static analysis
    expected_prefix = [rid for rid in page.reading_order if rid in page.regions]
    actual_ids = [region.id for region in page.regions_ordered()]
    assert actual_ids[: len(expected_prefix)] == expected_prefix


def test_page_all_text_and_words() -> None:
pa = Page(
image=Image(filename="a", width=None, height=None),
Expand Down Expand Up @@ -839,13 +1010,38 @@ def test_page_all_text_and_words() -> None:
assert list(pa.all_words()) == ["foo", "bar", "bla", "blub", "42"]


def test_page_all_text_and_words_respect_reading_order() -> None:
    """``all_text`` and ``all_words`` follow the page's reading order."""
    region_a = TextRegion(
        id="a",
        coords=Coords.parse("1,2 3,4"),
        textlines={"b": TextLine(id="b", coords=Coords.parse("2,3 4,5"), text="foo")},
    )
    region_b = TextRegion(
        id="b",
        coords=Coords.parse("5,6 7,8"),
        textlines={"c": TextLine(id="c", coords=Coords.parse("6,7 8,9"), text="bar")},
    )
    page = Page(
        image=Image(filename="a", width=None, height=None),
        regions={"a": region_a, "b": region_b},
        reading_order=["b", "a"],
    )
    expected = ["bar", "foo"]
    assert list(page.all_text()) == expected
    assert list(page.all_words()) == expected


@given(st_pages())
def test_page_all_arbitrary_text_and_words(page: Page) -> None:
    """``all_text`` / ``all_words`` concatenate the per-region results in the
    order given by ``regions_ordered()``."""
    assert list(page.all_text()) == [
        t for r in page.regions_ordered() for t in r.all_text()
    ]
    assert list(page.all_words()) == [
        w for r in page.regions_ordered() for w in r.all_words()
    ]


Expand All @@ -865,3 +1061,13 @@ def test_page_serialization_roundtrip() -> None:
},
)
assert Page.from_dict(pa.to_dict()) == pa


def test_page_reading_order_serialization_roundtrip() -> None:
    """A page with a reading order survives a ``to_dict``/``from_dict`` cycle."""
    group_xml = """
<OrderedGroup id="ro">
<RegionRefIndexed index="0" regionRef="tr-2"/>
<RegionRefIndexed index="1" regionRef="tr-1"/>
</OrderedGroup>
"""
    page = parse_with_ro(group_xml)
    restored = Page.from_dict(page.to_dict())
    assert restored == page