Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions src/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
from collections.abc import Sequence
from dataclasses import dataclass
from operator import itemgetter
from typing import Literal
import weakref
import pymupdf
from pymupdf import mupdf
Expand Down Expand Up @@ -2036,6 +2037,16 @@ def __init__(self, page, settings=None):
self.textpage = None
self.settings = TableSettings.resolve(settings)
self.edges = self.get_edges()
if (
self.settings.horizontal_strategy == "text"
and self.settings.vertical_strategy != "text"
):
extend_edges(self.edges, "h", self.settings.intersection_x_tolerance)
elif (
self.settings.vertical_strategy == "text"
and self.settings.horizontal_strategy != "text"
):
extend_edges(self.edges, "v", self.settings.intersection_y_tolerance)
self.intersections = edges_to_intersections(
self.edges,
self.settings.intersection_x_tolerance,
Expand Down Expand Up @@ -2726,3 +2737,68 @@ def find_tables(
for table in tbf.tables:
table.textpage = TEXTPAGE
return tbf


def extend_edges(
    edges: list,
    extend_orientation: Literal["h", "v"],
    intersection_tolerance: float,
) -> None:
    """
    Extend edges of one orientation to the nearest perpendicular edges.

    Each edge of the requested orientation is stretched (or snapped, when it
    is already within the tolerance) so that both of its end points lie on
    perpendicular edges that cross its position. The edge dicts contained in
    ``edges`` are modified in place; nothing is returned.

    Args:
        edges: Edge dicts. Each must provide the keys ``"orientation"``
            (``"h"`` or ``"v"``), ``"x0"``, ``"x1"``, ``"top"`` and
            ``"bottom"``.
        extend_orientation: ``"h"`` extends horizontal edges along x
            (``"x0"``/``"x1"``); any other value extends vertical edges
            along y (``"top"``/``"bottom"``).
        intersection_tolerance: Slack used both when deciding whether a
            perpendicular edge crosses the position of the edge being
            extended and when comparing end points against perpendicular
            edges.

    An edge is only touched when at least two perpendicular edges cross its
    position. An end point with no perpendicular edge on its outer side is
    left unchanged.
    """
    # Sort so that the perpendicular edges are later visited in increasing
    # order of the coordinate they are compared on (x0 for vertical edges,
    # top for horizontal ones).
    v_edges = sorted(
        (edge for edge in edges if edge["orientation"] == "v"),
        key=itemgetter("x0", "top"),
    )
    h_edges = sorted(
        (edge for edge in edges if edge["orientation"] == "h"),
        key=itemgetter("top", "x0"),
    )

    if extend_orientation == "h":
        edges_to_extend = h_edges
        other_edges = v_edges
        first_prop_to_extend, second_prop_to_extend = "x0", "x1"
        loc_prop = "top"
        loc_prop_others = "x0"
        first_prop_range, second_prop_range = "top", "bottom"
    else:
        edges_to_extend = v_edges
        other_edges = h_edges
        first_prop_to_extend, second_prop_to_extend = "top", "bottom"
        loc_prop = "x0"
        loc_prop_others = "top"
        first_prop_range, second_prop_range = "x0", "x1"

    # NOTE(review): only the coordinate keys named above are rewritten; any
    # derived keys an edge dict might carry (e.g. width/height) are not
    # updated here - confirm nothing downstream relies on them.
    for edge_to_extend in edges_to_extend:
        loc = edge_to_extend[loc_prop]
        # Positions of the perpendicular edges whose extent (widened by the
        # tolerance) covers the position of this edge, in ascending order.
        crossing_positions = [
            edge[loc_prop_others]
            for edge in other_edges
            if loc - edge[second_prop_range] <= intersection_tolerance
            and edge[first_prop_range] - loc <= intersection_tolerance
        ]
        if len(crossing_positions) < 2:
            continue

        # Capture both end points before modifying either of them.
        first_val_to_extend = edge_to_extend[first_prop_to_extend]
        second_val_to_extend = edge_to_extend[second_prop_to_extend]

        # Extend first value (left for horizontal, top for vertical): find
        # the first perpendicular edge lying clearly beyond the start point
        # and move the start to the one just before it.
        for i, position in enumerate(crossing_positions):
            if first_val_to_extend - position < -intersection_tolerance:
                if i != 0:
                    edge_to_extend[first_prop_to_extend] = crossing_positions[i - 1]
                break

        # Extend second value (right for horizontal, bottom for vertical):
        # mirror image of the loop above. The comparison uses
        # +intersection_tolerance so that an end point already sitting on a
        # perpendicular edge stays there instead of jumping to the next one.
        last_index = len(crossing_positions) - 1
        for i in range(last_index, -1, -1):
            if second_val_to_extend - crossing_positions[i] > intersection_tolerance:
                if i != last_index:
                    edge_to_extend[second_prop_to_extend] = crossing_positions[i + 1]
                break
Binary file added tests/resources/text-lines-tables.pdf
Binary file not shown.
12 changes: 12 additions & 0 deletions tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,3 +458,15 @@ def test_md_styles():
tabs = page.find_tables()[0]
text = """|Column 1|Column 2|Column 3|\n|---|---|---|\n|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n|~~Strikeout (1,0), Zeile 1~~<br>~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n|**`Bold-monospaced`**<br>**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**<br>**_(2,2)_**|\n|Zelle (3,0)|~~**Bold-strikeout**~~<br>~~**(3,1)**~~|Zelle (3,2)|\n\n"""
assert tabs.to_markdown() == text


def test_one_strat_text_the_other_strat_non_text():
    """Check table detection when only one axis uses the "text" strategy.

    The same page is analysed twice: once with text-based horizontal edges
    combined with strict vertical lines, and once the other way round. Each
    run must find exactly one table with the expected cell contents.
    """
    filename = os.path.join(scriptdir, "resources", "text-lines-tables.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]

    # Horizontal edges from text, vertical edges from drawn lines.
    tabs = page.find_tables(
        horizontal_strategy="text", vertical_strategy="lines_strict"
    ).tables
    assert len(tabs) == 1
    assert tabs[0].extract() == [["AAAA", "BBBB"], ["", ""], ["CCCC", "DDDD"]]

    # Vertical edges from text, horizontal edges from drawn lines.
    # Use the table list here as well so len() is applied to the list of
    # tables, not to the object returned by find_tables().
    tabs = page.find_tables(
        vertical_strategy="text", horizontal_strategy="lines_strict"
    ).tables
    assert len(tabs) == 1
    assert tabs[0].extract() == [["1111", "2222"], ["3333", "4444"]]