def extend_edges(
    edges: list,
    extend_orientation: Literal["h", "v"],
    intersection_tolerance: float,
) -> None:
    """Stretch edges of one orientation onto their neighbouring perpendicular edges.

    For each edge of the requested orientation, the perpendicular edges whose
    own span covers the edge's position (within ``intersection_tolerance``)
    are collected and ordered by position.  When more than one is found:

    * the edge's start (``x0`` for horizontal, ``top`` for vertical) is moved
      to the perpendicular edge immediately before the first one that lies
      more than the tolerance past the start;
    * the edge's end (``x1`` / ``bottom``) is moved to the perpendicular edge
      immediately after the last one that lies more than the tolerance before
      the end.

    An end is left as it is when no perpendicular edge lies inside the edge
    on that side, or when there is no neighbour further out to move to.

    Args:
        edges: Edge dicts carrying at least ``orientation``, ``x0``, ``x1``,
            ``top`` and ``bottom``.  The dicts of the extended orientation
            are modified in place.
        extend_orientation: ``"h"`` to extend horizontal edges; any other
            value extends the vertical ones.
        intersection_tolerance: Slack used both when deciding whether a
            perpendicular edge covers the edge's position and when deciding
            whether an end already sits on a perpendicular edge.

    Returns:
        None.  Only the two end coordinates of an edge are rewritten; every
        other key of the edge dict is left untouched.
    """
    if extend_orientation == "h":
        target_orientation, cross_orientation = "h", "v"
        start_key, end_key = "x0", "x1"
        loc_key, cross_loc_key = "top", "x0"
        cross_start_key, cross_end_key = "top", "bottom"
    else:
        target_orientation, cross_orientation = "v", "h"
        start_key, end_key = "top", "bottom"
        loc_key, cross_loc_key = "x0", "top"
        cross_start_key, cross_end_key = "x0", "x1"

    targets = [e for e in edges if e["orientation"] == target_orientation]
    # Perpendicular edges ordered by their position along the extension axis,
    # so that "previous" / "next" below mean the nearest neighbour outwards.
    crossing = sorted(
        (e for e in edges if e["orientation"] == cross_orientation),
        key=itemgetter(cross_loc_key, cross_start_key),
    )

    for edge in targets:
        loc = edge[loc_key]
        positions = [
            other[cross_loc_key]
            for other in crossing
            if loc - other[cross_end_key] <= intersection_tolerance
            and other[cross_start_key] - loc <= intersection_tolerance
        ]
        if len(positions) < 2:
            continue

        start, end = edge[start_key], edge[end_key]
        last = len(positions) - 1

        # Start (left for horizontal, top for vertical): find the first
        # perpendicular edge clearly inside the edge, then step one outwards.
        for i, pos in enumerate(positions):
            if pos - start > intersection_tolerance:
                if i != 0:
                    edge[start_key] = positions[i - 1]
                break

        # End (right for horizontal, bottom for vertical): exact mirror of the
        # start scan.  The comparison must be "> tolerance" (not
        # "> -tolerance"), otherwise an end that already touches a
        # perpendicular edge is pushed across it to the next one.
        for i in range(last, -1, -1):
            if end - positions[i] > intersection_tolerance:
                if i != last:
                    edge[end_key] = positions[i + 1]
                break
~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n|**`Bold-monospaced`**
**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**
**_(2,2)_**|\n|Zelle (3,0)|~~**Bold-strikeout**~~
def test_one_strat_text_the_other_strat_non_text():
    """Check table detection when exactly one strategy is "text".

    Runs find_tables() on the first page of text-lines-tables.pdf twice, once
    with only the horizontal strategy set to "text" and once with only the
    vertical strategy set to "text", and compares the extracted cells.
    """
    filename = os.path.join(scriptdir, "resources", "text-lines-tables.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]

    tabs = page.find_tables(
        horizontal_strategy="text", vertical_strategy="lines_strict"
    ).tables
    assert len(tabs) == 1
    assert tabs[0].extract() == [["AAAA", "BBBB"], ["", ""], ["CCCC", "DDDD"]]

    # Take the table list here as well, so len() is applied to the list of
    # tables in both halves of the test rather than to the finder object.
    tabs = page.find_tables(
        vertical_strategy="text", horizontal_strategy="lines_strict"
    ).tables
    assert len(tabs) == 1
    assert tabs[0].extract() == [["1111", "2222"], ["3333", "4444"]]