diff --git a/src/table.py b/src/table.py
index 25406a9e4..47affcea4 100644
--- a/src/table.py
+++ b/src/table.py
@@ -79,6 +79,7 @@
from collections.abc import Sequence
from dataclasses import dataclass
from operator import itemgetter
+from typing import Literal
import weakref
import pymupdf
from pymupdf import mupdf
@@ -2036,6 +2037,16 @@ def __init__(self, page, settings=None):
self.textpage = None
self.settings = TableSettings.resolve(settings)
self.edges = self.get_edges()
+ if (
+ self.settings.horizontal_strategy == "text"
+ and self.settings.vertical_strategy != "text"
+ ):
+ extend_edges(self.edges, "h", self.settings.intersection_x_tolerance)
+ elif (
+ self.settings.vertical_strategy == "text"
+ and self.settings.horizontal_strategy != "text"
+ ):
+ extend_edges(self.edges, "v", self.settings.intersection_y_tolerance)
self.intersections = edges_to_intersections(
self.edges,
self.settings.intersection_x_tolerance,
@@ -2726,3 +2737,68 @@ def find_tables(
for table in tbf.tables:
table.textpage = TEXTPAGE
return tbf
+
+
def extend_edges(
    edges: list,
    extend_orientation: Literal["h", "v"],
    intersection_tolerance: float,
) -> None:
    """Stretch edges of one orientation out to the perpendicular edges around them.

    For every edge whose ``"orientation"`` matches ``extend_orientation``, the
    perpendicular edges whose extent (widened by ``intersection_tolerance``)
    covers that edge's position are collected and ordered by their own
    position. When at least two such crossing edges exist:

    * the start coordinate (``"x0"`` for horizontal, ``"top"`` for vertical
      edges) is set to the last crossing edge that is not more than the
      tolerance past the start;
    * the end coordinate (``"x1"`` / ``"bottom"``) is set to the first
      crossing edge that is not more than the tolerance before the end.

    An endpoint is left untouched when there is no crossing edge on its outer
    side, or when no crossing edge lies more than the tolerance past it on the
    inner side. Both endpoints are judged against the edge's coordinates as
    they were before any adjustment.

    Args:
        edges: Edge dicts with the keys ``"orientation"``, ``"x0"``, ``"x1"``,
            ``"top"`` and ``"bottom"``. The dicts are modified in place.
        extend_orientation: ``"h"`` to extend horizontal edges; any other
            value extends vertical edges.
        intersection_tolerance: Slack used both when deciding which
            perpendicular edges cross an edge and when comparing endpoint
            positions.

    Returns:
        None. The result is the in-place modification of ``edges``.
    """
    if extend_orientation == "h":
        # Horizontal edges grow along x; the vertical edges crossing them are
        # positioned by "x0" and cover the range "top".."bottom".
        own_orientation, cross_orientation = "h", "v"
        start_key, end_key = "x0", "x1"
        own_pos_key, cross_pos_key = "top", "x0"
        cross_low_key, cross_high_key = "top", "bottom"
    else:
        # Vertical edges grow along y; the horizontal edges crossing them are
        # positioned by "top" and cover the range "x0".."x1".
        own_orientation, cross_orientation = "v", "h"
        start_key, end_key = "top", "bottom"
        own_pos_key, cross_pos_key = "x0", "top"
        cross_low_key, cross_high_key = "x0", "x1"

    # Only the order of the crossing edges matters; each edge to extend is
    # processed independently of the others.
    cross_edges = sorted(
        (e for e in edges if e["orientation"] == cross_orientation),
        key=itemgetter(cross_pos_key),
    )

    for edge in edges:
        if edge["orientation"] != own_orientation:
            continue

        pos = edge[own_pos_key]
        crossings = [
            c[cross_pos_key]
            for c in cross_edges
            if pos - c[cross_high_key] <= intersection_tolerance
            and c[cross_low_key] - pos <= intersection_tolerance
        ]
        if len(crossings) < 2:
            # Fewer than two crossing edges: nothing to stretch between.
            continue

        start, end = edge[start_key], edge[end_key]

        # Start (left / top): the first crossing edge clearly past the start
        # marks the inner side, so its predecessor is the one to snap to.
        for i, cross_pos in enumerate(crossings):
            if start - cross_pos < -intersection_tolerance:
                if i > 0:
                    edge[start_key] = crossings[i - 1]
                break

        # End (right / bottom): mirror image of the start. The comparison
        # needs the positive tolerance so that an end already resting on a
        # crossing edge snaps to that edge instead of jumping to the next one.
        last = len(crossings) - 1
        for i in range(last, -1, -1):
            if end - crossings[i] > intersection_tolerance:
                if i < last:
                    edge[end_key] = crossings[i + 1]
                break
diff --git a/tests/resources/text-lines-tables.pdf b/tests/resources/text-lines-tables.pdf
new file mode 100644
index 000000000..85a11d304
Binary files /dev/null and b/tests/resources/text-lines-tables.pdf differ
diff --git a/tests/test_tables.py b/tests/test_tables.py
index 2c537de52..d5fb00279 100644
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -458,3 +458,15 @@ def test_md_styles():
tabs = page.find_tables()[0]
text = """|Column 1|Column 2|Column 3|\n|---|---|---|\n|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n|~~Strikeout (1,0), Zeile 1~~
~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n|**`Bold-monospaced`**
**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**
**_(2,2)_**|\n|Zelle (3,0)|~~**Bold-strikeout**~~
~~**(3,1)**~~|Zelle (3,2)|\n\n"""
assert tabs.to_markdown() == text
+
+
def test_one_strat_text_the_other_strat_non_text():
    """Check table detection when only one axis uses the "text" strategy."""
    filename = os.path.join(scriptdir, "resources", "text-lines-tables.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]

    # Horizontal edges from text, vertical edges from "lines_strict".
    tabs = page.find_tables(
        horizontal_strategy="text", vertical_strategy="lines_strict"
    ).tables
    assert len(tabs) == 1
    assert tabs[0].extract() == [["AAAA", "BBBB"], ["", ""], ["CCCC", "DDDD"]]

    # Vertical edges from text, horizontal edges from "lines_strict".
    # Take ``.tables`` here as well so that ``len`` and indexing work on the
    # list of tables, exactly as in the first half of the test.
    tabs = page.find_tables(
        vertical_strategy="text", horizontal_strategy="lines_strict"
    ).tables
    assert len(tabs) == 1
    assert tabs[0].extract() == [["1111", "2222"], ["3333", "4444"]]