Skip to content

Commit 719a1d4

Browse files
committed
Table Cell Markdown Support
1 parent 7f50cb1 commit 719a1d4

File tree

3 files changed

+137
-3
lines changed

3 files changed

+137
-3
lines changed

src/table.py

Lines changed: 127 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,18 +89,130 @@
8989
Matrix,
9090
TEXTFLAGS_TEXT,
9191
TEXT_FONT_BOLD,
92+
TEXT_FONT_ITALIC,
93+
TEXT_FONT_MONOSPACED,
9294
TEXT_FONT_SUPERSCRIPT,
95+
TEXT_COLLECT_STYLES,
9396
TOOLS,
9497
EMPTY_RECT,
9598
sRGB_to_pdf,
9699
Point,
97100
message,
101+
mupdf,
98102
)
99103

100104
EDGES = [] # vector graphics from PyMuPDF
101105
CHARS = [] # text characters from PyMuPDF
102106
TEXTPAGE = None
107+
TEXT_BOLD = mupdf.FZ_STEXT_BOLD  # bit tested against a span's "char_flags" to detect bold text
108+
TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT  # bit tested against a span's "char_flags" to detect strikeout text
109+
FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES  # extraction flags used when creating TEXTPAGE (styles needed for MD output)
110+
103111
white_spaces = set(string.whitespace) # for checking white space only cells
112+
113+
114+
def _bbox_outside_cell(bbox, cell):
    """Return True if rect-like 'bbox' lies completely outside 'cell'.

    Both arguments are indexable as (x0, y0, x1, y1). Rectangles that only
    touch at an edge are NOT reported as outside.
    """
    return (
        bbox[0] > cell[2]
        or bbox[2] < cell[0]
        or bbox[1] > cell[3]
        or bbox[3] < cell[1]
    )


def _char_belongs_to_cell(char, cell):
    """Return True if more than 50% of the character's bbox area is in 'cell'."""
    bbox = Rect(char["bbox"])
    return abs(bbox & cell) > 0.5 * abs(bbox)


def extract_cells(textpage, cell, markdown=False):
    """Extract text from a rect-like 'cell' as plain or MD style text.

    This function should ultimately be used to extract text from a table cell.
    Markdown output will only work correctly if extraction flag bit
    TEXT_COLLECT_STYLES is set.

    Lines inside the cell are separated by "<br>" (Markdown) or "\\n" (plain
    text). A separator is only written for lines that actually contribute
    characters to the cell, so lines that merely touch the cell border do
    not leave stray line breaks behind.

    Args:
        textpage: A PyMuPDF TextPage object. Must have been created with
            TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
        cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
        markdown: If True, return text formatted for Markdown.

    Returns:
        A string with the text extracted from the cell, stripped of leading
        and trailing white space.
    """
    line_break = "<br>" if markdown else "\n"
    text = ""
    for block in textpage.extractRAWDICT()["blocks"]:
        if block["type"] != 0:  # only blocks of type 0 are processed
            continue
        if _bbox_outside_cell(block["bbox"], cell):
            continue  # skip block outside cell

        for line in block["lines"]:
            if _bbox_outside_cell(line["bbox"], cell):
                continue  # skip line outside cell

            # A new line in a non-empty cell needs a separator. Emission is
            # deferred until the line is known to contribute text, because
            # the bbox test above also accepts lines that only touch the cell.
            break_pending = bool(text)

            # strikeout detection only works with horizontal text
            horizontal = line["dir"] == (0, 1) or line["dir"] == (1, 0)

            for span in line["spans"]:
                if _bbox_outside_cell(span["bbox"], cell):
                    continue  # skip spans outside cell

                # only include chars with more than 50% bbox overlap
                span_text = "".join(
                    char["c"]
                    for char in span["chars"]
                    if _char_belongs_to_cell(char, cell)
                )
                if not span_text:
                    continue  # skip empty span

                if break_pending:
                    text += line_break
                    break_pending = False

                if not markdown:  # no MD styling
                    text += span_text
                    continue

                # Build opening / closing markers. The suffix is assembled in
                # reverse order so that the markers nest properly.
                prefix = ""
                suffix = ""
                if horizontal and span["char_flags"] & TEXT_STRIKEOUT:
                    prefix += "~~"
                    suffix = "~~" + suffix
                if span["char_flags"] & TEXT_BOLD:
                    prefix += "**"
                    suffix = "**" + suffix
                if span["flags"] & TEXT_FONT_ITALIC:
                    prefix += "_"
                    suffix = "_" + suffix
                if span["flags"] & TEXT_FONT_MONOSPACED:
                    prefix += "`"
                    suffix = "`" + suffix

                # drop trailing white space of longer spans so that the
                # closing markers directly follow the text
                if len(span["chars"]) > 2:
                    span_text = span_text.rstrip()

                # if span continues previous styling: extend cell text
                if (ls := len(suffix)) and text.endswith(suffix):
                    text = text[:-ls] + span_text + suffix
                elif not span_text.strip():
                    # never wrap white space in style markers
                    text += " "
                else:  # append the span with new styling
                    text += prefix + span_text + suffix

    return text.strip()
214+
215+
104216
# -------------------------------------------------------------------
105217
# End of PyMuPDF interface code
106218
# -------------------------------------------------------------------
@@ -1382,7 +1494,18 @@ def to_markdown(self, clean=False, fill_empty=True):
13821494
output = "|"
13831495
rows = self.row_count
13841496
cols = self.col_count
1385-
cells = self.extract()[:] # make local copy of table text content
1497+
1498+
# cell coordinates
1499+
cell_boxes = [[c for c in r.cells] for r in self.rows]
1500+
1501+
# cell text strings
1502+
cells = [[None for i in range(cols)] for j in range(rows)]
1503+
for i, row in enumerate(cell_boxes):
1504+
for j, cell in enumerate(row):
1505+
if cell is not None:
1506+
cells[i][j] = extract_cells(
1507+
TEXTPAGE, cell_boxes[i][j], markdown=True
1508+
)
13861509

13871510
if fill_empty: # fill "None" cells where possible
13881511

@@ -1420,7 +1543,8 @@ def to_markdown(self, clean=False, fill_empty=True):
14201543
for i, cell in enumerate(row):
14211544
# replace None cells with empty string
14221545
# use HTML line break tag
1423-
cell = "" if not cell else cell.replace("\n", "<br>")
1546+
if cell is None:
1547+
cell = ""
14241548
if clean: # remove sensitive syntax
14251549
cell = html.escape(cell.replace("-", "&#45;"))
14261550
line += cell + "|"
@@ -1944,7 +2068,7 @@ def make_chars(page, clip=None):
19442068
page_number = page.number + 1
19452069
page_height = page.rect.height
19462070
ctm = page.transformation_matrix
1947-
TEXTPAGE = page.get_textpage(clip=clip, flags=TEXTFLAGS_TEXT)
2071+
TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS)
19482072
blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
19492073
doctop_base = page_height * page.number
19502074
for block in blocks:
73 KB
Binary file not shown.

tests/test_tables.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,3 +423,13 @@ def test_4017():
423423
["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"],
424424
]
425425
assert tables[-1].extract() == expected_b
426+
427+
428+
def test_md_styles():
    """Check that to_markdown() renders styled table cells as Markdown."""
    pdf_path = os.path.join(scriptdir, "resources", "test-styled-table.pdf")
    document = pymupdf.open(pdf_path)
    first_page = document[0]
    table = first_page.find_tables()[0]
    # expected output, one literal per Markdown table row
    expected = (
        "|Column 1|Column 2|Column 3|\n"
        "|---|---|---|\n"
        "|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n"
        "|~~Strikeout (1,0), Zeile 1~~<br>~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n"
        "|**`Bold-monospaced`**<br>**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**<br>**_(2,2)_**|\n"
        "|Zelle (3,0)|~~**Bold-strikeout**~~<br>~~**(3,1)**~~|Zelle (3,2)|\n"
        "\n"
    )
    assert table.to_markdown() == expected

0 commit comments

Comments
 (0)