|
89 | 89 | Matrix, |
90 | 90 | TEXTFLAGS_TEXT, |
91 | 91 | TEXT_FONT_BOLD, |
| 92 | + TEXT_FONT_ITALIC, |
| 93 | + TEXT_FONT_MONOSPACED, |
92 | 94 | TEXT_FONT_SUPERSCRIPT, |
| 95 | + TEXT_COLLECT_STYLES, |
93 | 96 | TOOLS, |
94 | 97 | EMPTY_RECT, |
95 | 98 | sRGB_to_pdf, |
96 | 99 | Point, |
97 | 100 | message, |
| 101 | + mupdf, |
98 | 102 | ) |
99 | 103 |
|
# Module-level state shared by the PyMuPDF interface code.
EDGES = []  # vector graphics from PyMuPDF
CHARS = []  # text characters from PyMuPDF
# Text page handed to extract_cells() by to_markdown(); presumably
# assigned in make_chars() - NOTE(review): confirm it is declared global there.
TEXTPAGE = None
# MuPDF style bits; extract_cells() tests them against span["char_flags"].
TEXT_BOLD = mupdf.FZ_STEXT_BOLD
TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
# Extraction flags used by make_chars() when creating the text page:
# the default text flags plus collection of style information.
FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES

white_spaces = set(string.whitespace)  # for checking white space only cells
| 112 | + |
| 113 | + |
def _bbox_outside_cell(bbox, cell):
    """Return True if rect-like 'bbox' lies completely outside 'cell'.

    Both arguments are indexed as (x0, y0, x1, y1). Boxes that merely touch
    the cell border are NOT regarded as outside.
    """
    return (
        bbox[0] > cell[2]
        or bbox[2] < cell[0]
        or bbox[1] > cell[3]
        or bbox[3] < cell[1]
    )


def extract_cells(textpage, cell, markdown=False):
    """Extract text from a rect-like 'cell' as plain or MD style text.

    This function should ultimately be used to extract text from a table cell.
    Markdown output will only work correctly if extraction flag bit
    TEXT_COLLECT_STYLES is set.

    Blocks, lines and spans whose bbox does not intersect the cell are
    skipped. Within the remaining spans, a character is taken only if more
    than 50% of its bbox area lies inside the cell. Text lines are separated
    by "<br>" (Markdown) or "\\n" (plain text); a separator is only emitted
    for lines that actually contribute characters, so lines merely touching
    the cell do not cause empty or trailing line breaks.

    Args:
        textpage: A PyMuPDF TextPage object. Must have been created with
            TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
        cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
        markdown: If True, return text formatted for Markdown.

    Returns:
        A string with the text extracted from the cell, stripped of leading
        and trailing white space.
    """
    line_break = "<br>" if markdown else "\n"
    text = ""
    for block in textpage.extractRAWDICT()["blocks"]:
        if block["type"] != 0:  # only blocks of type 0 are processed
            continue
        if _bbox_outside_cell(block["bbox"], cell):
            continue  # skip block outside cell

        for line in block["lines"]:
            if _bbox_outside_cell(line["bbox"], cell):
                continue  # skip line outside cell

            # strikeout detection only works with horizontal text
            horizontal = line["dir"] == (0, 1) or line["dir"] == (1, 0)

            # becomes True once this line has contributed text to the cell
            line_started = False

            for span in line["spans"]:
                if _bbox_outside_cell(span["bbox"], cell):
                    continue  # skip spans outside cell

                # only include chars with more than 50% bbox overlap
                span_text = ""
                for char in span["chars"]:
                    bbox = Rect(char["bbox"])
                    if abs(bbox & cell) > 0.5 * abs(bbox):
                        span_text += char["c"]

                if not span_text:
                    continue  # skip empty span

                # First contributing span of this line: separate it from
                # text of previous lines. Deferring the separator to this
                # point avoids line breaks for lines without any text
                # inside the cell.
                if not line_started:
                    line_started = True
                    if text:  # must be a new line in the cell
                        text += line_break

                if not markdown:  # no MD styling
                    text += span_text
                    continue

                # Build matching opening / closing markers. The suffix is
                # assembled in reverse order so that the styles nest properly.
                prefix = ""
                suffix = ""
                if horizontal and span["char_flags"] & TEXT_STRIKEOUT:
                    prefix += "~~"
                    suffix = "~~" + suffix
                if span["char_flags"] & TEXT_BOLD:
                    prefix += "**"
                    suffix = "**" + suffix
                if span["flags"] & TEXT_FONT_ITALIC:
                    prefix += "_"
                    suffix = "_" + suffix
                if span["flags"] & TEXT_FONT_MONOSPACED:
                    prefix += "`"
                    suffix = "`" + suffix

                # drop trailing white space so it does not end up directly
                # in front of the closing markers (very short spans are
                # left untouched)
                if len(span["chars"]) > 2:
                    span_text = span_text.rstrip()

                # if span continues previous styling: extend cell text
                if suffix and text.endswith(suffix):
                    text = text[: -len(suffix)] + span_text + suffix
                elif not span_text.strip():
                    # white space only: never wrap it in style markers
                    text += " "
                else:  # append the span with new styling
                    text += prefix + span_text + suffix

    return text.strip()
| 214 | + |
| 215 | + |
104 | 216 | # ------------------------------------------------------------------- |
105 | 217 | # End of PyMuPDF interface code |
106 | 218 | # ------------------------------------------------------------------- |
@@ -1382,7 +1494,18 @@ def to_markdown(self, clean=False, fill_empty=True): |
1382 | 1494 | output = "|" |
1383 | 1495 | rows = self.row_count |
1384 | 1496 | cols = self.col_count |
1385 | | - cells = self.extract()[:] # make local copy of table text content |
| 1497 | + |
| 1498 | + # cell coordinates |
| 1499 | + cell_boxes = [[c for c in r.cells] for r in self.rows] |
| 1500 | + |
| 1501 | + # cell text strings |
| 1502 | + cells = [[None for i in range(cols)] for j in range(rows)] |
| 1503 | + for i, row in enumerate(cell_boxes): |
| 1504 | + for j, cell in enumerate(row): |
| 1505 | + if cell is not None: |
| 1506 | + cells[i][j] = extract_cells( |
| 1507 | + TEXTPAGE, cell_boxes[i][j], markdown=True |
| 1508 | + ) |
1386 | 1509 |
|
1387 | 1510 | if fill_empty: # fill "None" cells where possible |
1388 | 1511 |
|
@@ -1420,7 +1543,8 @@ def to_markdown(self, clean=False, fill_empty=True): |
1420 | 1543 | for i, cell in enumerate(row): |
1421 | 1544 | # replace None cells with empty string |
1422 | 1545 | # use HTML line break tag |
1423 | | - cell = "" if not cell else cell.replace("\n", "<br>") |
| 1546 | + if cell is None: |
| 1547 | + cell = "" |
1424 | 1548 | if clean: # remove sensitive syntax |
1425 | 1549 | cell = html.escape(cell.replace("-", "-")) |
1426 | 1550 | line += cell + "|" |
@@ -1944,7 +2068,7 @@ def make_chars(page, clip=None): |
1944 | 2068 | page_number = page.number + 1 |
1945 | 2069 | page_height = page.rect.height |
1946 | 2070 | ctm = page.transformation_matrix |
1947 | | - TEXTPAGE = page.get_textpage(clip=clip, flags=TEXTFLAGS_TEXT) |
| 2071 | + TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS) |
1948 | 2072 | blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"] |
1949 | 2073 | doctop_base = page_height * page.number |
1950 | 2074 | for block in blocks: |
|
0 commit comments