From 004447feff9e8e81b7e3f1e272d3c87146072a7c Mon Sep 17 00:00:00 2001 From: monchin Date: Sun, 25 Jan 2026 22:47:44 +0800 Subject: [PATCH] Fix: Add extend_edges function to fix table extraction with one strat text and the other non-text --- src/table.py | 76 ++++++++++++++++++++++++++ tests/resources/text-lines-tables.pdf | Bin 0 -> 22152 bytes tests/test_tables.py | 12 ++++ 3 files changed, 88 insertions(+) create mode 100644 tests/resources/text-lines-tables.pdf diff --git a/src/table.py b/src/table.py index 25406a9e4..47affcea4 100644 --- a/src/table.py +++ b/src/table.py @@ -79,6 +79,7 @@ from collections.abc import Sequence from dataclasses import dataclass from operator import itemgetter +from typing import Literal import weakref import pymupdf from pymupdf import mupdf @@ -2036,6 +2037,16 @@ def __init__(self, page, settings=None): self.textpage = None self.settings = TableSettings.resolve(settings) self.edges = self.get_edges() + if ( + self.settings.horizontal_strategy == "text" + and self.settings.vertical_strategy != "text" + ): + extend_edges(self.edges, "h", self.settings.intersection_x_tolerance) + elif ( + self.settings.vertical_strategy == "text" + and self.settings.horizontal_strategy != "text" + ): + extend_edges(self.edges, "v", self.settings.intersection_y_tolerance) self.intersections = edges_to_intersections( self.edges, self.settings.intersection_x_tolerance, @@ -2726,3 +2737,68 @@ def find_tables( for table in tbf.tables: table.textpage = TEXTPAGE return tbf + + +def extend_edges( + edges: list, + extend_orientation: Literal["h", "v"], + intersection_tolerance: float, +) -> None: + """ + Extend the edges to the nearest edge vertical to them + """ + v_edges, h_edges = [ + list(filter(lambda x: x["orientation"] == o, edges)) for o in ("v", "h") + ] + + v_edges = sorted(v_edges, key=itemgetter("x0", "top")) + h_edges = sorted(h_edges, key=itemgetter("top", "x0")) + + if extend_orientation == "h": + edges_to_extend = h_edges + other_edges = v_edges + first_prop_to_extend, second_prop_to_extend = "x0", "x1" + loc_prop = "top" + loc_prop_others = "x0" + first_prop_range, second_prop_range = "top", "bottom" + else: + edges_to_extend = v_edges + other_edges = h_edges + first_prop_to_extend, second_prop_to_extend = "top", "bottom" + loc_prop = "x0" + loc_prop_others = "top" + first_prop_range, second_prop_range = "x0", "x1" + + for edge_to_extend in edges_to_extend: + loc = edge_to_extend[loc_prop] + edges_intersect_to_this_edge = [ + edge + for edge in other_edges + if (loc - edge[second_prop_range] <= intersection_tolerance) + and (edge[first_prop_range] - loc <= intersection_tolerance) + ] + n_edges_intersect_to_this_edge = len(edges_intersect_to_this_edge) + if n_edges_intersect_to_this_edge > 1: + first_val_to_extend, second_val_to_extend = ( + edge_to_extend[first_prop_to_extend], + edge_to_extend[second_prop_to_extend], + ) + # Extend first value (left for horizontal, top for vertical) + for i in range(n_edges_intersect_to_this_edge): + loc_edge_i = edges_intersect_to_this_edge[i][loc_prop_others] + if first_val_to_extend - loc_edge_i < -intersection_tolerance: + if i != 0: + edge_to_extend[first_prop_to_extend] = ( + edges_intersect_to_this_edge[i - 1][loc_prop_others] + ) + break + + # Extend second value (right for horizontal, bottom for vertical) + for i in range(n_edges_intersect_to_this_edge - 1, -1, -1): + loc_edge_i = edges_intersect_to_this_edge[i][loc_prop_others] + if second_val_to_extend - loc_edge_i > -intersection_tolerance: + if i != n_edges_intersect_to_this_edge - 1: + edge_to_extend[second_prop_to_extend] = ( + edges_intersect_to_this_edge[i + 1][loc_prop_others] + ) + break diff --git a/tests/resources/text-lines-tables.pdf b/tests/resources/text-lines-tables.pdf new file mode 100644 index 0000000000000000000000000000000000000000..85a11d304a0104fc84cf574b1e4d5e77811d828d GIT binary patch literal 22152 zcmd>mWl&wqwk{GN5Ofg&1Px^2?hb*4ySux~!kwTC3-0bN!5xCTy9Rf6ec5N-~ANde-dOGJ5pr@qN{^XLSSRKq9n^bnGZd!0ejdc@!iLVg_Pc0}B)+9v*rr za~orQ2Xl90BVtBkdMRQiCPr2!VtO%RO=3m{1{PutPGW6hdRbz|Ke@uhj2w(?48-&@ zf0b1tW@6-E<|O9hLqRgOG5Uu<;{T@?3KHkP^hC+c&X|~9O~KfNUR9Hkn1z^0n_k?; z$kq-@zLE z$2>x|R!-J7U}E;asu`IZI@&rAGcxh<(MuWIm^zvfvoq5x*xEY&vDF_<6#lelVoS{S zcNC<_!^z0Z!pOnK0AlC(#tsq`1hKHQi!h2Xe-jZFWc|j<&i6l;(B|UFJ^WxnUHL3yE1IKgFFvW#ifknN^UyAN`1mb>~;|;rm3Nx4F4=G5z~R z*ws-~(eV$2|EeRZ$Uw}*{2z5hY;FFkKrg~d%*66{+MhKUmE_?H2=X1Ost%2 z|Hj~dn3MTmjh>$1qNKR^DR{h%y-h9#Jv45@(o6wH45dvhnvRMQ5nZA)3W|gcq+|z@ zk1lx6M6uzUD->N#5*i3CUw!lr7c^sU_Yp5a+^8IPz18h(Gq<@-uwF|rgkzze@F2kfj-;qYFtkl zHYkdi6}nxpqLCG?VGIS+ciJsCs130}-Sa7WwUKemVYJ?{5e+)P*TN87GA+fRh*-?9 z-{a%bdttpJ45;i8q??Qj$++tHza5}V#-G(cx>@7?B;j7*;N)PirnPRTnDNnmPK~*9 zPS6SPImHDL-FM%rdMBemRTUW5Ej|WYn`1#~A3@8tyabQx(L;s)Yx;d^$d({+C3oe zW(MCAA-p;fe!_VE+8#n5Ckz@ckRp~!sM8$441r4)3=+7{Ou0wve$88KN`NfgV@xEd z!Q_AWEZ|KDML~r@jObxRJ#pssp-&X{VKi(qIVt^V{QIkyYNOQaz^AW1k%pomrbdAL z*m26U1`G=^l&}Eop#a`KEN17s`c8~SD6|b&m~Rvz@9gy$H@<(2f(8*|fAp=(!ioD{ zu18)3@05kX;?HJ>GJNzh@o$-d@3h2!UDC={WFEO|4)BbGY?DFb3b zmqrza68b0!WVy6+90@piDD=%)d)-H96*YDoNkg&4l*7+g7F3>cq& zi3mo!enrIiQW^!wgE1FaiXzM-Y6=yLE*#MO2rn$mC?^&7VTcGTa5E<%TP2$nO#~tbAPrB)7SI?oV z#Um3!-J`srVMlJKYKPf~+X`4iKMP0ey*eMhFuoV@MDu>nADQ`u0y7-3B0x-#N}o16 zW*O8sSyXCN?( zWQ`S@W}Ie34%H7iP=uHZ;3gEz^J{I*{A($(LPOIfY9%rxB8Kd-C<(IhCSmsvu9+V7 zADte#m1&h_mD>-=X4z(`ODbn~i`IG zh>tWcUtb6w+a49+x!}LR!@-xsH=tYu(R6?8)((0Js*DJaFbHSEal++gQej4A7-P3# zZMPI;vu9T~nbLC@{b)GEB+a~P?Y@Z2R>LmISjvdU6g)~W0=L^T8caL!N#>Ja7 z6ks|wx!cM(Qe%qNpY{W73r(gvu3E3Abxg!zKr$F8S*^XnuF8?8?1 zPBAkOY!Pf@&W7msWWP3%#gp?&I7_c(@FZ_$C`4Z>b$(L%kWu(pIG5ip^Few!^OlmA ztc@2oF*?a!NerU^@rkL9`HCsHW~3&xCT}<`@kEOahy-9Y7FrDZ+0x6hI6mXs1Ny1M zl(Lne7<)y=&UGi2;gG>#tJk!6DLqVn>2=kiQ@&~XKv|3QAA+N0d$q+n^j z#jU0C_+nvNrBGQ_;t5Vy-hD1V`;IF}iKB@O7H{IWYJ$_V0 z)_R%VYg;4K<>p}?YOQh3#!mrHB9939KD*X7^ye3Dc?_T35l!=|fVFm97ZHC!axgPx zb~DrX;++l;4z{Na+a%r3(jdIxa{5-=bE}EPt3A*0^n<%&ez&!o=jsn**pWI$+Q-e? zcE#XM@2o9^Cn7F*0(kCV{NQGKZtYXt40g}W(5Lum)warvmQQvkQv=V@zmMeSZP&ya zzqAduJnXAHmhUxyZN^vB!Ps66+Zj`Bp-(Ayc-|~8)!K)|ynKheT<3aS{@jKu)#Uuf zZ`axwMr++^g|7#BV-)Efc4F6`r`LC1B@|ZT69tQLPkFRU`IvZ))Z<>Ot=vj@Z zy#zU-AV0A#ZTrd});rM*DhgCG!@b@G*_!N}}l)UzWcLtpy zM8>j>;WfCA&o9Wg-#?~(lHu+lZ{{e=*Y$x>nN^S#G3=d&7#RBts?I6bm! zFF($WOa3j(bm(fx^wa{b>M_wueT5ss@b}iTJ9CYbX;w++WCP6;7Qf<8hJh|61&u0C zvp>Zy%IBmBxu2F-kLS@O`@|&>OtdcW?3eCChhKx9pv`^YX8*le|N2Du?`CD?{2$D! z9X)E%LyYcw<`&9ngBJE+*%FMdR{zmqA5HoRN`$<4q6h~(usrXyslbI0EV~zLZ5TC3aAI(`m>pjPW|o33rgfy=6Vq*eQbiZ2&OQ*V?HD>Q=Q zcyO%1@Sa`=zsMuRFP4|+v@GLaGzR2`->}F5!SbG9Q9+CVo=TxmH9!KlO2BL+tA5*- zR}WPaTEL*!SE*A-Rl%6Thk3FsIa@>YQm{UZ{q4ohJUewn!&`Nqh^?+SG;KpL$-lSd zU!P6?&6bS+lZQU7LqWm#Li_%EiGQuD{C9~=91P6=!}y$dAv(^?KHRp|7-bs$1TaNC z1b*@3_@Ynl8-u7WMvNy84|q3dWD+lan*kW?np3USCI7uGxENWCwXtuvASf zv9>N(X<1S!tFN!4+V(j0@?rmwQ>Z)s;PY0w$8kP#wQ|yS*M8R~DN|*f#%Q9CkQ7~a zgK&4l>$WJJ8hs^cq_azKjeJLVl&MF<8`8Q1>;2;_-`49MdWxO6%>fEmx`10D+8goU z0oH$#xq=~$kLUnRm<#99J;yRzEhwlm&_9DY=q@uZF3=HnYAMQe;w{j&Ln-VU$}o(7 zc}c$dderHMR=3)03&j~Os&EcABi^HnQ|RLwuYCDypxx&w`Mwdf8PYx zMx8yo1%A&byWeo(PFA9KZw4lPk;38Qsnd)tp!yWnnoqa+3xQ)5!AXG)H9Yk}>tI~J^x1s$?YWFbIxU%E^F{YPIRmN|poj9e!ZUtLQUy1{V#Y6>iTd}Pd zKOE)C@SgNRdF;Ds+PD%&rv0yQ@I{PGhI$-jbgYe-)QGs<2sL||n%qx^#h>{{$kYjm z9J~2=Mr)VxrE|E6w6rWZV5KoGe=+dZ6_E{dJ)ty1+E&rh&k-%PsAxlR2E5&EQ(CU3 zg(Z87i@_Z*W&{LL1gx<6OZjNe8fk9yN!Va_Wu>)AU0hrpeC4obLPqGdnD+XrJ9vtKjjzwRtBG2am%pJ4Ms+q(#4B&BdvKoWh1qLb<00=;m%6=dyz*NFWL|A!KkVI5)1T!duOIsnXXT#N+C)>1!65Y zK`hPQ24U~QLECvj9h%T#1-C0tQSplw^J4Rdt6eSy8lLgEzNs?vpceceTp3O{1m=;o zWlbmQHZ`fr%F4#&I@;yNrou5*syut$+z=f`KjDOhABgm~#>B*za?s9{fr6_CSYmq0d!0S(oqtj~Uf;EZ zQ!Ykwg2R|y0Y6e{YOp_AkYnYgj%{{q%BIDbO0wf(PRBef8LrFLe@z`ZsOM4SVdWh( zlNu7^b-kv!pBp&U8|lGV4jdVzTI3IO(T2PAq_(ZsWbR3X(|s_%pW2{)YRq1#t0JBc zywt)T0)rKYK|4(HO;Sp9nDKw2KM{~6aZ_E73C&b~B2zozuy)xJjZG|)JP-v>f9xXR zl_f~3t7Buw!PI*?m=Qx}Q)zXRvF3U_rkam?IHPY-g!3ee1P)h?YW7c^8Bhjk(ZuQOR|Fjh?v zSywKqt3K`Y^F?hZP5C6;emB{5InIQxCz*6SwHv&kQFto+C!DAL3fpK1z)o+*PXB?E zQ0bQP!GB<9Cw#eSuS1BI1H`|}Qxte-CFB09cAE5yD!+6OXKbpX8TuNa%Tj_l zDb_)Yd)$I&FF29SMeAh?OzmP_+5k+BD9D@1yUs5!Y_Q?n*R1>Hu~Yja(Wg zNdmnvjVU8{i-^eON(o5@?IyEkkEt^%PkcHYyYXJP=ptI^y%F5@B+P0qoV(VieS7sR zkJ^oILS&LbqSKoPsn%PqMk7+yMLhR8p)CaU)sBsfC#QOkCN)L4Xel?YxpK@S@9TQZ z{hSUGQrFve7Brd4LTx>^EK|?a37cPXTKdM?d0UsxwbIW70kde8`daF1eQ(#m{t=W`XW- z#bN9Efv7lrNrXwp)Z3IA{)v5~U})^~o^mk3N&cLD21f!hesk^N{`A74V;WxsgXV2y z+&V@^@57Lbl($uIjW(+O?+jw8ukZ0L^u9aZC}>5o!oLVfy+_nZ7L$lwTc)mxWIfna# z+kuvj!hrU#hnj#ZZ64VGvkcbJU#XV?%vXIVVW-+E;rv#0?Ifi{3~9p(gMp2kI^cYh z_l2qzbG5S})t4TLJHR*>HPs(ifkQtU)gw2Ap*J}`>ALXIR^T5Ueaw}q7jm6)CQ7Ej zW;?GZrbhfV5=X7yB_0%cw_kGX)Q2@}-f^@i@QcaGlPq*U2sVZ}=0}Ct_^235fl;1U z6>AljS1x#)tTxxswNXD$Z`0t#dSv@-hFmz@ zw}she&HrL+bdR*VVQc@2IHpA+;}4u#mkj7g498PxRM6m((9pQ7abr%{MNvg8i8H%* z_|nj0EY$!>V4IvG(_8{crnY~vQmgh*i&+rb6#m|>)=YSR@Ten!{UIY_M^cyCW~`1* zKhJeqd?27j;?PgDOVkVYk*p6dpaIl)r zUYLv%;+SCjK-aWjMZK0yDmASktt_j=a>P_4a3MSXg_dG8^=iA#ETP{czV%c3Pyh77 z6S?5I*2{OTc)gcXmJ0Zq7N*j>X%y%VA0qmO=Y?(8wY}WuQ^ND!88!pmghh*-+bW$i zY7BL+)YZ;<0%oqar{@tqV}GPsZ#9T0G&e{3-fbo(Hl(wK+ElOYa7bdkT3^SST`$bD z;$^;GveI;WeVx|SbhT&JyVUA*nqS(ibyYlEc|K6raXmOOb=cHHf!H&Cl_;CLW(BTPA zTxb*`%werD zrKE3&l=a+C3Hz*4eN#?SQnq;`t$ zSJ$765j@&SZtAyT!RL-&9|{hWh-`}}^c8PKXk_iuNXpda@JQ%QijPYOAY!E%B;wm) zc)r;Gy!2ay%zIPHkgOg(4Qf|yc&E%6hJUXbSmthEu-OagH=?!h(>S8n9*9hKF-d!N zVdEAz;lJOj@%efmf7gC7gK$Q~8})25CGj;@KdjUl@fc@IwqWJF>*;Jg>UlR$_(q&9 ze{0sB6r5CF%`H_oV(5;UHg@{%mjHj28c`4QJA7|4wsgNtZd?DIR2<2Y00@p@@ujJ; zIqza^(CFsFJ;qdLS=o2(5j7Q7Z&c~H{4nIJ?H28KmC7lsL-qy3aq@kR&DnDEB5dn& z1(wsbTqX&&X46i;YM(*%{R~-0UymV<+T_fm=Oq=2$m}?g2VC*ijJkzKEY2|x+S-~^ zn1C6asYQ{Q$(O!Z@UHA58>35f1QCPgOrL}k~|iox-` zzk~GjVN|9#xzsQ@e=OO54fT0@mgUW4{64YfR+J<*l+J#ADeAs%7pREU?h)|b3=;y|s z#7Y=SSGA%!iyHm-R^-hIKzW}%##gKCWfp6wGm`KvUaCN)nrX2E;Zu;js!#{vHK~>B zqD|YWNFIC?wS*```@ps*S2R%+_`UbB>th}fOFiL%PF#rAkLzC}oH6N9e2UVyOy3nM z(<(1db!t(D(=cW(#VgGRh@#v^%P zqz=-sIAI!X#cm;&6e1V`!|z+=NKG-Lm=Oo+j{6tBJERdsM6xpR6*Mf6?4c(gn*y!q zdhbRwhWH6NH8E8AII$;(Z4MgRalP-&lx~Nw-|yyGgeen(G|F5B#+van@V>nYZVRgw z^Jzv{M_CJ1U0vjU262`nlxK*#D@~Jf6MGyBon~FH@a5eK3LDP|Pgv4B%H4`=eJsW6 zbm<_(He#hRmG(j%ytx{3!kucP$oVNh)_}%oa2GM)3K47Z=Dmb*fHU7AbwC5<(VM%3 z-s8_*DbxEqNar$|@wi_Wt%_i3K&i+l|;K zFkP&N(hT0*_{M=eH_g0}*xe`fizD+NgxTH)>%M7UA}UlPh03ok_6(J5zeQueipi{R z;}5--gCVyevX3k!XTr@Nir0S*dPinL_s-}XkXNpJBfnhK@hX6JtAyJ@5otIU^`c*z z;L{9w%x69g!_+Kfd@gKH_r{x4FFYtzR;oyI=_|H>MlKi&+3K%z)<6zPp3vGqRyvD0 z-{w4yC7+DqiRK~VRt)7G4=>8`!k_9eSIHCI_RgO7l|4Q5^^T1tkY}cl4f^pz-klNU zfSTD@`y{Hfmf2Y#!vK19d=t(%x)QU455Z}Tihex&kg17@u-GfMVf3Ohbj#(}udOk- z4o|F;_&rO3igq<+w9A0tM|q)m<$X36!aD`ZS(mdJb4Dk&lF8@wR=Bs(hD5eK4m|0h zAT}4K6qheQ=EQ|j(r3TDjIv>^W?p9~$v<4!N>n>o$8wW1ofw0Tq~$l}pK>rg#B}eN zIQObv=M*dPn}LrvPAc|N_N-+cctycco`J60Jj!{l3p1&Y_gBY0dbDJ`+tFZb?+^Q? zZL;g7IxjnnU;DXU0zXID#)gDTdxaBq)K3sF>&~j3OqugtN_)^tucN)y^RnIaDb~q) zHpdgVG*hkMd?3$Mj3L&OLFd> zD)QJ-dw!H=Wok;89tUW2BYE|bIwZa$@6Wou8Bu$@YvXzeDpZ&0FgNMDjw8R7;75H8 zJ}R>D$9R|{8^14%=p^qD&Q$bX4E(%i?jm}PIcCW9;N<{+*r&<7^|5e%fv{Nin)x)U zm2`%d`K7Mp6oQ#f$*d{7t^Un&wsicd$E9M=P=$qu^C={J>Wg{0^Q5Q`>b)$fz^W}u znM^|cPxrj~&3dau&WM3&yU_YvqYnyo1LeYFFf;oT!`K7yOH^ms@H60H?ntZJtroZg z8@Ik$X6XW93(UiY&IB{f{$?X0r>eOxmi3a$qXX73^9MJeu+Q6#(VyJ?dEkmWKGo)A zF5um8M~f^$Jgpsqm_=tPJd{W+MM?;6@vf&V7Hr~fd+vqbZWJ1%B5QPr%0wwa@-7u8|TX*11j{6l#eaB(atI&b0;IN$vjzveZdTExIp`ktA!)Z)M& zo->8=ZPs_0!9m{#R)Tv6Wim9O&Y^%{%EPI7yvHil$ihOUR)N)l?N{k9gu02*&LHvi zx?$VTl20s_%w7%339_>}56VaDtW)4}CgDIw#_g*^*@IJN+ z9jcZDw`fkMc(<0)8D81ZMW2vw5L#2%Sz767G|YZoZW?PqvXqTATF5zgvRGY%TECpx zY}+8CAe*C*wukH&f{D>5zfYfB2mV@m6tHF3xOvCgCOU(dtw;;_jOg)_f^(`PyHFW{ z{P3(#%e66|g2!D~;5*O%@uG5;#Jgjc==kQz(>wE1p4RBz57%x#eMa8J;Blgxci*EO zBqPXi_E8X)US3GQ$+}JvS+cb0MI`ly(!Sb(ShUsK9rJaFgLWo!^nZ-=?<1bg33{=_ ziT}6_79#B?S;qT}e~S$F9DP}UaV996_p!|q?^m{*V^DDydV#;Xdk{&I5XVH}jT<~{ zm)FvofTKWI__c_?Hws&@r(d>r<2&Wt*Kf7>m)T+G=s{`~tRi)M9pqeqThZDoB0gl$ zWH7rzfiOI;nsTKnjs^=ZKB5S0Hv=aN(>_|RNe@vxJjs$BM}Qi7`}Z3?hFVp;&M6CM zscyVk7$FU%G8T~_vh#EF$6qWSurE^#P9JYFF@(1|ZG4&>cD{$O?c**Lx{0F8vDo~s z{z7BF;NKJgi&357J)Q;MpN*$>EGqNWj;usfC~1O_5%?o$hbXT-Ah!)Rg8_aQEfEcr z+a1k{h1DL#uE1BMl7s(24r7%a{}9b12;o-Q)g8k&hspw8v^7Ia*X+HbXnApOSU55o z0_=NYd6f#MU=fB}uch~blvLkUjKaRS{oXF>kh+2+>3(nIaAL>dt3Y3J-$lqVLqv!S z%4L{uksVVH1j0vPAqU3qWAes%u_RQHv}4%TO108jc#8@rM7S<><~E?LW++7-P;t%+Am}sSxC%25<$`_PIG6gd=_&Y8-l>`x8n+tO zZ%sfcO))_RxrzSkrV6sUi&OI45k#*nJ9`jQi1wEt@26*>9S_(R*lus#PDMhbJJk>+ zIE&Mn@dO2$O5_i?5x+(tXx*3u2IEJpDWC}KfK-=22J+vZK~CrhX^wXzTMCU1eSYTE zwUA!OK==ErpbyVxft^97p2ixwIKi154pqQMM$_Mh_ej*CyBbBLJi>B+*gq8yFXUUP0Z46o=VhR`dYVPe@_RCZ$gMgjIja2+fJ*bv1-%>|_Ch(N+^G5|9TW}!uvY?UsTk?T9mP~-uW57TX=23I1W0zzPt zJ{%e0MOZfz3w~D3iD9q=9-;PB!6Yv&LbVGq%RM6ab$oY`udxmjn- zDs2vq=QNJ$4ijvw6%8e=lU3$t=XB;Lv+L?DBSx*IBb){-t%}VZ=Gp44CgyUrC8o#Mri!)#=)uFA2!y;;_l7iyq>L#hua+``ZIr2XXa1t$L1wU*oh0;d^eP-`9 zrxzENNDAbqm!=jYjc`pC*Ekm)Ee%DrlT}vN7QWystgj(m{Yp2Rq>9MWmLyQesF*V; zdtVY#+mdTzK~v{6D_u4F<+E(1md%6D_3&1ltdG~T?$yzjtdIWn0ev{%OW@=iS_gwf z^!;_B$Xj{`OU~@mHMVHx>*oHnF22`eSnihWm8_2&%T=Q25O?M)%)zuSvDf2=ytgxN zzkoK%*Yx(mL3%@z1f$oVZn%c6Q49C@Ed#`_TY<9OCpDPiKG*d>7&-o= zhtHfm)C5y(70VXSte@_B)Y#e(sm-X*%a#$EWXI?j zWOX64>C}J9<$Pq%w=hy)t+?o1?|(B;+DlF!s_j|^Z!eBKIDztt!O1bCh4BdiW>de>@;B||sn3=AWdkj9lQrn}ilH^w*N{enr_GurW6XnqwtT0fm zu|&t#HJ&F-d;%uFVL21D5HgYuRS1sVC_tuy7Zi^Poyl9mR!iTi+9mHoE+sRsvm5)y z`;f;o*$gh%*Tikp{hPv?A`Y2(VITISEvrTEa*N#fEMl-U5cG}T>@ymX!Nk0*q!QY| zJ^py^RqD*R(r5bgNJ$PxwdZw9bGmC z?1r>9@e+(XD==QvEhrvO&dwpLe*B8=%_QxyUXm4B;Stpw+#{!b5^KY!hFeT;u|qO( zK5FAXWOrA_3H8uO1PS$j{LXWO@lhJ<9`H~a_fpuEe?+{s&J(7U;O%V>!#qSd4n7t= zR=*|4%ZOfI7dqs;9(fHp%RT12vQ_6oH8t^u@@`K^58{9GCjvtc8?#Hb zvRAuo@*2V(ZKXepo*&Ksjvog{f(HXkD0ZlBYnEQ5foKu$`=uJGZYnx9;vUOzrjA3%~N@KI30 zKuf>B1(|Ay$Vv&Ryo_)$mW!%Po6I?8o2=WjYAa2%s>f>9#V!tp4b^f5fAVOa%Z|9R zPHo9To8xM(Tfitz&1Z`r2!Mm+kkn2bULsi3;haq#8r*omiyXOI=-bk;=J&~O$*L>$ z^(iQPQOq{@K+tXI`J0O_P+xyFg;A^S)1>u?n%3S$?-HBK?Cer#XtOb4X>`2-B5O6s zT=GK*W6ij^q4AFhZ|{{McaJo4zTokiHfodWtjVHk8^`NSwE||9!=i}zjE7}UsYN1c z@XTCq8Y<564BsO3kL0j&RO{h=(spV{cAcRWdX+lSse8!16w%y6*U#W z@L{Df?1lt`yQ(Q{>fW#SlW7H#smsU&sRW@Su*pW=K_Fg=?acloB){Zn^6eCN;0M9} z?5Fsyr%PML*Bq@dTuFELHrVgHukxbSv=8Mq)Vn2tc5c^Aa}l%^0$#FcWEki+3oBTq zcF6q>_(#PsOf1##evx#GAlpyF#gJ;k$~7t#c~$DZ{Mm@xnJK71+Wy^&r^TO;`Mw0p z`tIQchdJ6ljN#I*oVb|?=Z8mwTf$Vdi^W2x#ZZF^o5srJ$(^%LrQ20~61qYz!aqe9 zk-*#)^(8bW8ER>HPPkTmHq{yf6NeW&YCcRzWomTEAE}#PjmpE8lS+bgu0IQoR`~~1 zHx_*Rh$}pu0$ib8>V{w~F<*JzKKnR=iH{WxJ40KhhuSocqCf4@~XXdX9;-*1prc zi0R0yp|;rwO3AocHN29abL!|zlHC0asu=i)DE&1c-}!5dk=ODIT_5UzLhMMC`5A{P z6TSG3{VoB!Tt++>RWw0U5;TyE%D#AsY#S)Q23^u7SD79FD5Y@o{Y|3(Gm6hpien+( zz!}&Sqiqf!22F>hZokdr+dP;?AhY$!2G4)gIhl71EgaZlbtfr#;ywynM0trSwX46K zv&15_|AXu_BgM}{bN)9A{L>{K>s%OG4ucOAW583M%oXIGyTwDVwoo&TkOr+fw1cOQ z;7Dy`LoFDCJg-*)|MjL?soyeA7hR(fl6JpQcl4zEywz7ZH*RBLF zpU;?p{9&*eZl);Fy2o3Hl-^);D+;j4!G-GB-<+N^T_Mo0QaOiXX;%gT#zjekA^Sx+ z9xTJvN`!n1N0hzTA0+w~)I_m&7NS6=tJ`@!Ea59h0?`ekxi|-%It9Zdy&|06X%7W} z4p68xy^(=N-=>nJ1Wgk@{DtcNjD!7dQW$1Si}^W|Q?(AXm!@Gcj{G2t>LJPo!AI86 z>W#BroRXr)Mur-W;ql-&IuqH)VOARB0 zJO}&m-nmza#V*7A{!xUPy6dZYbSPZat>-mv_EO}Ue4Sw+G=@u7P0#TiA*jdE z0AZ;%!=@ZTRreAw*KccKrq)CKDsi^(Cv(6KJstcQgE@hz2S82Ms{-BR( z6BG)o3?Qu7Fk#^(7y}mnKYy?xy^5Tdwgckvv3bhjWo^`R1tkM7UwaP5lJr%8NWI%` zPxW+8GPuI_q$@{)N{b*%T9f_bsYKCw+mF)L7F|WS4%-AdXO)A?BCA5n*5?QPyIT$n zDlRzfrmmsNOfA1#xH^7Op>*ikbdPuO!*=9d$#(HWcf9w8e&)IF-w^Hk;Y$S1-?K7e zS{@dF925JntHM@Ko>C+paXP|Kc|???U4b>C>jIE}%txk-R}0U{<0ru1LWk z9Z*PN{}W&tua?_m0i=po%kPl@s!&vGL5wA6lno$&u=pam94&~jL>%RBrOmHEiFmc5 z9vGlR9EFNOI{+29ODUfhP!OOE5d`j1F^VFl0^lS@R7i9o>JrvTpP)dBIj?M$+01zdT;u$FQm40QtQ~iGMU6#UL3X&_~S=56d(OTKl z5$7rk@u9jv1HM-Dv`9QlLprD~A_2D47udixh0PFvE!Bl7@HOwx@`I}ogrDNV1L$4Y zV;g@T0q9WKbOP!qZmLSOmh@0dJS#vFk?9WYLPVF=7n z+*Ai<$Zwhhz4Lm|;_v+c>#Ca`5}vA%!npfLfGpL8H84YN(;i5uxG4`Ll;6|`5-Mzd z0}{$@dIS678)YFL@r|O8mUxabkQRgyXdtnY-;*xkDGA|@T6EXOq}Lp0+W z)gh^I_dx(%)lCtgjohX?utIUuQbMb=M>YOF6riiJ`5D-r*E1`jmERLDp_SdUBjG6z z8IEgIg3wdg%R&C=%Lx>Tzb^wMQrK4muqo}c0f&p-vo*n`rf90;gcTXQTD%^O?zeU~ z$G`uFjNo{=Gop$$eyuj2M)x&lKH|{ZN zA2#Q%_*gg2{l|VL=#BW%gQo-Y+J>hC_L^PdC4QeEr=j?mKF&RC|NUgD(yeI@zx*x{ zWR3U{cJi!v=#1h);32U-6?g43%9->3`2&eqOnXg0gsO~wZG9kNF zQ}637cuiv5E^y5=`2&)XOEnJQ6-t)HdLsGYeA1&=xmVjfx+Jw2K)hVrWZT!{zvcG7D2Ob7v?fO(Aq&nv?c(Tq_V9 z^?4ew9ib;5E>`#^+(4O-z2ALX9m>l2*-4 z&OQH>EzU3jX(eOK$v#)qHMrJj1r!h-MGapD7`2El1I$>&mI0ZsF$;>n7(vNWz8ImK7;@cZn?Rz{l&Pi3P>0u$ zU`^~{?(O;1&)mCKMod+>8&(wj$Bl8hAw_ZFv|lv@lmwOV$3N<%$Ziv-UJaOF;7$*O zrWZvmjRI4Ij7X`7QwSPPp?3Pq3;j%b%elX~fto;^phb}S3~}sdHA-fGiu3`&I1yIG zViP1|GE5)_5CD{uW}an{FNhux=z%0zlH|_Q+0k@c@~8Z+@p=COea?4Izugwx>I3rz z^@`dW_S^yOBfQ8h)`9X2=lX%;`!l;Af9|oY*c*l`>^#?u(xMwk0-J@Ck^^%h#U5utQR=#L~w-AgfiT# zuYZ23zUpc8yb5T{c=I!$t^_}O_<6`Vvk#$?(xZk$Gj3GOc-?Xd z*BEkxOXrw=yaqPwvHm=W3yH*8C-@yK;(GF4m**FXRS#EA1kQBMX%)8v#j(ln`g1=! z8Y>OF4;`P_su*N3-YOCFHTB!>*GD=ytg7cSPf?qK(E?Y5y!=nKarCl&uJhUzt*@#c zfS=GFR&d*qGP^Vvk8YK28(maR?J}Lsf;~KT*zSsh>zkPCm93cboK&5g8gdR2hGZI^ zl;J#w58)o zH^*hVqmRQhiuIN%=5D;hHM{0+eOip8t-E9HgUWS1k}3n{%UAfo8{8%y()kR_q9;kZ zju70YKF%$bC`Lr5)7_?9_pNTN=d+kp+`YtVnZ1IGuXF8`2{XFOo|ZTl@0_7Uy%HcDp!vqZ^5#BAIW zM{e1$a8R0dg`(Q7WwNd5XE0G6pCzl2(Rp|{1lgY&)_+{Ap%AZU9@IP;-)^Mr71V3P z7*dHC#yD>LEi!5Sdu{#LW4Mw>Ntbr~Qi_FLzRK)Oivl)n#oGW6T9}){_}ISrJr^y& zX{bZY=?Wv-F;we<9U!dnj2msfn|6Uw-b>EJYZ+j58oq(4oGicRa})e#i?-46CiGTO z=>E$N`U-apVGNcJ79VO1ZVdVs))dwh+7#x=m)WU@EUF%%+&eR00*ZILFkmP=Ij9l%OFwl1!l)11 zP>TYeq7c>mkckPZpv(oRqF}zliVKiLK_x*e-n?s%@P*!hGUSWo@kZuA@_g?J)r$7~ z!4>VE{2XoLth1$a-M0hg49XVP2YL-A63Q0F2mbZl^WR+MvEkBb>$eWo0k4Cg1MB@4 zO~E_^TygK=&cPeN4bx6pzf3547<#C2Sbq47zqkoH1K#`HGx|OBxz~nAr;A^q?-`UV z%sPAq6d?j3Ea6{V1WyP<_>K_j6rs=W$@dQh`L_6V3*h6x8gy!Rrgt{@a{CH(o^-PN zE&FQvY5J!6rTW@J=_1&?vw^BWsDN#U*ZSazb&qszyzzNMvU9dmxih{Kt8=Fl+i%!c z(vKbr!EYQk6IR4;93~M88x|WHn@RTuI`U<_`i=9JzyEb_;SK*O)9?*^?DKz5bi?Vy z?X-utOM}KYh3fxbjs8=CGw`ujWZQW49p^1y|JvTd6TU~L;h)5pw-fMv$Mf_{2e0qj zC0ZD$fOn*a+^6vVhuSMz@Zs?Jzu!&ruLmXkeRPPdfrX-@^RRn;sACeW@h~7 z@d?cAEdR01 zuQNpgk$2lgU+w~)9Et2VAR_SLw!}#%PrY5Of(|^rpl#b*@8#&cuJWD=pBj=+`hS)c ztJ}pjcl$q!y@K2_-(F}vwz)3h^)7wNUm;oU{Cnl|IOm_JH~pNnG57k>N7qX1l^*S6 zaqUPm(NW}LPH=H#c*1Z*t4Yv^jqSK(<73grNAYJTh!p!}sLg)-;A6m!>(hSiey{x| z^U}$3Vev2PUQ5{suxiR}*w*~MfAW^B@`{Nn$+_PaZJ(Q89ml@gfG@gYR^I$wyBw=d z1-)>ry{Qt`yON#bOGHAd^p!2=g1h`St@*2wDCNm}xxs8ziH4)6ukzDnyB(wVS9UO$ z%6+(V#N+)L#`@iJKAo~Tqf{9+zkQ|JCPn6XFQQvLKXDegZGD72YBBc-85j#%6 zm4FVbi177+obmuWtpX8?Hue?3vnR4sOB9k)(=+pImEP~(ucVNfVyhHx>TBRz;GCL~ z=}}db8eHWUl3bOYY?-2DZ^va*VO5b^kPAHfLZKozC$HG5!d58}cpy%y6_5=Q)>l#h zD=EpgRdNJLC`9-MC^+XAr78dqpU_Px|@GwYQrE*}O25ZL(`J6lSE;HzO zK~R_364qljG=m+?fV0PJ9BL3?5Nxn5V8h0X{3~@luJU{M`0HN#e$Gd;FU-J%)h@jVQ-_)pFc#Urn57M^%z)T4F%-0S0JH)dgdtjoKXvGqXi_MEoy3_ zPuH_W85;lppD7_JamUWU$cVABW+D%d!mNqB9{~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n|**`Bold-monospaced`**
**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**
**_(2,2)_**|\n|Zelle (3,0)|~~**Bold-strikeout**~~
~~**(3,1)**~~|Zelle (3,2)|\n\n""" assert tabs.to_markdown() == text + + +def test_one_strat_text_the_other_strat_non_text(): + filename = os.path.join(scriptdir, "resources", "text-lines-tables.pdf") + doc = pymupdf.open(filename) + page = doc[0] + tabs = page.find_tables(horizontal_strategy="text", vertical_strategy="lines_strict").tables + assert len(tabs) == 1 + assert tabs[0].extract() == [["AAAA", "BBBB"], ["", ""], ["CCCC", "DDDD"]] + tabs = page.find_tables(vertical_strategy="text", horizontal_strategy="lines_strict") + assert len(tabs) == 1 + assert tabs[0].extract() == [["1111", "2222"], ["3333", "4444"]]