From 75927d9477f061aeab77c0f4a004bbb8f8a453a2 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 7 Oct 2025 12:40:54 -0400 Subject: [PATCH] fix: reset table bboxes on each page in hybrid parser fixes: #632 --- camelot/parsers/hybrid.py | 1 + tests/files/hybrid_multipage.pdf | Bin 0 -> 12137 bytes tests/test_hybrid.py | 7 +++++++ 3 files changed, 8 insertions(+) create mode 100644 tests/files/hybrid_multipage.pdf diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index bfa4cf38..5bf70c0e 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -247,6 +247,7 @@ def _merge_bbox_analysis(self, lattice_bbox, network_bbox): self.table_bbox_parses[augmented_bbox] = self.network_parser def _generate_table_bbox(self): + self.table_bbox_parses = {} # Collect bboxes from both parsers self.lattice_parser._generate_table_bbox() _lattice_bboxes = sorted( diff --git a/tests/files/hybrid_multipage.pdf b/tests/files/hybrid_multipage.pdf new file mode 100644 index 0000000000000000000000000000000000000000..3d0f256d6fb46d08fea7da8be9044bb3aa0a5f2f GIT binary patch literal 12137 zcmb_?WmH^S(sm$Nupq%gAh=6o-9V6Fjk~)x4h=LG+zAfBJxD@u2p%kGkl+wJ1b26T zo6Nm4_s-0_-tWhE*6Kd{)G67!>ZxiL>!DQ=2eWdp0nuqY@;jP3YCE#gIVk`XP!nr( z0ReV7h`oigB?T8;qQVZgvUP?yv4d@mogv~7Q>Yn4NC+L~>;y5kLwC>ce%Ei^0l<+s zzY5nTDjDqgff4joAzswg3QIp|@5*nOFiqa4HE&A4GpW;$rLEa5nj0+bfZOWlhnM(R zjHa2k!sjY$p{t+l22Bp#-9t{LaIrm95GNxDI2u69S{ee)sY%+sKg7uDGVWXB_YF6G zHY5931Gl&vRKI{58*sQx9Gi>%$;1?~R(U{{$nab9RCMu+57o|>6qy11Y=MMil7v(` z- z8Qlo_YVL^9!l=%-wi5`L2|_tUW=^=~U#sW_>TNctep8fAE9R{w;WEuy8KDI+|N#$Pd4Bmg_Yk@R^gATDb`y*-a)kB`qAJY4<;N;_dc>7z% z^B+Y3{y!8U`0quK`ezZ4^@Dc)EW$Fp2yuESEEs#uuzXyZfT{}wq=ZibT2v%2{4th2 zJn-4w4p<$OzK$(ZB%%y9O(V7`6El-(?bDK6;B^xh-q&Uyu(+B6CpX1Tv{NWpr`}H| z7%uvVy+%K}7NVlXSN_lq9+#HS29h0VlpM=PtAMYZVo3sY*dnk6B{is=l~(;?Z)kH& zLqhItS^b9wF$V{!sLmK)_K$ygzn)a=KfNfHIDuN&_rt|{!|5kPrr0b+**=K@dU7o# zl~AvF{uDAVGhrMVYxis29a-Ar$q{%cQ>R6SQ*WYDJ_Y%hH;{pmXhiJ4l;I)454q<80RCA9-v1~A-~UpEKYE3Of`bRZ z2mEVita{kE>vq*_lr*1Kn{69QC$Y%R)S7&2d&=jU(HR?>dy$A*xir;=PLpKYTiI6o zOje<9yYsa>pS&r>hu4hta|upX=$ucu9T-!9Cyu%@mar=k%7Q)0>biwRTuYe-?w>6- z=1@LYseZRXx4Zew?>}vRl%#5xYgd_W znl@=hTx1ljH$10t#TGaTv>C+BW0l;Ck(InCBHUEY%A z3vuE|%uXTCm2cIz&nUl`QDw&4*;p72ez7v0OtI4CQPS%sf0+2BE`vOTz(_QT zV=r~w6;Q~rWOo2gW9|z^@q~Iyya}x1$RMN=eJ&i12ZeEoy`i?EWXYF3z)uatF1VGN zhT(oN%sm-W2~gG&(HLfI61}SND0g^KYq$6M;&n5KCSTa-AQ4j|p+W|zfsRp$x*YDr z%$-l_b{-83ijzzkhLoO`MM2aP;=@?T0{DYQKm6=ON+?4_arl6htmq&fFJB-bO-eQ5 z>;RJ#Yp$vQs42IwWiFRm$eXc}a+d0(YgADN3-$3&v&$tYM*E!B$2kZ0pD07Qo#Z9c zVvH7)Ta2Ag2CEBMntOD!4V*hjUdfQlCVOs^jzXNru}Z6sZR-KF`H5ZNa=?{lz>>SL zyRQ%(m8n`+cPk$H$A%g!%Uy>2m?~TsLH?Nyg6EPQd2!2J)cOW(8)TWo+miTLpX~8= zzEVaHHB~I6+TU0+*kPWxuTpBCbpOEGoEy+|%Wqf$=YRr;Z(3(`>*usB(>^@8%QFwK z7Fm5Cf`L)^{Osjs^9Em*NlQLn3J_?zQdq;H-wce z&r0I(%cnX7o)+PED@rlTDoIb-*rT41 z&OpX{nE0|~vY%$~iiGvDjK6Ut=2r-;qhz^Xn78?cfTZROOI<;;BeU`qM7(=zTNx6@ z^z!U&(1?-u>Gu??Y*1XS`{Sw~A8dI}>^6<`SM06&XSkoZ^*4{UpwyNHg^V~fcOsJY zH#RK4QQ_j^sT6l0FQt@~`jmw?`y)3wkvq~=+;cRmf-8KxOmf44@Z&9IwL{M=E{1ZQ z7#3@q6X!Y_hoRN>R7F>#@o{bRDkrtdV=@KhIfoIzPwWD2OnxIB)e>BOUiz1!qSRDD zV5BC0f(wP>2p+A-PYxj;qDi$Vo&Dm?KE*|yP~}i+u#b&gHHjpSXZ`?MfK^ABqN{4B zwTYP$vcjFP(~;kp?BwLg*bQTWddXCRwnekv7v z>|!OVKh-+7Vpi$LX35@d(Dqg~jdf$>NozLS+Ry$?ik~hc%KJ&t1d-2|dj|$dH}uRu zBLJ(Bnff@Yk?EQoCfA+_&(%P5za4|WW9(aRyN5isx8uc0)AF+;8^Wl!06sA{BoTpLpkkJko*e$i&}ZwPOEAzu?Tu= z=c4{D(MS7*AC77y8b}vn0h~md8UwI`q zE%Vcu%Q0vhQ~T|kFbu-_uSU*5w9X8dJhHJlE)ZVBJh?t+v#=|TFV4IR zt|E+jmhmPVx?II-#)Yt-}WfR|Tr-(h3GTo+&l+)7Lr^O(&i z9knNR>qsF5&&4{3O<{?Ea6l#PF$-Ag_BeA!&K=s|Q1RpZdA2Y;@J_L^0H>jnwnBx@ zoRVs@b1DyNlLYNYT3H*Uc!F@XAMLcX@3k7-)W*^O`rVMj$phogtStI~Ww56qd* z5^03slD=v4jgT&+mU(9}xq>A2oZ8|^2yeO(s!1s0E2Lc|2x=bL;O8yrS8_9ycA<{1 zBBzZzv7fvO+Wl1*IKCP#Ny5~lq>gE0h6THhsmf9TdG!bMf?K4xrQtoV9np^+1qKol zCXVM?729pm(i(7=b%g-evvJcEn=jX1wcgHG`Zg@rE(+G|kown2#9thSuHT;}gM3fg z#~XzOMB<$&?6~%s_tora5K~X~Yd#&Fic6qJAQkjalQ=avAf)q)=^$rMk}Sd8h(r`| zxd}z2alLeqTGXN_mW}nL@y(a!)GTkDY;o;?2 zdhz7~|B&_YbQH0k&v$dw>m>bj1LWJz6DzA{*12G!yz#cfSdDl-OV1RLV8gTFIDzjX z0>}sQJ2X4INgqjki<%Zde&G4O?{NTXh0vxyZb+OTprij|64~n`OZ-zSlh&S-cb!b- zMmhm_Yl$UD&*H$G@3AYss|B>*n@@r|n_FK^ywns#JY_uO8sYj3ova;>b#HMby;+1l zdJ|liU8U)D*OZ57;JN9`x^ich`%Tcp=BscZ=T=H&`hay!#7e`_T$Yy|AOWhHIzWig z%4fzQ(|(ouh(TpO1vQd9#f0;NARsAjXH-Q-C<>f)YNI{Hhd?Tr(NHrZKA+ZWsjLdE z)8WOm#^7^YiDp*0wCl(N$?vz4dKGOvjxWDmG^}fBc``^YY(Am5PO_k;HpAR`GFmC` z>*K3)(@IOec8*;ccONM%ivCOgK2kc{W^#5Odk}LPy=o9L#Ip%g0@`4wIlWzy@c|l%FAzb#nt0CF-i+rHD-4Hc8Kh!y;-4JB^ufxSoV*3U57p0^=DJ4flMCm@{$>Y{y!kJCFqXwqt#pJ8%#2>A}*sw{#I}CQYi(>BOqVyk&X$ zZ7s$Zx4CoRNRi}}1r*AecU_*=KZMx*bUwmiKh>$D7gG!_luRud-B>3^px?Eun$?aV z#k3#ZNon=4AaY$FZnE(CG%J+tZZFrw-TaKWs1O5Edzbcn?x_u1Q14uBa1GC!4bfsF zk_xR~2|}?Qx6Wf=!yJq0chN5WouTaxJFyp=-W7SWO&0RJw7@i9+(F&SYXl*G&rlfu z;C!gz_GNJ^SFJ&K_T2ZE<8G$pX8we&Mipj^tTO3|V`dN#%){HYwk!678uKeLI-cyj zF|zE-@4krX&H^|R_TwnJc1BhHeJH-QZL`3OFf8)Awp-wav3wLz%Q_GbJbYe=b!Qp4 zF!SpT>74arsi{Y(yrzMwz>6%b7OTKpAj!1(<5sMsDIZz|5k7Mk9nhV&(56WsmrD{*3*{3iD zf_@bBrj&H=6(3G~pPxxEV0EWuw#orBi}aDzEEylB*O=S_Bn^TmhBXl;Ur&%8ELDX1 zUU;dc%EPLUS#M0&-45`dB^@WBI+WMN4&0%~>tJ~_PkiKD)G%IhN}$WCB95AFiye9Y z!(y>$=(8&)rLyd37V`&!@#Yl6)TuJlvf`xAEc>F*v&^fan$(p#1eILhoX$@y5I*)i z^V4ozsv)>3UV1+UBzsSS#?H$WQd}N)vrbHTv)gC>T?}}^JT6{1z8N<2iEz9rN%Oi& zjk{4M#J$n*srXY>58_ENt*2UvYN^KT_dILbL3x1i_itSOCT-x{zwF*ENJ<@itn+cWIOwE9?DXV&sz=86iXdhc76B1W_eT$4oh^F%u>d%CMMhMU`3dhwKA2#vdH9VLaj0hY^?xuTp zXtYJX#(!e0B27jU^QAJC4@M?0S?K#33n09R@#YuCB2VQe!o{WR!CCy6u(0d($@DAZ z`$TYWKk62?sqi$=}8GtDdl9)%-i1{LGz3Mce zUyld0Ezo}?&p0i9Gv>O{X?EP+H_wp{8q)P*NzRo-Ns=+#n0wj7YxF-Z}%L#xWc}3^k-QfwXss`VqH{`5pPy9?j1?ZG3g#9_T?^< zx#B+w<(GHF*EB}VViheNHB+&w>2{DR?skqxO%$CVQAvs6`&IE6o6o2dfe}@l?7hQR zZPX0MuZ$s*W>W}u*q_|KN~2~te6>yXPGy_ga(kWl7ztHt3e9DG3I~tu2R+%wYwXv{ z_j$XJK{{^Ir))2p00)T>maJYKHYB8?R!#r`Td@dQ*2V1!a}4>SFeX}m4ITTg4W?I@Qk}o4^sFx$X z>K$cOI4l6>hPLD58j@jR(G7SdH5h}pvuhfcyC=F8dBYYk11YZ8NwOjNYV%H|x;_QKUjxqa&?Tks#BK?GKw^dPB96f=LPoucty_E8o3gRez+cSn!8E;-QrGRKn zrrictU5lPFw9p-|vKp=g8*=vGN)=+pU~$GE*)~IT)ei>@9Y#Do08z%9mS=P%AH&t9 z8TIb?ZGcJb+CU^@LB*tYUQ*dDKAQy$aJGdFA8V#tJJ|A+D6%=|Iq~tB9?GzsI^(p# zYuci5dx3Cf^oYx3@~gXTsPLK5PS@m6w9rj6{m^X8i$#+Z+XdKcl`!r-zgQ>mu8I^t z*GasZtlsV_xybWpe8v)KsGG1s?DcJiPgNs_S7n4m~5? zA7g1pWsQ-w*m-NtMchuKqZD32C?~Fbl6>h`uCPR{KuVJr+Gv>=ZLK&sQ$CAE(sjI; z#w60Kk2Sz*OpG4Lr#rz+&^W7Jra9l#{`cR;*^phZ`=k|h54y$9F(XX&hC9D)cfaHv zt`_?8!Sd4dwS8mWv5~hM&wXF(9KUyCp2;_yDYv=$5Y0KFhHlVJnx->z!Yh<9@>fwW z`!^Ok0K!AGiWm493QymUq!MUEME5T#MYBBKc(ONPZX#%j+)9ANjKiE!EjokVTGyUJ zfTw|Ge!z_VdFYu9x#@wnMyAP?=BN?Dl|SNRzceGA8Xfwe{e8 zl$tcRi+=xv4mG~WWU=}e6=`9JJVKc2WtD{_nXNr+wnf|Fy#oaB`Qf6Tk7+&m*50D zQ5wp`*8589k-OR05we7+y)y0s2~*#|h--+`vwS3tYuVQf{`!rswyh5DODahf-)GDg zb1h=mtyp_N0}9Nmdf$dhMS}=OZ64Dms(xW9MsjxWe|?zl>$UF&&AwbthdQ8AK1%O| zs=Liq=kh?y%dG&8vjI$pkVBF^+{4hNXZ1M7gtMXTM$r86=!{$NwKUcQn(=PX3t$(u10ihlVwu=^_p$>e>vA_9v zS1vL@ekdkChEY7^gB(2Z^3cK8NiD;-N4e6`m`tmlToU=%l&fr3I44rUsqdxg=J#n2 zh)-EpjsriLhLEO}ipBP4bUij-nwb&@ZM`|fNj%MF=B=4^{vG~B{#E|vQI#7+ag}in z@hrzo7-WJGN3s{2QlrLCc?{*l_Qm%VWldwQ+bedi6`pJTueh3t7@BW8Oa_5Q=H=t- z>rB zx}P^m?hzN`kH*6-mJe12hO(NFb*{UH-q+^lQavquMqffe0Cs=0m-9{f5FaqQoPUw8 zC4K%K{#UE7>>>?DFWyq9{pqC?Od!Jt{y1 zkUXLh`u_HOfIf*q`q3crFHCA7j%?nmV2iINe^ww+(0o0~W^If2RA0k$tj8^zAO>5nIj^pq`TP9Y)N;tI+@5z~OX#fI z)A2pqp5~8dsi#oElORfF17Pi=)(qgb~!Q=q0y@zX+Dy81+c7X!XL3#(S&QoHA6btg#rQmQx!A@|>GNU1cYDJI|qe zb{|7|Wb3@cz6US#erNr4@#2PPA8QSn{u0r_)!5*sfVAS1<4_z7=X(Xb?rRxMIm+`l zffJxPZW;r(HrFpQl`Ys&Juydhslpj*IjQr&c53&9MC+7t;^d8> z7uiOEA*;ukJ{D8GQLi{Cad5Et=`&RoCS>xsc2^h z#qYV~><;XBLK;s>OHB=Zm8X}?Ls^NPa7=5v(`-Iy9`jivm-p8c`Gp$21$`$!wEzi> zqa-YjNuR&cS+jq$F|H;&^Bv_|0wOGN457}st=stUh4aZ<&${rXC}qd}tlJ7=gJ8+b z`qi+P3ce|!Y5wI4j6R%=J5&4oeBT0vZ3v*d+$9&xhf=MKH`=$ohWZy>DN~}2Fh;F}^yjGWddsMK3e)s5!lZ#ngK&-qLv}51nN=%&v`xK$w zQSo!F>6E$1jC?pv1LF2Ri zpHsfn*ip6bdU3X(&i*NC%Yqv`AUngfi?5xi{dDu>3;ik7T(T;=kCM-j3r!A@NIURa z__~seV<}~l;=uHd{%u~VJ#j`NY`f$;aYO}*6FbjloH6-ww;v{R>>3-1#yi_1Kg{B{ z{$7$H6DufAav?mOh>4sfw0t| zTZu*7J0=noDh79wHL2)!<_-bvA}y8<^!#bY4w-l5^{OH2ouwDGn8m$(UHzOTBDyaU zjaeC$T39=@=U(--_Eto*Ox}P)PFJ3nDuh(^*35j@<1-68i@Ivbgijk%6J^Th(h@;? z?n_rWEt$=XIn|zVO8PbRGM3@r+|IrQ)g0!CdojG*ig>DFUfP^eqGQ=bnMPgcWIS)= zw%XlNmLkq<8-{gyTHWo6eJ&aDgQn3maAYc0iBeC_(b3z+mO+|%A~Ay5hMn6$ZZ`QD zm8dlXUKEyiNE!Z;;PE9+h{>z#cZE|pRM#lMXML=RqJVedSRaFWjMOaCcc(_j#wN*3 zd-kT3m;yQv)!3apb`O*cIcFy2Eo3zY(E*aAx+<0P9E*KYOa!vssE^}l27@zdPO-Y< zSKG6Gw(J=gE7CBEoA$P322M@6Gy_^={7P{XAP#^Q($G0FX2}-M5R=}YEp3^i@4{Km z300coo)LIZBt75Sjy^dzjG-fyGAd`Q9?5D^V^LT>j~n z+6Ji{xLc(11c5h1QO4`L@x~Tt<*MLSi$r=WI_)*tI?EdiikslYrnMr5ShHR%uGd8$~Zb zu>P_r&2x!1W!=NQMFeYlH5gNeUcM(ov|ZSp6uKmdl*ZXdm9T#A7D3N5HL%&3a~_1r z(QHB3Mpi%jZSNUzZXV_dRQ&j<$`}^cwF+fz$t*)PjgoLwPX+~k0tU&Do?e4i7UINI zvJ@?Iy!UP5VNPaCQg>8-)=6BZmWq}=1qU82ViMX9YBZO4Ug_BFUYxuwi4wmsZIE`* zg0+eSp((BuzDieR^UG@tGuC#osfZS8+c{nmn#7cz(%aFIB3$dl221nieq5bU_>qI6 zrphsN*Q1$ZSuw{#jg!viq(kCj&`f^B(wv1;$|oqJ36)xM63>03sdd& zvjb~Q?-s2DduI@}6#JZ{`E5+~I&o!PDpC1zj?XxG+fPU!&4n>oeS(d{+d;nkF%;)f zJ+35Xu$`oaeD6Mp_j!feyMb44VQ7)Uy?w{)geh56a@+4dpjPm7SvKM>#N3x9YfeA1 zmPjj>HIVe2D%we;B{y0|H^cWi2b90%YQ!QWAl>!}QzSkq zJ6~>oTSJSq-RRJT{2+;EtRrb=qVg#4!&b7zJDNRHp)gJPx+ZfbNS62gs!u{W(%7%LQx5L_{zpgk$_BTARmJx1gI5DuCK%oCqgV`R z%{@5==*P&nwZ6g8V@e$E;)lthPyVF&- z@#y1FKWeUsU%`7C)JwNPvs)2Tf+Xy7J4N_)U#DaYN?SrhEmeniSX|M9c&KT#bMniR zCy9^+wUVJi>Y2LKVn?}RnWj@4mN9@Pe@;1Y}(~%G+cVN$DppJK5PS@Cy&0@@u z+@&cgx?}Ze*eUh)rAt(Vz<_+eUb1cfaIa?@>mo*=;*Fnq6O@m z*H}^cetO!zxUx3=>#%(ZM>WhZq>OS`s3X#!iwmNMY% z@lG?f^hN|QNjlSVKVRU*HcKVBicHDrT@YYdSL!M@Pqg`xVipG5aIy*iy@@; z#sR_(hT1!eLtv&(Ru0ZkI9m?>s9Y zq5fEk89PWptSl^@A575MRh=Pr8Wg;6od?fw@*aef;(^Yjf(|Dq{Yf+9;P}_}|0MhI z0R9iEpZ>dk`@jdPpI315E*0Xj98FL{EJfvpnEW+DC%H<#35yuvjkh3|6|)NhKl#F| zFK4_Zv!*s7v|cXp{2f$RZI-CV&M@>&taog2JoevY5)tc;DMCns zpgvH8IVSQmk1t>;amG2g=#ES6)?T7+^re+khgx=N=2|g;hiIXp{}W(zDr) z&-P8LiU%z^I2@(4lFIQR)2`@x7G*dI%4%Vst%@E+-qjI7r=2z@EY_RUJglC5nC*kf zWmxn)fkkC-f_g}{F;+6R8J*f*P(#Ba$d00Usc*Bk^Z9V-@fy~$IwkkwDYEOcs6i!? z<&pIY-!EY&GJ}RYaeTf3-Zbk&QXQ8V0(mPp$XWV7k4+VNFJVQnJES27K3^F@K3^Zv zwVO+Swi}-+_tOiY5146ufKOP#<2AQr#xJ8^=TXzWxe+bUmUkst{{?p(e<5W46JqSD zE+)G+Hm@a?<#K}p) z4M*Pt?sx$Icz(G5`Qd}V&)*fe|Ap#@lu&T|X0W2OD>*?; zRUyv0>~PjAyBfsZS^sa%@Gq<18C5d2fUqkWJ3aIl=phEX3IqmqaWaL#C^#NMsIbdJ z%&d$bn)YD~0O30U@p0<2i@{^UjbIeNclMWd`^&!mKSg-piK_f1!oxlvh7>$C|BL`b z|BsEM|7+)PgMV(F0$vbVD>Ilb#lu`sq4<5s2ca0$1)c=XzXam=3q$pHBr#+7jDlLg z6J`O=&+mMpvnx74>_to;#+~jzJhE$P>rmLc*xLT%3jp^mXKZi5VD7{!Cc^l)YW&H^ zgdar=PBMm1Qvt997z_aLfB*mxCjbECgWtK~cQ~9toI=0LYvy7K`BRTu>>oNnxbAO# zxE?p$9tiy71Gfjl*`5#fa1{U-+%KmD{LT&c^*bbd-{Nq#GlMukCxDZO1MY@{6OMmQ zRsc6W06@?9mwdQ8LCnzs6dWLQxTilZ3Lp;;7Y~Iw#Xn>m9Pls{zu!^V{~?1valn6A z9@Sy-= zSAjy|jr|+{=Xhnk96LQHL7E_M{`T;kl~oV;8B_#lJ>PZZ284o|nZ2!IP9Dj~tm c4SWDF3{EU{{tX8X4jw)(bXr;oMKJpR0CCgnegFUf literal 0 HcmV?d00001 diff --git a/tests/test_hybrid.py b/tests/test_hybrid.py index a33f71ea..4a398737 100644 --- a/tests/test_hybrid.py +++ b/tests/test_hybrid.py @@ -142,3 +142,10 @@ def test_hybrid_keyerror(testdir): filename = os.path.join(testdir, "tabula/schools.pdf") tables = camelot.read_pdf(filename, flavor="hybrid", pages="4-5") assert len(tables) >= 1 + + +def test_hybrid_multipage(testdir): + """Hybrid parser should clear table bboxes on each new page.""" + filename = os.path.join(testdir, "hybrid_multipage.pdf") + tables = camelot.read_pdf(filename, flavor="hybrid", pages="1-2") + assert len(tables) == 2 # not 3