@@ -35,7 +35,7 @@ export class SitemapReader extends EventEmitter {
3535 queue : PQueue ;
3636
3737 seenSitemapSet : Set < string > ;
38- pending : Set < string > ;
38+ pending = 0 ;
3939
4040 count = 0 ;
4141 limit : number ;
@@ -52,8 +52,6 @@ export class SitemapReader extends EventEmitter {
5252 this . seenSitemapSet = new Set < string > ( ) ;
5353
5454 this . limit = opts . limit || 0 ;
55-
56- this . pending = new Set < string > ( ) ;
5755 }
5856
5957 getCT ( headers : Headers ) {
@@ -191,8 +189,10 @@ export class SitemapReader extends EventEmitter {
191189 { fullUrl, seedUrl } ,
192190 "sitemap" ,
193191 ) ;
194- this . _parseSitemapFromResponse ( fullUrl , resp ) ;
192+ await this . parseSitemapFromResponse ( fullUrl , resp ) ;
195193 }
194+
195+ await this . checkIfDone ( ) ;
196196 }
197197
198198 async parseFromRobots ( url : string ) {
@@ -218,23 +218,61 @@ export class SitemapReader extends EventEmitter {
218218
219219 async parseSitemap ( url : string ) {
220220 this . seenSitemapSet . add ( url ) ;
221- this . pending . add ( url ) ;
222221
223222 const resp = await this . _fetchWithRetry ( url , "Sitemap parse failed" ) ;
224223 if ( ! resp ) {
225224 return ;
226225 }
227226
228- this . _parseSitemapFromResponse ( url , resp ) ;
227+ await this . parseSitemapFromResponse ( url , resp ) ;
228+
229+ await this . checkIfDone ( ) ;
230+ }
231+
232+ private async parseSitemapFromResponse ( url : string , resp : Response ) {
233+ let resolve : ( ) => void ;
234+ let reject : ( ) => void ;
235+
236+ const promise = new Promise < void > ( ( res , rej ) => {
237+ resolve = res ;
238+ reject = rej ;
239+ } ) ;
240+
241+ this . pending ++ ;
242+
243+ try {
244+ this . doParseSitemapFromResponse ( url , resp , resolve ! , reject ! ) ;
245+
246+ await promise ;
247+ } finally {
248+ this . pending -- ;
249+ }
250+ }
251+
252+ async checkIfDone ( ) {
253+ if ( ! this . pending ) {
254+ // this needs to happen async since if its in the current task,
255+ // queue won't be idle until this completes
256+ setTimeout ( async ( ) => {
257+ await this . queue . onIdle ( ) ;
258+ this . emit ( "end" ) ;
259+ } , 100 ) ;
260+ }
229261 }
230262
231- private _parseSitemapFromResponse ( url : string , resp : Response ) {
263+ private doParseSitemapFromResponse (
264+ url : string ,
265+ resp : Response ,
266+ resolve : ( ) => void ,
267+ reject : ( ) => void ,
268+ ) {
232269 let stream ;
233270
234271 const { body } = resp ;
235272 if ( ! body ) {
236- void this . closeSitemap ( url ) ;
237- throw new Error ( "missing response body" ) ;
273+ logger . warn ( "Sitemap missing response body" , { } , "sitemap" ) ;
274+ reject ( ) ;
275+ return ;
238276 }
239277 // decompress .gz sitemaps
240278 // if content-encoding is gzip, then likely already being decompressed by fetch api
@@ -255,23 +293,18 @@ export class SitemapReader extends EventEmitter {
255293
256294 readableNodeStream . on ( "error" , ( e : Error ) => {
257295 logger . warn ( "Error parsing sitemap" , formatErr ( e ) , "sitemap" ) ;
258- void this . closeSitemap ( url ) ;
296+ reject ( ) ;
259297 } ) ;
260298
261- this . initSaxParser ( url , readableNodeStream ) ;
262- }
263-
264- private async closeSitemap ( url : string ) {
265- this . pending . delete ( url ) ;
266- if ( ! this . pending . size ) {
267- await this . queue . onIdle ( ) ;
268- this . emit ( "end" ) ;
269- }
299+ this . initSaxParser ( url , readableNodeStream , resolve , reject ) ;
270300 }
271301
272- initSaxParser ( url : string , sourceStream : Readable ) {
273- this . pending . add ( url ) ;
274-
302+ private initSaxParser (
303+ url : string ,
304+ sourceStream : Readable ,
305+ resolve : ( ) => void ,
306+ reject : ( ) => void ,
307+ ) {
275308 const parserStream = sax . createStream ( false , {
276309 trim : true ,
277310 normalize : true ,
@@ -315,7 +348,7 @@ export class SitemapReader extends EventEmitter {
315348 }
316349 } ;
317350
318- parserStream . on ( "end" , ( ) => this . closeSitemap ( url ) ) ;
351+ parserStream . on ( "end" , ( ) => resolve ( ) ) ;
319352
320353 parserStream . on ( "opentag" , ( node : sax . Tag ) => {
321354 switch ( node . name ) {
@@ -398,19 +431,23 @@ export class SitemapReader extends EventEmitter {
398431 parserStream . on ( "cdata" , ( text : string ) => processText ( text ) ) ;
399432
400433 parserStream . on ( "error" , ( err : Error ) => {
434+ const msg = { url, err, errCount } ;
401435 if ( this . atLimit ( ) ) {
402- this . pending . delete ( url ) ;
403- return ;
404- }
405- logger . warn (
406- "Sitemap error parsing XML" ,
407- { url, err, errCount } ,
408- "sitemap" ,
409- ) ;
410- if ( errCount ++ < 3 ) {
411- // eslint-disable-next-line @typescript-eslint/no-explicit-any
412- ( parserStream . _parser as any ) . error = null ;
413- parserStream . _parser . resume ( ) ;
436+ logger . warn (
437+ "Sitemap parsing aborting, page limit reached" ,
438+ msg ,
439+ "sitemap" ,
440+ ) ;
441+ resolve ( ) ;
442+ } else {
443+ logger . warn ( "Sitemap error parsing XML" , msg , "sitemap" ) ;
444+ if ( errCount ++ < 3 ) {
445+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
446+ ( parserStream . _parser as any ) . error = null ;
447+ parserStream . _parser . resume ( ) ;
448+ } else {
449+ reject ( ) ;
450+ }
414451 }
415452 } ) ;
416453
@@ -488,4 +525,8 @@ export class SitemapReader extends EventEmitter {
488525 getSitemapsQueued ( ) {
489526 return this . queue . size ;
490527 }
528+
529+ getNumPending ( ) {
530+ return this . pending ;
531+ }
491532}
0 commit comments