Skip to content

Commit 2914e93

Browse files
authored
sitemapper refactor to fix concurrency: (#930)
- original implementation did not actually wait for sitemap to complete before queuing new ones, resulting in concurrency resource leak - refactor to await completion of sitemap parser, replacing pending list with counter - also, don't parse sitemap if single-page and no extra hops! - fixes issues in #928
1 parent 59df6bb commit 2914e93

File tree

2 files changed

+102
-49
lines changed

2 files changed

+102
-49
lines changed

src/crawler.ts

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2714,8 +2714,17 @@ self.__bx_behaviors.selectMainBehavior();
27142714
return;
27152715
}
27162716

2717+
if (
2718+
(this.params.scopeType === "page" ||
2719+
this.params.scopeType === "page-spa") &&
2720+
!this.params.extraHops
2721+
) {
2722+
logger.info("Single page crawl, skipping sitemap", {}, "sitemap");
2723+
return;
2724+
}
2725+
27172726
if (await this.crawlState.isSitemapDone()) {
2718-
logger.info("Sitemap already processed, skipping", "sitemap");
2727+
logger.info("Sitemap already processed, skipping", {}, "sitemap");
27192728
return;
27202729
}
27212730

@@ -2739,23 +2748,12 @@ self.__bx_behaviors.selectMainBehavior();
27392748
limit: this.pageLimit,
27402749
});
27412750

2742-
try {
2743-
await sitemapper.parse(sitemap, url);
2744-
} catch (e) {
2745-
logger.warn(
2746-
"Sitemap for seed failed",
2747-
{ url, sitemap, ...formatErr(e) },
2748-
"sitemap",
2749-
);
2750-
return;
2751-
}
2752-
27532751
let power = 1;
27542752
let resolved = false;
27552753

27562754
let finished = false;
27572755

2758-
await new Promise<void>((resolve) => {
2756+
const p = new Promise<void>((resolve) => {
27592757
sitemapper.on("end", () => {
27602758
resolve();
27612759
if (!finished) {
@@ -2778,9 +2776,10 @@ self.__bx_behaviors.selectMainBehavior();
27782776
power++;
27792777
}
27802778
const sitemapsQueued = sitemapper.getSitemapsQueued();
2779+
const pending = sitemapper.getNumPending();
27812780
logger.debug(
27822781
"Sitemap URLs processed so far",
2783-
{ count, sitemapsQueued },
2782+
{ count, sitemapsQueued, pending },
27842783
"sitemap",
27852784
);
27862785
}
@@ -2798,6 +2797,19 @@ self.__bx_behaviors.selectMainBehavior();
27982797
}
27992798
});
28002799
});
2800+
2801+
try {
2802+
await sitemapper.parse(sitemap, url);
2803+
} catch (e) {
2804+
logger.warn(
2805+
"Sitemap for seed failed",
2806+
{ url, sitemap, ...formatErr(e) },
2807+
"sitemap",
2808+
);
2809+
return;
2810+
}
2811+
2812+
await p;
28012813
}
28022814

28032815
async combineWARC() {

src/util/sitemapper.ts

Lines changed: 76 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ export class SitemapReader extends EventEmitter {
3535
queue: PQueue;
3636

3737
seenSitemapSet: Set<string>;
38-
pending: Set<string>;
38+
pending = 0;
3939

4040
count = 0;
4141
limit: number;
@@ -52,8 +52,6 @@ export class SitemapReader extends EventEmitter {
5252
this.seenSitemapSet = new Set<string>();
5353

5454
this.limit = opts.limit || 0;
55-
56-
this.pending = new Set<string>();
5755
}
5856

5957
getCT(headers: Headers) {
@@ -191,8 +189,10 @@ export class SitemapReader extends EventEmitter {
191189
{ fullUrl, seedUrl },
192190
"sitemap",
193191
);
194-
this._parseSitemapFromResponse(fullUrl, resp);
192+
await this.parseSitemapFromResponse(fullUrl, resp);
195193
}
194+
195+
await this.checkIfDone();
196196
}
197197

198198
async parseFromRobots(url: string) {
@@ -218,23 +218,61 @@ export class SitemapReader extends EventEmitter {
218218

219219
async parseSitemap(url: string) {
220220
this.seenSitemapSet.add(url);
221-
this.pending.add(url);
222221

223222
const resp = await this._fetchWithRetry(url, "Sitemap parse failed");
224223
if (!resp) {
225224
return;
226225
}
227226

228-
this._parseSitemapFromResponse(url, resp);
227+
await this.parseSitemapFromResponse(url, resp);
228+
229+
await this.checkIfDone();
230+
}
231+
232+
private async parseSitemapFromResponse(url: string, resp: Response) {
233+
let resolve: () => void;
234+
let reject: () => void;
235+
236+
const promise = new Promise<void>((res, rej) => {
237+
resolve = res;
238+
reject = rej;
239+
});
240+
241+
this.pending++;
242+
243+
try {
244+
this.doParseSitemapFromResponse(url, resp, resolve!, reject!);
245+
246+
await promise;
247+
} finally {
248+
this.pending--;
249+
}
250+
}
251+
252+
async checkIfDone() {
253+
if (!this.pending) {
254+
// this needs to happen async since if its in the current task,
255+
// queue won't be idle until this completes
256+
setTimeout(async () => {
257+
await this.queue.onIdle();
258+
this.emit("end");
259+
}, 100);
260+
}
229261
}
230262

231-
private _parseSitemapFromResponse(url: string, resp: Response) {
263+
private doParseSitemapFromResponse(
264+
url: string,
265+
resp: Response,
266+
resolve: () => void,
267+
reject: () => void,
268+
) {
232269
let stream;
233270

234271
const { body } = resp;
235272
if (!body) {
236-
void this.closeSitemap(url);
237-
throw new Error("missing response body");
273+
logger.warn("Sitemap missing response body", {}, "sitemap");
274+
reject();
275+
return;
238276
}
239277
// decompress .gz sitemaps
240278
// if content-encoding is gzip, then likely already being decompressed by fetch api
@@ -255,23 +293,18 @@ export class SitemapReader extends EventEmitter {
255293

256294
readableNodeStream.on("error", (e: Error) => {
257295
logger.warn("Error parsing sitemap", formatErr(e), "sitemap");
258-
void this.closeSitemap(url);
296+
reject();
259297
});
260298

261-
this.initSaxParser(url, readableNodeStream);
262-
}
263-
264-
private async closeSitemap(url: string) {
265-
this.pending.delete(url);
266-
if (!this.pending.size) {
267-
await this.queue.onIdle();
268-
this.emit("end");
269-
}
299+
this.initSaxParser(url, readableNodeStream, resolve, reject);
270300
}
271301

272-
initSaxParser(url: string, sourceStream: Readable) {
273-
this.pending.add(url);
274-
302+
private initSaxParser(
303+
url: string,
304+
sourceStream: Readable,
305+
resolve: () => void,
306+
reject: () => void,
307+
) {
275308
const parserStream = sax.createStream(false, {
276309
trim: true,
277310
normalize: true,
@@ -315,7 +348,7 @@ export class SitemapReader extends EventEmitter {
315348
}
316349
};
317350

318-
parserStream.on("end", () => this.closeSitemap(url));
351+
parserStream.on("end", () => resolve());
319352

320353
parserStream.on("opentag", (node: sax.Tag) => {
321354
switch (node.name) {
@@ -398,19 +431,23 @@ export class SitemapReader extends EventEmitter {
398431
parserStream.on("cdata", (text: string) => processText(text));
399432

400433
parserStream.on("error", (err: Error) => {
434+
const msg = { url, err, errCount };
401435
if (this.atLimit()) {
402-
this.pending.delete(url);
403-
return;
404-
}
405-
logger.warn(
406-
"Sitemap error parsing XML",
407-
{ url, err, errCount },
408-
"sitemap",
409-
);
410-
if (errCount++ < 3) {
411-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
412-
(parserStream._parser as any).error = null;
413-
parserStream._parser.resume();
436+
logger.warn(
437+
"Sitemap parsing aborting, page limit reached",
438+
msg,
439+
"sitemap",
440+
);
441+
resolve();
442+
} else {
443+
logger.warn("Sitemap error parsing XML", msg, "sitemap");
444+
if (errCount++ < 3) {
445+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
446+
(parserStream._parser as any).error = null;
447+
parserStream._parser.resume();
448+
} else {
449+
reject();
450+
}
414451
}
415452
});
416453

@@ -488,4 +525,8 @@ export class SitemapReader extends EventEmitter {
488525
getSitemapsQueued() {
489526
return this.queue.size;
490527
}
528+
529+
getNumPending() {
530+
return this.pending;
531+
}
491532
}

0 commit comments

Comments
 (0)