From ac084d6fa7efd7de1786908454abad8a6eafdb49 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 6 Dec 2025 16:06:54 -0800 Subject: [PATCH 1/6] add exclude key to track URLs that have been excluded to avoid removing them from the seen list avoids requeuing URLs that are excluded on redirect --- src/util/state.ts | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/util/state.ts b/src/util/state.ts index bb973b11..24a6d404 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -120,6 +120,7 @@ declare module "ioredis" { qkey: string, skey: string, esKey: string, + exrKey: string, url: string, score: number, data: string, @@ -188,6 +189,7 @@ export type SaveState = { errors: string[]; extraSeeds: string[]; sitemapDone: boolean; + excluded?: string[]; }; // ============================================================================ @@ -213,6 +215,8 @@ export class RedisCrawlState { esKey: string; esMap: string; + exKey: string; + sitemapDoneKey: string; waczFilename: string | null = null; @@ -252,6 +256,10 @@ export class RedisCrawlState { this.esKey = this.key + ":extraSeeds"; this.esMap = this.key + ":esMap"; + // stores URLs that have been seen but excluded + // (eg. 
redirect-to-excluded or trimmed) + this.exKey = this.key + ":excluded"; + this.sitemapDoneKey = this.key + ":sitemapDone"; this._initLuaCommands(this.redis); @@ -259,9 +267,9 @@ export class RedisCrawlState { _initLuaCommands(redis: Redis) { redis.defineCommand("addqueue", { - numberOfKeys: 4, + numberOfKeys: 5, lua: ` -local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]); +local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]) - redis.call('scard', KEYS[5]); local limit = tonumber(ARGV[4]); if limit > 0 and size >= limit then return 1; @@ -288,7 +296,7 @@ return 0; if json then local data = cjson.decode(json); redis.call('hdel', KEYS[2], data.url); - redis.call('srem', KEYS[3], data.url); + redis.call('sadd', KEYS[3], data.url); end return 1; `, @@ -449,7 +457,7 @@ return inx; async markExcluded(url: string) { await this.redis.hdel(this.pkey, url); - await this.redis.srem(this.skey, url); + await this.redis.sadd(this.exKey, url); } recheckScope(data: QueueEntry, seeds: ScopedSeed[]) { @@ -486,7 +494,7 @@ return inx; const remain = Math.max(0, limit - totalComplete); // trim queue until size <= remain while ( - (await this.redis.trimqueue(this.qkey, this.pkey, this.skey, remain)) === + (await this.redis.trimqueue(this.qkey, this.pkey, this.exKey, remain)) === 1 ) { /* ignore */ @@ -706,6 +714,7 @@ return inx; this.qkey, this.skey, this.esKey, + this.exKey, url, this._getScore(data), JSON.stringify(data), @@ -748,8 +757,10 @@ return inx; const errors = await this.getErrorList(); const extraSeeds = await this._iterListKeys(this.esKey, seen); const sitemapDone = await this.isSitemapDone(); + const excludedSet = await this._iterSet(this.exKey); const finished = [...seen.values()]; + const excluded = [...excludedSet.values()]; return { extraSeeds, @@ -759,6 +770,7 @@ return inx; sitemapDone, failed, errors, + excluded, }; } @@ -845,6 +857,7 @@ return inx; await this.redis.del(this.fkey); await this.redis.del(this.skey); await 
this.redis.del(this.ekey); + await this.redis.del(this.exKey); let seen: string[] = []; @@ -940,6 +953,11 @@ return inx; } await this.redis.sadd(this.skey, seen); + + if (state.excluded?.length) { + await this.redis.sadd(this.exKey, state.excluded); + } + return seen.length; } From fa786f8f358861893e39b067417ecac21d7a1ee3 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 6 Dec 2025 16:13:50 -0800 Subject: [PATCH 2/6] interrupt earlier on skipped page --- src/crawler.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/crawler.ts b/src/crawler.ts index 6a0f3f71..32bb535e 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -2219,6 +2219,7 @@ self.__bx_behaviors.selectMainBehavior(); // excluded in recorder data.pageSkipped = true; logger.warn("Page Load Blocked, skipping", { msg, loadState }); + throw new Error("logged"); } else { return this.pageFailed("Page Load Failed", retry, { msg, From ba4471f25b7acb70fa2ae9b3ad02b02a1b818b57 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 6 Dec 2025 16:46:30 -0800 Subject: [PATCH 3/6] tests: update exclude redirect test to test that extra urn:pageinfo records are not written for excluded-on-redirect page --- tests/exclude-redirected.test.js | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tests/exclude-redirected.test.js b/tests/exclude-redirected.test.js index b81a0ef8..db33cadf 100644 --- a/tests/exclude-redirected.test.js +++ b/tests/exclude-redirected.test.js @@ -6,7 +6,7 @@ import { execSync } from "child_process"; test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => { execSync( - "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude 
help --collection redir-exclude-test --extraHops 1"); // no entries besides header expect( @@ -19,3 +19,32 @@ test("ensure exclusion is applied on redirected URL, which contains 'help', so i }); + +test("ensure exclusion applied on redirect URL, and URL is not requeued again", () => { + execSync( + "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test-2 --extraHops 1 --url https://www.iana.org/domains/example --url https://example-com.webrecorder.net/page-2 --generateCDX"); + + + // no entries besides header + expect( + fs + .readFileSync( + "test-crawls/collections/redir-exclude-test-2/pages/extraPages.jsonl", + "utf8", + ).trim().split("\n").length + ).toBe(1); + + + const data = fs.readFileSync( + "test-crawls/collections/redir-exclude-test-2/indexes/index.cdxj", + { encoding: "utf-8" }, + ); + + // expect one occurence + const first = data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`); + expect(first > 0).toBe(true); + + // expect no other occurences + expect(data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`, first + 1)).toBe(-1); + +}); From 804becc5b666381e91218faf471a6ef557adb055 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 7 Dec 2025 19:48:12 -0800 Subject: [PATCH 4/6] for consistency, since excluded page isn't written to pages.jsonl, there should also be no urn:pageinfo record added for excluded pages --- src/util/recorder.ts | 11 +++++++++++ tests/exclude-redirected.test.js | 8 ++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index ccd2c17d..c28cbbcb 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -118,6 +118,7 @@ export class Recorder extends EventEmitter { pageInfo!: PageInfoRecord; mainFrameId: string | null = null; skipRangeUrls!: Map; + skipPageInfo = false; swTargetId?: string | null; swFrameIds = new Set(); 
@@ -743,6 +744,7 @@ export class Recorder extends EventEmitter { ); if (errorReason) { + this.skipPageInfo = true; await cdp.send("Fetch.failRequest", { requestId, errorReason, @@ -946,6 +948,7 @@ export class Recorder extends EventEmitter { this.pendingRequests = new Map(); this.skipIds = new Set(); this.skipRangeUrls = new Map(); + this.skipPageInfo = false; this.pageFinished = false; this.pageInfo = { pageid, @@ -974,6 +977,14 @@ export class Recorder extends EventEmitter { } writePageInfoRecord() { + if (this.skipPageInfo) { + logger.debug( + "Skipping writing pageinfo for blocked page", + { url: "urn:pageinfo:" + this.pageUrl }, + "recorder", + ); + return; + } const text = JSON.stringify(this.pageInfo, null, 2); const url = this.pageUrl; diff --git a/tests/exclude-redirected.test.js b/tests/exclude-redirected.test.js index db33cadf..0802d6c6 100644 --- a/tests/exclude-redirected.test.js +++ b/tests/exclude-redirected.test.js @@ -40,11 +40,7 @@ test("ensure exclusion applied on redirect URL, and URL is not requeued again", { encoding: "utf-8" }, ); - // expect one occurence + // expect no urn:pageinfo records for excluded page const first = data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`); - expect(first > 0).toBe(true); - - // expect no other occurences - expect(data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`, first + 1)).toBe(-1); - + expect(first < 0).toBe(true); }); From 9a81bfc1338fdd34d2da3f02178571c75f9092ac Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 8 Dec 2025 15:00:48 -0800 Subject: [PATCH 5/6] typo fix --- src/util/state.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/state.ts b/src/util/state.ts index 24a6d404..d9aaa416 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -120,7 +120,7 @@ declare module "ioredis" { qkey: string, skey: string, esKey: string, - exrKey: string, + exKey: string, url: string, score: number, data: string, From 
b61781afc4fb4a93a6d0e4496a2cf439f0a9a891 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 8 Dec 2025 16:48:06 -0800 Subject: [PATCH 6/6] add numFound() command to get num found pages, subtracting extra seeds + excluded pages --- src/crawler.ts | 3 +-- src/util/state.ts | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 32bb535e..eda53510 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -2076,12 +2076,11 @@ self.__bx_behaviors.selectMainBehavior(); return; } - const realSize = await this.crawlState.queueSize(); const pendingPages = await this.crawlState.getPendingList(); const pending = pendingPages.length; const crawled = await this.crawlState.numDone(); const failed = await this.crawlState.numFailed(); - const total = realSize + pendingPages.length + crawled + failed; + const total = await this.crawlState.numFound(); const limit = { max: this.pageLimit || 0, hit: this.limitHit }; const stats = { crawled, diff --git a/src/util/state.ts b/src/util/state.ts index d9aaa416..54c7cc5a 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -115,6 +115,12 @@ export class PageState { // ============================================================================ declare module "ioredis" { interface RedisCommander { + numfound( + skey: string, + esKey: string, + exKey: string, + ): Result; + addqueue( pkey: string, qkey: string, @@ -266,6 +272,13 @@ export class RedisCrawlState { } _initLuaCommands(redis: Redis) { + redis.defineCommand("numfound", { + numberOfKeys: 3, + lua: ` +return redis.call('scard', KEYS[1]) - redis.call('llen', KEYS[2]) - redis.call('scard', KEYS[3]); +`, + }); + redis.defineCommand("addqueue", { numberOfKeys: 5, lua: ` @@ -479,6 +492,10 @@ return inx; ); } + async numFound() { + return await this.redis.numfound(this.skey, this.esKey, this.exKey); + } + async trimToLimit(limit: number) { if (limit === 0) { return;