diff --git a/src/crawler.ts b/src/crawler.ts
index 6a0f3f71..eda53510 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -2076,12 +2076,11 @@ self.__bx_behaviors.selectMainBehavior();
       return;
     }
 
-    const realSize = await this.crawlState.queueSize();
     const pendingPages = await this.crawlState.getPendingList();
     const pending = pendingPages.length;
     const crawled = await this.crawlState.numDone();
     const failed = await this.crawlState.numFailed();
-    const total = realSize + pendingPages.length + crawled + failed;
+    const total = await this.crawlState.numFound();
     const limit = { max: this.pageLimit || 0, hit: this.limitHit };
     const stats = {
       crawled,
@@ -2219,6 +2218,7 @@ self.__bx_behaviors.selectMainBehavior();
         // excluded in recorder
         data.pageSkipped = true;
         logger.warn("Page Load Blocked, skipping", { msg, loadState });
+        throw new Error("logged");
       } else {
         return this.pageFailed("Page Load Failed", retry, {
           msg,
diff --git a/src/util/recorder.ts b/src/util/recorder.ts
index ccd2c17d..c28cbbcb 100644
--- a/src/util/recorder.ts
+++ b/src/util/recorder.ts
@@ -118,6 +118,7 @@ export class Recorder extends EventEmitter {
   pageInfo!: PageInfoRecord;
   mainFrameId: string | null = null;
   skipRangeUrls!: Map<string, number>;
+  skipPageInfo = false;
 
   swTargetId?: string | null;
   swFrameIds = new Set();
@@ -743,6 +744,7 @@ export class Recorder extends EventEmitter {
     );
 
     if (errorReason) {
+      this.skipPageInfo = true;
       await cdp.send("Fetch.failRequest", {
         requestId,
         errorReason,
@@ -946,6 +948,7 @@ export class Recorder extends EventEmitter {
     this.pendingRequests = new Map();
     this.skipIds = new Set();
     this.skipRangeUrls = new Map();
+    this.skipPageInfo = false;
     this.pageFinished = false;
     this.pageInfo = {
       pageid,
@@ -974,6 +977,14 @@ export class Recorder extends EventEmitter {
   }
 
   writePageInfoRecord() {
+    if (this.skipPageInfo) {
+      logger.debug(
+        "Skipping writing pageinfo for blocked page",
+        { url: "urn:pageinfo:" + this.pageUrl },
+        "recorder",
+      );
+      return;
+    }
     const text = JSON.stringify(this.pageInfo, null, 2);
     const url = this.pageUrl;
 
diff --git a/src/util/state.ts b/src/util/state.ts
index bb973b11..54c7cc5a 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -115,11 +115,18 @@ export class PageState {
 // ============================================================================
 declare module "ioredis" {
   interface RedisCommander<Context> {
+    numfound(
+      skey: string,
+      esKey: string,
+      exKey: string,
+    ): Result<number, Context>;
+
     addqueue(
       pkey: string,
       qkey: string,
       skey: string,
       esKey: string,
+      exKey: string,
       url: string,
       score: number,
       data: string,
@@ -188,6 +195,7 @@ export type SaveState = {
   errors: string[];
   extraSeeds: string[];
   sitemapDone: boolean;
+  excluded?: string[];
 };
 
 // ============================================================================
@@ -213,6 +221,8 @@ export class RedisCrawlState {
   esKey: string;
   esMap: string;
 
+  exKey: string;
+
   sitemapDoneKey: string;
 
   waczFilename: string | null = null;
@@ -252,16 +262,27 @@ export class RedisCrawlState {
     this.esKey = this.key + ":extraSeeds";
     this.esMap = this.key + ":esMap";
 
+    // stores URLs that have been seen but excluded
+    // (e.g. redirected to an excluded URL, or trimmed to stay within the page limit)
+    this.exKey = this.key + ":excluded";
+
     this.sitemapDoneKey = this.key + ":sitemapDone";
 
     this._initLuaCommands(this.redis);
   }
 
   _initLuaCommands(redis: Redis) {
+    redis.defineCommand("numfound", {
+      numberOfKeys: 3,
+      lua: `
+return redis.call('scard', KEYS[1]) - redis.call('llen', KEYS[2]) - redis.call('scard', KEYS[3]);
+`,
+    });
+
     redis.defineCommand("addqueue", {
-      numberOfKeys: 4,
+      numberOfKeys: 5,
       lua: `
-local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]);
+local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]) - redis.call('scard', KEYS[5]);
 local limit = tonumber(ARGV[4]);
 if limit > 0 and size >= limit then
   return 1;
@@ -288,7 +309,7 @@ return 0;
 if json then
   local data = cjson.decode(json);
   redis.call('hdel', KEYS[2], data.url);
-  redis.call('srem', KEYS[3], data.url);
+  redis.call('sadd', KEYS[3], data.url);
 end
 return 1;
 `,
@@ -449,7 +470,7 @@ return inx;
 
   async markExcluded(url: string) {
     await this.redis.hdel(this.pkey, url);
-    await this.redis.srem(this.skey, url);
+    await this.redis.sadd(this.exKey, url);
   }
 
   recheckScope(data: QueueEntry, seeds: ScopedSeed[]) {
@@ -471,6 +492,10 @@ return inx;
     );
   }
+  async numFound() {
+    return await this.redis.numfound(this.skey, this.esKey, this.exKey);
+  }
+
   async trimToLimit(limit: number) {
     if (limit === 0) {
       return;
     }
@@ -486,7 +511,7 @@ return inx;
     const remain = Math.max(0, limit - totalComplete);
     // trim queue until size <= remain
     while (
-      (await this.redis.trimqueue(this.qkey, this.pkey, this.skey, remain)) ===
+      (await this.redis.trimqueue(this.qkey, this.pkey, this.exKey, remain)) ===
       1
     ) {
       /* ignore */
@@ -706,6 +731,7 @@ return inx;
       this.qkey,
       this.skey,
       this.esKey,
+      this.exKey,
       url,
       this._getScore(data),
       JSON.stringify(data),
@@ -748,8 +774,10 @@ return inx;
     const errors = await this.getErrorList();
     const extraSeeds = await this._iterListKeys(this.esKey, seen);
     const sitemapDone = await this.isSitemapDone();
+    const excludedSet = await this._iterSet(this.exKey);
 
     const finished = [...seen.values()];
+    const excluded = [...excludedSet.values()];
 
     return {
       extraSeeds,
@@ -759,6 +787,7 @@ return inx;
       sitemapDone,
       failed,
       errors,
+      excluded,
     };
   }
 
@@ -845,6 +874,7 @@ return inx;
     await this.redis.del(this.fkey);
     await this.redis.del(this.skey);
     await this.redis.del(this.ekey);
+    await this.redis.del(this.exKey);
 
     let seen: string[] = [];
 
@@ -940,6 +970,11 @@ return inx;
     }
 
     await this.redis.sadd(this.skey, seen);
+
+    if (state.excluded?.length) {
+      await this.redis.sadd(this.exKey, state.excluded);
+    }
+
     return seen.length;
   }
 
diff --git a/tests/exclude-redirected.test.js b/tests/exclude-redirected.test.js
index b81a0ef8..0802d6c6 100644
--- a/tests/exclude-redirected.test.js
+++ b/tests/exclude-redirected.test.js
@@ -6,7 +6,7 @@ import { execSync } from "child_process";
 test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
   execSync(
-    "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");
+    "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");
 
   // no entries besides header
   expect(
     fs
@@ -19,3 +19,28 @@ test("ensure exclusion is applied on redirected URL, which contains 'help', so i
 });
 
 
+
+test("ensure exclusion is applied on redirected URL, and the URL is not requeued again", () => {
+  execSync(
+    "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test-2 --extraHops 1 --url https://www.iana.org/domains/example --url https://example-com.webrecorder.net/page-2 --generateCDX");
+
+
+  // no entries besides header
+  expect(
+    fs
+      .readFileSync(
+        "test-crawls/collections/redir-exclude-test-2/pages/extraPages.jsonl",
+        "utf8",
+      ).trim().split("\n").length
+  ).toBe(1);
+
+
+  const data = fs.readFileSync(
+    "test-crawls/collections/redir-exclude-test-2/indexes/index.cdxj",
+    { encoding: "utf-8" },
+  );
+
+  // expect no urn:pageinfo records for excluded page
+  const first = data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`);
+  expect(first < 0).toBe(true);
+});