Commit 850a6a6

Don't remove excluded-on-redirect URLs from seen list (#936)

Fixes #937:

- Don't remove URLs from the seen list.
- Add a new excluded key; URLs excluded on redirect (out-of-scope redirect targets) are added to this excluded set. The size of the set gives the number of URLs excluded this way, and is used to compute the number of discovered URLs.
- Don't write urn:pageinfo records for excluded pages, along with not writing them to pages/extraPages.jsonl.

1 parent 4a703cd · commit 850a6a6
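The core accounting change: the number of discovered URLs is now derived from three Redis keys rather than from queue size plus pending/done/failed counts. A minimal sketch with ioredis (key names are illustrative; the real implementation in src/util/state.ts below performs the same subtraction in a single Lua script):

```typescript
import Redis from "ioredis";

// Discovered URLs = everything ever queued (the seen set), minus seeds added
// via redirect (a list), minus URLs excluded on redirect (the new set).
async function numFoundSketch(redis: Redis, prefix: string): Promise<number> {
  const seen = await redis.scard(`${prefix}:seen`); // assumed key name
  const extraSeeds = await redis.llen(`${prefix}:extraSeeds`);
  const excluded = await redis.scard(`${prefix}:excluded`);
  return seen - extraSeeds - excluded;
}
```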

File tree: 4 files changed, +79 −8 lines

src/crawler.ts (2 additions, 2 deletions)

```diff
@@ -2076,12 +2076,11 @@ self.__bx_behaviors.selectMainBehavior();
       return;
     }

-    const realSize = await this.crawlState.queueSize();
     const pendingPages = await this.crawlState.getPendingList();
     const pending = pendingPages.length;
     const crawled = await this.crawlState.numDone();
     const failed = await this.crawlState.numFailed();
-    const total = realSize + pendingPages.length + crawled + failed;
+    const total = await this.crawlState.numFound();
     const limit = { max: this.pageLimit || 0, hit: this.limitHit };
     const stats = {
       crawled,
@@ -2219,6 +2218,7 @@ self.__bx_behaviors.selectMainBehavior();
         // excluded in recorder
         data.pageSkipped = true;
         logger.warn("Page Load Blocked, skipping", { msg, loadState });
+        throw new Error("logged");
       } else {
         return this.pageFailed("Page Load Failed", retry, {
           msg,
```
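The new throw new Error("logged") acts as a sentinel: this branch has already emitted its warning, so the error only needs to abort page processing without producing a second log entry upstream. A hedged sketch of the pattern (function names are illustrative, not the crawler's actual call chain):

```typescript
// Sketch of the "already logged" sentinel pattern, under assumed names.
async function loadPageSketch(url: string, blocked: boolean): Promise<void> {
  if (blocked) {
    console.warn("Page Load Blocked, skipping", { url });
    throw new Error("logged"); // marker: this failure is already reported
  }
  // ... normal page load ...
}

async function crawlOneSketch(url: string, blocked: boolean): Promise<void> {
  try {
    await loadPageSketch(url, blocked);
  } catch (e) {
    if ((e as Error).message !== "logged") {
      console.error("Page Load Failed", { url }); // only log unreported errors
    }
  }
}
```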

src/util/recorder.ts (11 additions, 0 deletions)

```diff
@@ -118,6 +118,7 @@ export class Recorder extends EventEmitter {
   pageInfo!: PageInfoRecord;
   mainFrameId: string | null = null;
   skipRangeUrls!: Map<string, number>;
+  skipPageInfo = false;

   swTargetId?: string | null;
   swFrameIds = new Set<string>();
@@ -743,6 +744,7 @@ export class Recorder extends EventEmitter {
     );

     if (errorReason) {
+      this.skipPageInfo = true;
       await cdp.send("Fetch.failRequest", {
         requestId,
         errorReason,
@@ -946,6 +948,7 @@ export class Recorder extends EventEmitter {
     this.pendingRequests = new Map();
     this.skipIds = new Set();
     this.skipRangeUrls = new Map<string, number>();
+    this.skipPageInfo = false;
     this.pageFinished = false;
     this.pageInfo = {
       pageid,
@@ -974,6 +977,14 @@ export class Recorder extends EventEmitter {
   }

   writePageInfoRecord() {
+    if (this.skipPageInfo) {
+      logger.debug(
+        "Skipping writing pageinfo for blocked page",
+        { url: "urn:pageinfo:" + this.pageUrl },
+        "recorder",
+      );
+      return;
+    }
     const text = JSON.stringify(this.pageInfo, null, 2);

     const url = this.pageUrl;
```
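Together the four recorder hunks implement a simple per-page flag: reset when a new page starts, set when a request is failed for an exclusion reason, and checked before the urn:pageinfo record is written. A condensed sketch of that lifecycle (method names other than writePageInfoRecord are illustrative):

```typescript
// Minimal lifecycle sketch of the skipPageInfo flag, under assumed names.
class RecorderSketch {
  skipPageInfo = false;
  pageUrl = "";

  startPage(url: string): void {
    this.pageUrl = url;
    this.skipPageInfo = false; // fresh state for every page
  }

  onBlockedRequest(): void {
    this.skipPageInfo = true; // e.g. the page redirected out of scope
  }

  writePageInfoRecord(): void {
    if (this.skipPageInfo) {
      return; // excluded pages get no urn:pageinfo record
    }
    // ... serialize pageInfo and write the WARC record ...
  }
}
```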

src/util/state.ts (40 additions, 5 deletions)

```diff
@@ -130,11 +130,18 @@ export class PageState {
 // ============================================================================
 declare module "ioredis" {
   interface RedisCommander<Context> {
+    numfound(
+      skey: string,
+      esKey: string,
+      exKey: string,
+    ): Result<number, Context>;
+
     addqueue(
       pkey: string,
       qkey: string,
       skey: string,
       esKey: string,
+      exKey: string,
       url: string,
       score: number,
       data: string,
@@ -203,6 +210,7 @@ export type SaveState = {
   errors: string[];
   extraSeeds: string[];
   sitemapDone: boolean;
+  excluded?: string[];
 };

 // ============================================================================
@@ -228,6 +236,8 @@ export class RedisCrawlState {
   esKey: string;
   esMap: string;

+  exKey: string;
+
   sitemapDoneKey: string;

   waczFilename: string | null = null;
@@ -267,16 +277,27 @@
     this.esKey = this.key + ":extraSeeds";
     this.esMap = this.key + ":esMap";

+    // stores URLs that have been seen but excluded
+    // (eg. redirect-to-excluded or trimmed)
+    this.exKey = this.key + ":excluded";
+
     this.sitemapDoneKey = this.key + ":sitemapDone";

     this._initLuaCommands(this.redis);
   }

   _initLuaCommands(redis: Redis) {
+    redis.defineCommand("numfound", {
+      numberOfKeys: 3,
+      lua: `
+return redis.call('scard', KEYS[1]) - redis.call('llen', KEYS[2]) - redis.call('scard', KEYS[3]);
+`,
+    });
+
     redis.defineCommand("addqueue", {
-      numberOfKeys: 4,
+      numberOfKeys: 5,
       lua: `
-local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]);
+local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]) - redis.call('scard', KEYS[5]);
 local limit = tonumber(ARGV[4]);
 if limit > 0 and size >= limit then
   return 1;
```
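The effect of the extra KEYS[5] term is that excluded URLs no longer count against the page limit when deciding whether a new URL may be queued. A worked example with illustrative numbers:

```typescript
// Worked example of the updated addqueue size check (values illustrative).
const seen = 12;      // scard(skey): every URL ever queued
const extraSeeds = 1; // llen(esKey): seeds added via redirect (a list)
const excluded = 3;   // scard(exKey): excluded-on-redirect or trimmed URLs
const size = seen - extraSeeds - excluded; // 8

const limit = 10; // tonumber(ARGV[4])
const rejected = limit > 0 && size >= limit; // false: the URL is still queued
```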
```diff
@@ -303,7 +324,7 @@ return 0;
 if json then
   local data = cjson.decode(json);
   redis.call('hdel', KEYS[2], data.url);
-  redis.call('srem', KEYS[3], data.url);
+  redis.call('sadd', KEYS[3], data.url);
 end
 return 1;
 `,
```
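Each of these scripts is registered through ioredis's defineCommand, which uploads the Lua and exposes it as a method on the client; the declare module augmentation shown earlier is what gives the call a typed signature. A self-contained sketch of that mechanism (key names are illustrative):

```typescript
import Redis, { Result } from "ioredis";

// Typed signature for the custom command, mirroring the diff's
// declare-module block.
declare module "ioredis" {
  interface RedisCommander<Context> {
    numfound(
      skey: string,
      esKey: string,
      exKey: string,
    ): Result<number, Context>;
  }
}

const redis = new Redis();

// Registering the script makes redis.numfound(...) callable; the three
// reads run atomically inside Redis.
redis.defineCommand("numfound", {
  numberOfKeys: 3,
  lua: `
return redis.call('scard', KEYS[1]) - redis.call('llen', KEYS[2]) - redis.call('scard', KEYS[3]);
`,
});

async function main(): Promise<void> {
  // seen - extraSeeds - excluded, computed server-side
  const found = await redis.numfound(
    "crawl:seen",
    "crawl:extraSeeds",
    "crawl:excluded",
  );
  console.log("discovered URLs:", found);
}

main().finally(() => redis.quit());
```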
```diff
@@ -464,7 +485,7 @@ return inx;
   async markExcluded(url: string) {
     await this.redis.hdel(this.pkey, url);

-    await this.redis.srem(this.skey, url);
+    await this.redis.sadd(this.exKey, url);
   }

   recheckScope(data: QueueEntry, seeds: ScopedSeed[]) {
```
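Both trimqueue and markExcluded now add to the excluded set rather than removing from the seen set, which is the heart of the fix: a URL that redirects out of scope stays "seen" and can never be requeued, but it is subtracted from the discovered count. A hypothetical caller-side flow (everything except markExcluded is illustrative):

```typescript
// Hypothetical redirect handling built on the state API above: the original
// URL is dropped from pending and recorded as excluded, but never removed
// from the seen set, so a later link to it cannot requeue it.
async function onRedirectSketch(
  state: RedisCrawlState,
  originalUrl: string,
  redirectedUrl: string,
  isInScope: (url: string) => boolean, // stand-in for the crawler's scope rules
): Promise<void> {
  if (!isInScope(redirectedUrl)) {
    await state.markExcluded(originalUrl);
  }
}
```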
```diff
@@ -486,6 +507,10 @@ return inx;
     );
   }

+  async numFound() {
+    return await this.redis.numfound(this.skey, this.esKey, this.exKey);
+  }
+
   async trimToLimit(limit: number) {
     if (limit === 0) {
       return;
@@ -501,7 +526,7 @@ return inx;
     const remain = Math.max(0, limit - totalComplete);
     // trim queue until size <= remain
     while (
-      (await this.redis.trimqueue(this.qkey, this.pkey, this.skey, remain)) ===
+      (await this.redis.trimqueue(this.qkey, this.pkey, this.exKey, remain)) ===
       1
     ) {
       /* ignore */
@@ -721,6 +746,7 @@ return inx;
       this.qkey,
       this.skey,
       this.esKey,
+      this.exKey,
       url,
       this._getScore(data),
       JSON.stringify(data),
@@ -763,8 +789,10 @@ return inx;
     const errors = await this.getErrorList();
     const extraSeeds = await this._iterListKeys(this.esKey, seen);
     const sitemapDone = await this.isSitemapDone();
+    const excludedSet = await this._iterSet(this.exKey);

     const finished = [...seen.values()];
+    const excluded = [...excludedSet.values()];

     return {
       extraSeeds,
@@ -774,6 +802,7 @@ return inx;
       sitemapDone,
       failed,
       errors,
+      excluded,
     };
   }
@@ -860,6 +889,7 @@ return inx;
     await this.redis.del(this.fkey);
     await this.redis.del(this.skey);
     await this.redis.del(this.ekey);
+    await this.redis.del(this.exKey);

     let seen: string[] = [];
@@ -955,6 +985,11 @@ return inx;
     }

     await this.redis.sadd(this.skey, seen);
+
+    if (state.excluded?.length) {
+      await this.redis.sadd(this.exKey, state.excluded);
+    }
+
     return seen.length;
   }
```
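Because excluded is optional on SaveState, save files written before this change still load cleanly, while new saves persist the set and restores repopulate it. A minimal round-trip sketch, assuming a JSON container purely for illustration (the crawler's actual save format is not shown in this diff):

```typescript
import { readFileSync, writeFileSync } from "node:fs";

// Illustrative subset of SaveState; only `excluded` is new (and optional).
type SaveStateSketch = {
  finished: string[];
  excluded?: string[];
};

function saveSketch(state: SaveStateSketch, path: string): void {
  writeFileSync(path, JSON.stringify(state, null, 2));
}

function loadSketch(path: string): SaveStateSketch {
  const state = JSON.parse(readFileSync(path, "utf8")) as SaveStateSketch;
  // Older saves predate the field; treat a missing list as empty.
  state.excluded ??= [];
  return state;
}
```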

tests/exclude-redirected.test.js (26 additions, 1 deletion)

```diff
@@ -6,7 +6,7 @@ import { execSync } from "child_process";

 test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
   execSync(
-    "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");
+    "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");

   // no entries besides header
   expect(
@@ -19,3 +19,28 @@ test("ensure exclusion is applied on redirected URL, which contains 'help', so i

 });

+
+test("ensure exclusion applied on redirect URL, and URL is not requeued again", () => {
+  execSync(
+    "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test-2 --extraHops 1 --url https://www.iana.org/domains/example --url https://example-com.webrecorder.net/page-2 --generateCDX");
+
+
+  // no entries besides header
+  expect(
+    fs
+      .readFileSync(
+        "test-crawls/collections/redir-exclude-test-2/pages/extraPages.jsonl",
+        "utf8",
+      ).trim().split("\n").length
+  ).toBe(1);
+
+
+  const data = fs.readFileSync(
+    "test-crawls/collections/redir-exclude-test-2/indexes/index.cdxj",
+    { encoding: "utf-8" },
+  );
+
+  // expect no urn:pageinfo records for excluded page
+  const first = data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`);
+  expect(first < 0).toBe(true);
+});
```
