4 changes: 2 additions & 2 deletions src/crawler.ts
@@ -2076,12 +2076,11 @@ self.__bx_behaviors.selectMainBehavior();
return;
}

-const realSize = await this.crawlState.queueSize();
const pendingPages = await this.crawlState.getPendingList();
const pending = pendingPages.length;
const crawled = await this.crawlState.numDone();
const failed = await this.crawlState.numFailed();
-const total = realSize + pendingPages.length + crawled + failed;
+const total = await this.crawlState.numFound();
const limit = { max: this.pageLimit || 0, hit: this.limitHit };
const stats = {
crawled,
@@ -2219,6 +2218,7 @@ self.__bx_behaviors.selectMainBehavior();
// excluded in recorder
data.pageSkipped = true;
logger.warn("Page Load Blocked, skipping", { msg, loadState });
+throw new Error("logged");
} else {
return this.pageFailed("Page Load Failed", retry, {
msg,
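Reviewer note (not part of the diff): the stats block now takes `total` from a single `crawlState.numFound()` call instead of re-deriving it from `queueSize()` plus pending, crawled, and failed counts read at slightly different moments, and a blocked page now throws a sentinel `Error("logged")` so the caller knows the failure was already logged and does not also record it via `pageFailed()`. A minimal sketch of the new stats assembly, assuming a hypothetical `CrawlStateLike` interface standing in for `this.crawlState`:

```ts
// Illustrative sketch only - not the crawler's actual code.
interface CrawlStateLike {
  getPendingList(): Promise<unknown[]>;
  numDone(): Promise<number>;
  numFailed(): Promise<number>;
  numFound(): Promise<number>;
}

async function buildStats(state: CrawlStateLike, pageLimit = 0, limitHit = false) {
  const pending = (await state.getPendingList()).length;
  const crawled = await state.numDone();
  const failed = await state.numFailed();
  // One Redis-side count (seen - extraSeeds - excluded) replaces the old
  // queueSize + pending + crawled + failed sum.
  const total = await state.numFound();
  return { crawled, total, pending, failed, limit: { max: pageLimit, hit: limitHit } };
}
```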
11 changes: 11 additions & 0 deletions src/util/recorder.ts
@@ -118,6 +118,7 @@ export class Recorder extends EventEmitter {
pageInfo!: PageInfoRecord;
mainFrameId: string | null = null;
skipRangeUrls!: Map<string, number>;
skipPageInfo = false;

swTargetId?: string | null;
swFrameIds = new Set<string>();
@@ -743,6 +744,7 @@ export class Recorder extends EventEmitter {
);

if (errorReason) {
this.skipPageInfo = true;
await cdp.send("Fetch.failRequest", {
requestId,
errorReason,
@@ -946,6 +948,7 @@ export class Recorder extends EventEmitter {
this.pendingRequests = new Map();
this.skipIds = new Set();
this.skipRangeUrls = new Map<string, number>();
this.skipPageInfo = false;
this.pageFinished = false;
this.pageInfo = {
pageid,
@@ -974,6 +977,14 @@ export class Recorder extends EventEmitter {
}

writePageInfoRecord() {
if (this.skipPageInfo) {
logger.debug(
"Skipping writing pageinfo for blocked page",
{ url: "urn:pageinfo:" + this.pageUrl },
"recorder",
);
return;
}
const text = JSON.stringify(this.pageInfo, null, 2);

const url = this.pageUrl;
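Reviewer note (not part of the diff): `skipPageInfo` is reset at the start of each page, set when an intercepted request is rejected via `Fetch.failRequest` (the path taken when a page's load is blocked by an exclusion), and checked in `writePageInfoRecord()`, so a blocked page leaves no `urn:pageinfo:` record behind. A minimal sketch of that lifecycle, with hypothetical names standing in for the Recorder internals:

```ts
// Illustrative sketch only - the real logic lives in Recorder (src/util/recorder.ts).
class PageInfoGate {
  private skipPageInfo = false;

  startPage(): void {
    this.skipPageInfo = false; // new page: allow the record again
  }

  onRequestBlocked(): void {
    this.skipPageInfo = true; // the document was blocked, so its pageinfo would be misleading
  }

  writePageInfo(pageUrl: string, info: object, write: (url: string, body: string) => void): void {
    if (this.skipPageInfo) {
      console.debug("Skipping writing pageinfo for blocked page", "urn:pageinfo:" + pageUrl);
      return;
    }
    write("urn:pageinfo:" + pageUrl, JSON.stringify(info, null, 2));
  }
}
```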
45 changes: 40 additions & 5 deletions src/util/state.ts
@@ -115,11 +115,18 @@ export class PageState {
// ============================================================================
declare module "ioredis" {
interface RedisCommander<Context> {
numfound(
skey: string,
esKey: string,
exKey: string,
): Result<number, Context>;

addqueue(
pkey: string,
qkey: string,
skey: string,
esKey: string,
exKey: string,
url: string,
score: number,
data: string,
@@ -188,6 +195,7 @@ export type SaveState = {
errors: string[];
extraSeeds: string[];
sitemapDone: boolean;
excluded?: string[];
};

// ============================================================================
@@ -213,6 +221,8 @@ export class RedisCrawlState {
esKey: string;
esMap: string;

exKey: string;

sitemapDoneKey: string;

waczFilename: string | null = null;
@@ -252,16 +262,27 @@ export class RedisCrawlState {
this.esKey = this.key + ":extraSeeds";
this.esMap = this.key + ":esMap";

// stores URLs that have been seen but excluded
// (eg. redirect-to-excluded or trimmed)
this.exKey = this.key + ":excluded";

this.sitemapDoneKey = this.key + ":sitemapDone";

this._initLuaCommands(this.redis);
}

_initLuaCommands(redis: Redis) {
redis.defineCommand("numfound", {
numberOfKeys: 3,
lua: `
return redis.call('scard', KEYS[1]) - redis.call('llen', KEYS[2]) - redis.call('scard', KEYS[3]);
`,
});

redis.defineCommand("addqueue", {
-numberOfKeys: 4,
+numberOfKeys: 5,
lua: `
-local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]);
+local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]) - redis.call('scard', KEYS[5]);
local limit = tonumber(ARGV[4]);
if limit > 0 and size >= limit then
return 1;
@@ -288,7 +309,7 @@
if json then
local data = cjson.decode(json);
redis.call('hdel', KEYS[2], data.url);
-redis.call('srem', KEYS[3], data.url);
+redis.call('sadd', KEYS[3], data.url);
end
return 1;
`,
@@ -449,7 +470,7 @@
async markExcluded(url: string) {
await this.redis.hdel(this.pkey, url);

-await this.redis.srem(this.skey, url);
+await this.redis.sadd(this.exKey, url);
}

recheckScope(data: QueueEntry, seeds: ScopedSeed[]) {
@@ -471,6 +492,10 @@
);
}

async numFound() {
return await this.redis.numfound(this.skey, this.esKey, this.exKey);
}

async trimToLimit(limit: number) {
if (limit === 0) {
return;
@@ -486,7 +511,7 @@
const remain = Math.max(0, limit - totalComplete);
// trim queue until size <= remain
while (
-(await this.redis.trimqueue(this.qkey, this.pkey, this.skey, remain)) ===
+(await this.redis.trimqueue(this.qkey, this.pkey, this.exKey, remain)) ===
1
) {
/* ignore */
@@ -706,6 +731,7 @@
this.qkey,
this.skey,
this.esKey,
this.exKey,
url,
this._getScore(data),
JSON.stringify(data),
@@ -748,8 +774,10 @@
const errors = await this.getErrorList();
const extraSeeds = await this._iterListKeys(this.esKey, seen);
const sitemapDone = await this.isSitemapDone();
const excludedSet = await this._iterSet(this.exKey);

const finished = [...seen.values()];
const excluded = [...excludedSet.values()];

return {
extraSeeds,
@@ -759,6 +787,7 @@
sitemapDone,
failed,
errors,
excluded,
};
}

@@ -845,6 +874,7 @@
await this.redis.del(this.fkey);
await this.redis.del(this.skey);
await this.redis.del(this.ekey);
await this.redis.del(this.exKey);

let seen: string[] = [];

@@ -940,6 +970,11 @@
}

await this.redis.sadd(this.skey, seen);

if (state.excluded?.length) {
await this.redis.sadd(this.exKey, state.excluded);
}

return seen.length;
}

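Reviewer note (not part of the diff): excluded URLs are no longer removed from the seen set; they stay there and are additionally recorded in the new `:excluded` set (`exKey`). That way a URL whose redirect target is excluded is still treated as seen and is not requeued when another page links to it, while the new `numfound` script subtracts excluded entries (and extra seeds) so the found/total counts and the `addqueue` size limit stay accurate. A rough, non-atomic equivalent of the `numfound` Lua script using plain ioredis calls (the Lua version does all three reads in one round trip):

```ts
// Illustrative sketch only; the real implementation is the "numfound" Lua script above.
import Redis from "ioredis";

async function numFound(
  redis: Redis,
  skey: string,  // set of every URL seen/queued so far
  esKey: string, // list of extra seeds
  exKey: string, // set of URLs seen but excluded (redirect-to-excluded or trimmed)
): Promise<number> {
  const seen = await redis.scard(skey);
  const extraSeeds = await redis.llen(esKey);
  const excluded = await redis.scard(exKey);
  return seen - extraSeeds - excluded;
}
```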
27 changes: 26 additions & 1 deletion tests/exclude-redirected.test.js
@@ -6,7 +6,7 @@ import { execSync } from "child_process";

test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
execSync(
"docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");
"docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");

// no entries besides header
expect(
@@ -19,3 +19,28 @@

});


test("ensure exclusion applied on redirect URL, and URL is not requeued again", () => {
execSync(
"docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test-2 --extraHops 1 --url https://www.iana.org/domains/example --url https://example-com.webrecorder.net/page-2 --generateCDX");


// no entries besides header
expect(
fs
.readFileSync(
"test-crawls/collections/redir-exclude-test-2/pages/extraPages.jsonl",
"utf8",
).trim().split("\n").length
).toBe(1);


const data = fs.readFileSync(
"test-crawls/collections/redir-exclude-test-2/indexes/index.cdxj",
{ encoding: "utf-8" },
);

// expect no urn:pageinfo records for excluded page
const first = data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`);
expect(first < 0).toBe(true);
});
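Reviewer note (not part of the diff): the second test drives the same redirect-to-excluded scenario with additional seeds and `--generateCDX`, then checks both that `extraPages.jsonl` contains only its header line and that the CDXJ index has no `urn:pageinfo:` record for the excluded page, which exercises the `skipPageInfo` change in the recorder. A small helper equivalent to the inline `indexOf` check (illustrative only):

```ts
// Hypothetical helper; the test inlines this check with indexOf on the raw CDXJ text.
import fs from "fs";

function hasPageInfoRecord(cdxjPath: string, pageUrl: string): boolean {
  const data = fs.readFileSync(cdxjPath, { encoding: "utf-8" });
  return data.includes(`"urn:pageinfo:${pageUrl}"`);
}

// Mirrors the test's expectation:
// expect(hasPageInfoRecord("test-crawls/collections/redir-exclude-test-2/indexes/index.cdxj",
//   "https://www.iana.org/domains/example")).toBe(false);
```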