diff --git a/package.json b/package.json
index 2fcf16b7..ac4a647e 100644
--- a/package.json
+++ b/package.json
@@ -30,6 +30,7 @@
     "js-levenshtein": "^1.1.6",
     "js-yaml": "^4.1.0",
     "minio": "^7.1.3",
+    "normalize-url": "^8.1.0",
     "p-queue": "^7.3.4",
     "pixelmatch": "^5.3.0",
     "pngjs": "^7.0.0",
diff --git a/src/util/state.ts b/src/util/state.ts
index 3df430fc..856604a4 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -11,6 +11,7 @@ import {
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 import { interpolateFilename, UploadResult } from "./storage.js";
+import normalizeUrl, { Options as NormalizeUrlOptions } from "normalize-url";
 
 // ============================================================================
 export enum LoadState {
@@ -28,6 +29,23 @@ export enum QueueState {
   DUPE_URL = 2,
 }
 
+// ============================================================================
+// Options passed to normalize-url before a URL is queued or dedup-checked.
+// Query parameters are sorted; the other listed transforms are switched off.
+// NOTE(review): options not listed here keep the library defaults - confirm.
+const normalizeUrlOpts: NormalizeUrlOptions = {
+  defaultProtocol: "https",
+  stripAuthentication: false,
+  stripTextFragment: false,
+  stripWWW: false,
+  stripHash: false,
+  removeTrailingSlash: false,
+  removeSingleSlash: false,
+  removeExplicitPort: false,
+  sortQueryParameters: true,
+  removePath: false,
+};
+
 // ============================================================================
 // treat 0 or 206 as 200 for purposes of dedup
 function normalizeDedupStatus(status: number): number {
@@ -673,7 +691,6 @@ return inx;
     return res >= 3;
   }
 
-  //async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
   async addToQueue(
     {
       url,
@@ -685,6 +702,7 @@ return inx;
     }: QueueEntry,
     limit = 0,
   ) {
+    url = normalizeUrl(url, normalizeUrlOpts);
     const added = this._timestamp();
     const data: QueueEntry = { added, url, seedId, depth, extraHops };
 
@@ -1010,6 +1028,7 @@ return inx;
   }
 
   async addIfNoDupe(key: string, url: string, status: number) {
+    url = normalizeUrl(url, normalizeUrlOpts);
     return (
       (await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
       1
diff --git a/tests/url-normalize.test.js b/tests/url-normalize.test.js
new file mode 100644
index 00000000..b82c6089
--- /dev/null
+++ b/tests/url-normalize.test.js
@@ -0,0 +1,15 @@
+import fs from "fs";
+import child_process from "child_process";
+
+test("ensure URLs with same query args but in different order considered same URL", async () => {
+  child_process.execSync("docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url 'https://example-com.webrecorder.net/?A=1&B=2' --url 'https://example-com.webrecorder.net/?B=2&A=1' --collection url-norm-1 --scopeType page");
+
+  // url is normalized, only 1 URL is crawled
+  // check pages.jsonl for 1 URL (+ 1 header)
+  expect(fs.readFileSync(
+    "test-crawls/collections/url-norm-1/pages/pages.jsonl", "utf8",
+  )
+  .trim()
+  .split("\n").length).toBe(1 + 1);
+});
+
diff --git a/yarn.lock b/yarn.lock
index 94214f3f..e09d37b5 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -4150,6 +4150,11 @@ normalize-path@^3.0.0:
   resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65"
   integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==
 
+normalize-url@^8.1.0:
+  version "8.1.0"
+  resolved "https://registry.yarnpkg.com/normalize-url/-/normalize-url-8.1.0.tgz#d33504f67970decf612946fd4880bc8c0983486d"
+  integrity sha512-X06Mfd/5aKsRHc0O0J5CUedwnPmnDtLF2+nq+KN9KSDlJHkPuh0JUviWjEWMe0SW/9TDdSLVPuk7L5gGTIA1/w==
+
 npm-run-path@^4.0.1:
   version "4.0.1"
   resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea"