From 0b0f29a75466416c533a5bd9afd7846eaa181d1e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 5 Dec 2025 09:29:41 -0800 Subject: [PATCH 1/5] - use 'normalize-url' package to avoid differently sorted query args - configure other options, such as keeping www. and trailing slashes, only using this for query arg sorting --- package.json | 1 + src/util/state.ts | 18 +++++++++++++++++- yarn.lock | 5 +++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index 2fcf16b7..ac4a647e 100644 --- a/package.json +++ b/package.json @@ -30,6 +30,7 @@ "js-levenshtein": "^1.1.6", "js-yaml": "^4.1.0", "minio": "^7.1.3", + "normalize-url": "^8.1.0", "p-queue": "^7.3.4", "pixelmatch": "^5.3.0", "pngjs": "^7.0.0", diff --git a/src/util/state.ts b/src/util/state.ts index 3df430fc..f78abb50 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -11,6 +11,7 @@ import { import { ScopedSeed } from "./seeds.js"; import { Frame } from "puppeteer-core"; import { interpolateFilename, UploadResult } from "./storage.js"; +import normalizeUrl, { Options } from "normalize-url"; // ============================================================================ export enum LoadState { @@ -28,6 +29,20 @@ export enum QueueState { DUPE_URL = 2, } +// ============================================================================ +const normalizeOpts: Options = { + defaultProtocol: "https", + stripAuthentication: false, + stripTextFragment: false, + stripWWW: false, + stripHash: false, + removeTrailingSlash: false, + removeSingleSlash: false, + removeExplicitPort: true, + sortQueryParameters: true, + removePath: false, +}; + // ============================================================================ // treat 0 or 206 as 200 for purposes of dedup function normalizeDedupStatus(status: number): number { @@ -673,7 +688,6 @@ return inx; return res >= 3; } - //async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) { async addToQueue( { 
url, @@ -685,6 +699,7 @@ return inx; }: QueueEntry, limit = 0, ) { + url = normalizeUrl(url, normalizeOpts); const added = this._timestamp(); const data: QueueEntry = { added, url, seedId, depth, extraHops }; @@ -1010,6 +1025,7 @@ return inx; } async addIfNoDupe(key: string, url: string, status: number) { + url = normalizeUrl(url, normalizeOpts); return ( (await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) === 1 diff --git a/yarn.lock b/yarn.lock index 94214f3f..e09d37b5 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4150,6 +4150,11 @@ normalize-path@^3.0.0: resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65" integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA== +normalize-url@^8.1.0: + version "8.1.0" + resolved "https://registry.yarnpkg.com/normalize-url/-/normalize-url-8.1.0.tgz#d33504f67970decf612946fd4880bc8c0983486d" + integrity sha512-X06Mfd/5aKsRHc0O0J5CUedwnPmnDtLF2+nq+KN9KSDlJHkPuh0JUviWjEWMe0SW/9TDdSLVPuk7L5gGTIA1/w== + npm-run-path@^4.0.1: version "4.0.1" resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea" From 896059c73be8ed1cdb18b725965018d40e8a1182 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 5 Dec 2025 17:11:20 -0800 Subject: [PATCH 2/5] don't remove port --- src/util/state.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/state.ts b/src/util/state.ts index f78abb50..8539fd24 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -38,7 +38,7 @@ const normalizeOpts: Options = { stripHash: false, removeTrailingSlash: false, removeSingleSlash: false, - removeExplicitPort: true, + removeExplicitPort: false, sortQueryParameters: true, removePath: false, }; From 4c59c6d70e1345512a201f4783097db0d7c85e1b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 5 Dec 2025 17:26:14 -0800 Subject: [PATCH 3/5] add test 
--- tests/url-normalize.test.js | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/url-normalize.test.js diff --git a/tests/url-normalize.test.js b/tests/url-normalize.test.js new file mode 100644 index 00000000..b82c6089 --- /dev/null +++ b/tests/url-normalize.test.js @@ -0,0 +1,15 @@ +import fs from "fs"; +import child_process from "child_process"; + +test("ensure URLs with same query args but in different order considered same URL", async () => { + child_process.execSync("docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url 'https://example-com.webrecorder.net/?A=1&B=2' --url 'https://example-com.webrecorder.net/?B=2&A=1' --collection url-norm-1 --scopeType page"); + + // url is normalized, only 1 URL is crawled + // check pages.jsonl for 1 URL (+ 1 header) + expect(fs.readFileSync( + "test-crawls/collections/url-norm-1/pages/pages.jsonl", "utf8", + ) + .trim() + .split("\n").length).toBe(1 + 1); +}); + From f7c0adacf33f9a86e9fd2bbf993d564b1bf406ce Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 6 Dec 2025 13:13:08 -0800 Subject: [PATCH 4/5] adjust name --- src/util/state.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/util/state.ts b/src/util/state.ts index 8539fd24..2e45b6ad 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -30,7 +30,7 @@ export enum QueueState { } // ============================================================================ -const normalizeOpts: Options = { +const normalizeUrlOpts: Options = { defaultProtocol: "https", stripAuthentication: false, stripTextFragment: false, @@ -699,7 +699,7 @@ return inx; }: QueueEntry, limit = 0, ) { - url = normalizeUrl(url, normalizeOpts); + url = normalizeUrl(url, normalizeUrlOpts); const added = this._timestamp(); const data: QueueEntry = { added, url, seedId, depth, extraHops }; @@ -1025,7 +1025,7 @@ return inx; } async addIfNoDupe(key: string, url: string, status: number) { - url = 
normalizeUrl(url, normalizeOpts); + url = normalizeUrl(url, normalizeUrlOpts); return ( (await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) === 1 From 890ae1fa563ca9f6ca153a21fab498422bb2393c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 8 Dec 2025 14:04:47 -0800 Subject: [PATCH 5/5] use more specific name for options --- src/util/state.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/state.ts b/src/util/state.ts index 2e45b6ad..856604a4 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -11,7 +11,7 @@ import { import { ScopedSeed } from "./seeds.js"; import { Frame } from "puppeteer-core"; import { interpolateFilename, UploadResult } from "./storage.js"; -import normalizeUrl, { Options } from "normalize-url"; +import normalizeUrl, { Options as NormalizeUrlOptions } from "normalize-url"; // ============================================================================ export enum LoadState { @@ -30,7 +30,7 @@ export enum QueueState { } // ============================================================================ -const normalizeUrlOpts: Options = { +const normalizeUrlOpts: NormalizeUrlOptions = { defaultProtocol: "https", stripAuthentication: false, stripTextFragment: false,