Skip to content

Commit 4a703cd

Browse files
authored
sort query args before queuing URLs (#935)
- use the 'normalize-url' package so that URLs differing only in the order of their query args are treated as the same URL - configure the other options (e.g. keep www. and trailing slashes) so that the package is used only for query arg sorting
1 parent 993081d commit 4a703cd

File tree

4 files changed

+38
-1
lines changed

4 files changed

+38
-1
lines changed

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
"js-levenshtein": "^1.1.6",
3131
"js-yaml": "^4.1.0",
3232
"minio": "^7.1.3",
33+
"normalize-url": "^8.1.0",
3334
"p-queue": "^7.3.4",
3435
"pixelmatch": "^5.3.0",
3536
"pngjs": "^7.0.0",

src/util/state.ts

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import {
1111
import { ScopedSeed } from "./seeds.js";
1212
import { Frame } from "puppeteer-core";
1313
import { interpolateFilename, UploadResult } from "./storage.js";
14+
import normalizeUrl, { Options as NormamlizeUrlOptions } from "normalize-url";
1415

1516
// ============================================================================
1617
export enum LoadState {
@@ -28,6 +29,20 @@ export enum QueueState {
2829
DUPE_URL = 2,
2930
}
3031

32+
// ============================================================================
33+
// Options handed to normalizeUrl() wherever this file normalizes a URL
// (on entry to addToQueue and addIfNoDupe).
// Every strip*/remove* option listed below is set to false and only
// sortQueryParameters is enabled, so URLs that differ solely in the order of
// their query args normalize to the same string.
// `NormamlizeUrlOptions` (sic) is the alias under which normalize-url's
// `Options` type is imported at the top of the file.
// NOTE(review): options NOT listed here keep normalize-url's defaults. The
// package README documents a default `removeQueryParameters` of [/^utm_\w+/i],
// which would also drop utm_* args — confirm whether that is intended or
// whether it should be disabled explicitly.
const normalizeUrlOpts: NormamlizeUrlOptions = {
34+
defaultProtocol: "https",
35+
stripAuthentication: false,
36+
stripTextFragment: false,
37+
stripWWW: false,
38+
stripHash: false,
39+
removeTrailingSlash: false,
40+
removeSingleSlash: false,
41+
removeExplicitPort: false,
42+
sortQueryParameters: true,
43+
removePath: false,
44+
};
45+
3146
// ============================================================================
3247
// treat 0 or 206 as 200 for purposes of dedup
3348
function normalizeDedupStatus(status: number): number {
@@ -675,7 +690,6 @@ return inx;
675690
return res >= 3;
676691
}
677692

678-
//async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
679693
async addToQueue(
680694
{
681695
url,
@@ -687,6 +701,7 @@ return inx;
687701
}: QueueEntry,
688702
limit = 0,
689703
) {
704+
url = normalizeUrl(url, normalizeUrlOpts);
690705
const added = this._timestamp();
691706
const data: QueueEntry = { added, url, seedId, depth, extraHops };
692707

@@ -1012,6 +1027,7 @@ return inx;
10121027
}
10131028

10141029
async addIfNoDupe(key: string, url: string, status: number) {
1030+
url = normalizeUrl(url, normalizeUrlOpts);
10151031
return (
10161032
(await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
10171033
1

tests/url-normalize.test.js

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import fs from "fs";
2+
import child_process from "child_process";
3+
4+
// Integration test: runs the crawler image via `docker run` with two --url
// seeds that carry the same query args in a different order (?A=1&B=2 vs
// ?B=2&A=1), writing into the `url-norm-1` collection under ./test-crawls.
// It then reads that collection's pages.jsonl and expects exactly two lines.
// execSync blocks until the docker command exits and throws if it fails.
test("ensure URLs with same query args but in different order considered same URL", async () => {
5+
child_process.execSync("docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url 'https://example-com.webrecorder.net/?A=1&B=2' --url 'https://example-com.webrecorder.net/?B=2&A=1' --collection url-norm-1 --scopeType page");
6+
7+
// url is normalized, only 1 URL is crawled
8+
// check pages.jsonl for 1 URL (+ 1 header)
9+
expect(fs.readFileSync(
10+
"test-crawls/collections/url-norm-1/pages/pages.jsonl", "utf8",
11+
)
12+
.trim()
13+
.split("\n").length).toBe(1 + 1);
14+
});
15+

yarn.lock

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4150,6 +4150,11 @@ normalize-path@^3.0.0:
41504150
resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65"
41514151
integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==
41524152

4153+
normalize-url@^8.1.0:
4154+
version "8.1.0"
4155+
resolved "https://registry.yarnpkg.com/normalize-url/-/normalize-url-8.1.0.tgz#d33504f67970decf612946fd4880bc8c0983486d"
4156+
integrity sha512-X06Mfd/5aKsRHc0O0J5CUedwnPmnDtLF2+nq+KN9KSDlJHkPuh0JUviWjEWMe0SW/9TDdSLVPuk7L5gGTIA1/w==
4157+
41534158
npm-run-path@^4.0.1:
41544159
version "4.0.1"
41554160
resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea"

0 commit comments

Comments
 (0)