Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"js-levenshtein": "^1.1.6",
"js-yaml": "^4.1.0",
"minio": "^7.1.3",
"normalize-url": "^8.1.0",
"p-queue": "^7.3.4",
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
Expand Down
18 changes: 17 additions & 1 deletion src/util/state.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import {
import { ScopedSeed } from "./seeds.js";
import { Frame } from "puppeteer-core";
import { interpolateFilename, UploadResult } from "./storage.js";
import normalizeUrl, { Options } from "normalize-url";

// ============================================================================
export enum LoadState {
Expand All @@ -28,6 +29,20 @@ export enum QueueState {
DUPE_URL = 2,
}

// ============================================================================
// Options for normalize-url used when adding URLs to the crawl queue and to
// the dedup set. The intent is to be as conservative as possible: every
// "strip"/"remove" transformation is explicitly disabled so the URL is kept
// byte-for-byte (auth, text fragment, www, hash, trailing/single slash,
// explicit port all preserved). The only normalization actually applied is
// sortQueryParameters, so URLs differing solely in query-arg ORDER are
// treated as the same URL. defaultProtocol only applies to scheme-less
// input (normalize-url prepends "https" in that case).
const normalizeUrlOpts: Options = {
defaultProtocol: "https",
stripAuthentication: false,
stripTextFragment: false,
stripWWW: false,
stripHash: false,
removeTrailingSlash: false,
removeSingleSlash: false,
removeExplicitPort: false,
// the one option that changes anything: canonical query-param order
sortQueryParameters: true,
removePath: false,
};

// ============================================================================
// treat 0 or 206 as 200 for purposes of dedup
function normalizeDedupStatus(status: number): number {
Expand Down Expand Up @@ -673,7 +688,6 @@ return inx;
return res >= 3;
}

//async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
async addToQueue(
{
url,
Expand All @@ -685,6 +699,7 @@ return inx;
}: QueueEntry,
limit = 0,
) {
url = normalizeUrl(url, normalizeUrlOpts);
const added = this._timestamp();
const data: QueueEntry = { added, url, seedId, depth, extraHops };

Expand Down Expand Up @@ -1010,6 +1025,7 @@ return inx;
}

async addIfNoDupe(key: string, url: string, status: number) {
url = normalizeUrl(url, normalizeUrlOpts);
return (
(await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
1
Expand Down
15 changes: 15 additions & 0 deletions tests/url-normalize.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import fs from "fs";
import child_process from "child_process";

// Regression test for URL normalization in the crawl queue: the two seed
// URLs below differ only in query-parameter order, so after normalization
// (sortQueryParameters) they should collapse to a single URL and be crawled
// exactly once. execSync blocks until the docker-based crawl completes,
// then the resulting pages.jsonl is inspected.
// NOTE(review): relies on docker and the webrecorder/browsertrix-crawler
// image being available, and on network access to example-com.webrecorder.net.
test("ensure URLs with same query args but in different order considered same URL", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url 'https://example-com.webrecorder.net/?A=1&B=2' --url 'https://example-com.webrecorder.net/?B=2&A=1' --collection url-norm-1 --scopeType page");

// url is normalized, only 1 URL is crawled
// check pages.jsonl for 1 URL (+ 1 header)
// pages.jsonl has one JSON header line plus one line per crawled page,
// so a single crawled URL yields exactly 2 lines
expect(fs.readFileSync(
"test-crawls/collections/url-norm-1/pages/pages.jsonl", "utf8",
)
.trim()
.split("\n").length).toBe(1 + 1);
});

5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4150,6 +4150,11 @@ normalize-path@^3.0.0:
resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65"
integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==

normalize-url@^8.1.0:
version "8.1.0"
resolved "https://registry.yarnpkg.com/normalize-url/-/normalize-url-8.1.0.tgz#d33504f67970decf612946fd4880bc8c0983486d"
integrity sha512-X06Mfd/5aKsRHc0O0J5CUedwnPmnDtLF2+nq+KN9KSDlJHkPuh0JUviWjEWMe0SW/9TDdSLVPuk7L5gGTIA1/w==

npm-run-path@^4.0.1:
version "4.0.1"
resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea"
Expand Down
Loading