Skip to content

Commit 30646ca

Browse files
authored
Add downloads dir to cache external dependency within the crawl (#921)
Fixes #920 - Downloads the profile, custom behaviors, and seed list to the `/downloads` directory in the crawl - Seed File: Downloaded into `/downloads`. Never refetched on subsequent crawl restarts if it already exists. - Custom Behaviors: Git: Downloaded into a directory, then moved to /downloads/behaviors/<dir name>. If that directory already exists, a failed download will reuse the existing directory. - Custom Behaviors: File: Downloaded into a temp file, then moved to /downloads/behaviors/<name.js>. If that file already exists, a failed download will reuse the existing file. - Profile: Uses the `/profile` directory to contain the browser profile. - Profile: Downloaded to a temp file, then placed into /downloads/profile.tar.gz. If the download fails but a profile already exists, the existing /profile directory is used. - Also fixes #897
1 parent 1d15a15 commit 30646ca

File tree

11 files changed

+423
-121
lines changed

11 files changed

+423
-121
lines changed

src/crawler.ts

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,8 @@ export class Crawler {
156156
warcCdxDir: string;
157157
indexesDir: string;
158158

159+
downloadsDir: string;
160+
159161
screenshotWriter: WARCWriter | null;
160162
textWriter: WARCWriter | null;
161163

@@ -289,6 +291,9 @@ export class Crawler {
289291
this.warcCdxDir = path.join(this.collDir, "warc-cdx");
290292
this.indexesDir = path.join(this.collDir, "indexes");
291293

294+
// download dirs
295+
this.downloadsDir = path.join(this.collDir, "downloads");
296+
292297
this.screenshotWriter = null;
293298
this.textWriter = null;
294299

@@ -307,7 +312,7 @@ export class Crawler {
307312

308313
this.customBehaviors = "";
309314

310-
this.browser = new Browser();
315+
this.browser = new Browser(this.collDir);
311316
}
312317

313318
protected parseArgs() {
@@ -503,6 +508,8 @@ export class Crawler {
503508
await fsp.mkdir(this.warcCdxDir, { recursive: true });
504509
}
505510

511+
await fsp.mkdir(this.downloadsDir, { recursive: true });
512+
506513
this.logFH = fs.createWriteStream(this.logFilename, { flags: "a" });
507514
logger.setExternalLogStream(this.logFH);
508515

@@ -514,7 +521,7 @@ export class Crawler {
514521
this.proxyServer = res.proxyServer;
515522
this.proxyPacUrl = res.proxyPacUrl;
516523

517-
this.seeds = await parseSeeds(this.params);
524+
this.seeds = await parseSeeds(this.downloadsDir, this.params);
518525
this.numOriginalSeeds = this.seeds.length;
519526

520527
logger.info("Seeds", this.seeds);
@@ -1015,7 +1022,10 @@ self.__bx_behaviors.selectMainBehavior();
10151022
async loadCustomBehaviors(sources: string[]) {
10161023
let str = "";
10171024

1018-
for (const { contents } of await collectCustomBehaviors(sources)) {
1025+
for (const { contents } of await collectCustomBehaviors(
1026+
this.downloadsDir,
1027+
sources,
1028+
)) {
10191029
str += `self.__bx_behaviors.load(${contents});\n`;
10201030
}
10211031

@@ -1029,7 +1039,10 @@ self.__bx_behaviors.selectMainBehavior();
10291039
return;
10301040
}
10311041

1032-
for (const { path, contents } of await collectCustomBehaviors(sources)) {
1042+
for (const { path, contents } of await collectCustomBehaviors(
1043+
this.downloadsDir,
1044+
sources,
1045+
)) {
10331046
await this.browser.checkScript(cdp, path, contents);
10341047
}
10351048
}

src/create-login-profile.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env node
22

33
import fs from "fs";
4+
import os from "os";
45
import http, { IncomingMessage, ServerResponse } from "http";
56

67
import readline from "readline";
@@ -203,7 +204,7 @@ async function main() {
203204
]);
204205
}
205206

206-
const browser = new Browser();
207+
const browser = new Browser(os.tmpdir());
207208

208209
await browser.launch({
209210
profileUrl: params.profile,

src/util/browser.ts

Lines changed: 76 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import * as child_process from "child_process";
22
import fs from "fs";
3+
import fsp from "node:fs/promises";
34
import { pipeline } from "node:stream/promises";
45
import { Readable } from "node:stream";
6+
import crypto from "crypto";
57

6-
import os from "os";
78
import path from "path";
89

910
import { formatErr, LogContext, logger } from "./logger.js";
@@ -31,6 +32,7 @@ import puppeteer, {
3132
import { Recorder } from "./recorder.js";
3233
import { timedRun } from "./timing.js";
3334
import assert from "node:assert";
35+
import { replaceDir } from "./file_reader.js";
3436

3537
type BtrixChromeOpts = {
3638
proxyServer?: string;
@@ -61,6 +63,7 @@ const BROWSER_HEIGHT_OFFSET = 81;
6163

6264
// ==================================================================
6365
export class Browser {
66+
downloadsDir: string;
6467
profileDir: string;
6568
customProfile = false;
6669
// TODO: Fix this the next time the file is edited.
@@ -81,12 +84,9 @@ export class Browser {
8184
screenHeight: number;
8285
screenWHRatio: number;
8386

84-
constructor() {
85-
this.profileDir = path.join(os.tmpdir(), "btrixProfile");
86-
if (fs.existsSync(this.profileDir)) {
87-
fs.rmSync(this.profileDir, { recursive: true, force: true });
88-
}
89-
fs.mkdirSync(this.profileDir);
87+
constructor(rootDir: string) {
88+
this.downloadsDir = path.join(rootDir, "downloads");
89+
this.profileDir = path.join(rootDir, "profile");
9090

9191
// must be provided, part of Dockerfile
9292
assert(process.env.GEOMETRY);
@@ -112,9 +112,7 @@ export class Browser {
112112
return;
113113
}
114114

115-
if (profileUrl) {
116-
this.customProfile = await this.loadProfile(profileUrl);
117-
}
115+
await this.installProfile(profileUrl);
118116

119117
this.swOpt = swOpt;
120118

@@ -190,61 +188,97 @@ export class Browser {
190188
}
191189
}
192190

193-
async loadProfile(profileFilename: string): Promise<boolean> {
194-
const targetFilename = path.join(os.tmpdir(), "profile.tar.gz");
191+
async installProfile(profileUrl: string) {
192+
await fsp.mkdir(this.profileDir, { recursive: true });
193+
194+
if (!profileUrl) {
195+
return;
196+
}
197+
198+
const profileTarGz = path.join(this.downloadsDir, "profile.tar.gz");
199+
200+
const exists = fs.existsSync(profileTarGz);
201+
202+
const suffix = crypto.randomBytes(4).toString("hex");
203+
204+
const tmpProfileDest = path.join(
205+
this.downloadsDir,
206+
`profile-${suffix}.tar.gz`,
207+
);
208+
const tmpProfileDir = path.join(this.downloadsDir, `profile-${suffix}`);
209+
210+
await fsp.mkdir(tmpProfileDir, { recursive: true });
211+
212+
try {
213+
await this.loadProfile(profileUrl, tmpProfileDest, tmpProfileDir);
214+
215+
// replace old profile dir with new profile dir
216+
await replaceDir(tmpProfileDir, this.profileDir, exists);
217+
218+
// replace old tarball with new tarball
219+
await fsp.rename(tmpProfileDest, profileTarGz);
220+
} catch (e) {
221+
if (exists) {
222+
logger.warn(
223+
"Error updating profile, using existing profile",
224+
formatErr(e),
225+
"browser",
226+
);
227+
} else {
228+
// remove the temp profile dir, likely empty
229+
await fsp.rm(tmpProfileDir, { recursive: true });
230+
logger.fatal("Profile setup failed", formatErr(e), "browser");
231+
}
232+
}
233+
this.customProfile = true;
234+
}
195235

236+
async loadProfile(
237+
profileRemoteSrc: string,
238+
profileLocalSrc: string,
239+
profileDir: string,
240+
) {
196241
if (
197-
profileFilename &&
198-
(profileFilename.startsWith("http:") ||
199-
profileFilename.startsWith("https:"))
242+
profileRemoteSrc &&
243+
(profileRemoteSrc.startsWith("http:") ||
244+
profileRemoteSrc.startsWith("https:"))
200245
) {
201246
logger.info(
202-
`Downloading ${profileFilename} to ${targetFilename}`,
247+
`Downloading ${profileRemoteSrc} to ${profileLocalSrc}`,
203248
{},
204249
"browser",
205250
);
206251

207-
const resp = await fetch(profileFilename);
252+
const resp = await fetch(profileRemoteSrc);
208253
await pipeline(
209254
// TODO: Fix this the next time the file is edited.
210255
// eslint-disable-next-line @typescript-eslint/no-explicit-any
211256
Readable.fromWeb(resp.body as any),
212-
fs.createWriteStream(targetFilename),
257+
fs.createWriteStream(profileLocalSrc),
213258
);
214-
215-
profileFilename = targetFilename;
216-
} else if (profileFilename && profileFilename.startsWith("@")) {
259+
} else if (profileRemoteSrc && profileRemoteSrc.startsWith("@")) {
217260
const storage = initStorage();
218261

219262
if (!storage) {
220-
logger.fatal(
263+
throw new Error(
221264
"Profile specified relative to s3 storage, but no S3 storage defined",
222265
);
223-
return false;
224266
}
225267

226-
await storage.downloadFile(profileFilename.slice(1), targetFilename);
227-
228-
profileFilename = targetFilename;
268+
await storage.downloadFile(profileRemoteSrc.slice(1), profileLocalSrc);
269+
} else {
270+
await fsp.copyFile(profileRemoteSrc, profileLocalSrc);
229271
}
230272

231-
if (profileFilename) {
232-
try {
233-
child_process.execSync("tar xvfz " + profileFilename, {
234-
cwd: this.profileDir,
235-
});
236-
this.removeSingletons();
237-
return true;
238-
} catch (e) {
239-
logger.fatal(
240-
`Profile filename ${profileFilename} not a valid tar.gz, can not load profile, exiting`,
241-
{},
242-
"browser",
243-
);
244-
}
273+
try {
274+
child_process.execSync("tar xvfz " + profileLocalSrc, {
275+
cwd: profileDir,
276+
stdio: "ignore",
277+
});
278+
this.removeSingletons();
279+
} catch (e) {
280+
throw new Error(`Profile ${profileLocalSrc} not a valid tar.gz`);
245281
}
246-
247-
return false;
248282
}
249283

250284
removeSingletons() {

0 commit comments

Comments
 (0)