
Commit ffe9c34

🤖 perf: reduce SSH backoff max to 10s with jitter (#1249)
Reduces max SSH backoff from 60s to 10s for faster recovery after transient failures.

**Changes:**

- Backoff schedule: `[1, 2, 4, 7, 10]` seconds (was `[1, 5, 10, 20, 40, 60]`)
- Add ±20% jitter to prevent thundering herd when different hosts recover simultaneously
- Same-host serialization via singleflighting remains unchanged (herd only released on success)

**Tests added:**

- Backoff caps at ~10s with jitter
- Callers waking from backoff share single probe

---

_Generated with `mux` • Model: `anthropic:claude-opus-4-5` • Thinking: `high`_
1 parent 0202716 commit ffe9c34
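For a sense of the new timings, here is a minimal standalone sketch that mirrors the `BACKOFF_SCHEDULE` indexing and ±20% jitter from the diff below; the `backoffWindow` helper is illustrative only and is not part of the pool:

```ts
// Illustrative only: reproduces the schedule and jitter bounds from
// sshConnectionPool.ts to show the effective wait window per consecutive failure.
const BACKOFF_SCHEDULE = [1, 2, 4, 7, 10]; // seconds

function backoffWindow(consecutiveFailures: number): { minSecs: number; maxSecs: number } {
  const idx = Math.min(consecutiveFailures - 1, BACKOFF_SCHEDULE.length - 1);
  const base = BACKOFF_SCHEDULE[idx];
  return { minSecs: base * 0.8, maxSecs: base * 1.2 }; // ±20% jitter bounds
}

for (let failures = 1; failures <= 6; failures++) {
  const { minSecs, maxSecs } = backoffWindow(failures);
  console.log(`failure ${failures}: ${minSecs.toFixed(1)}s – ${maxSecs.toFixed(1)}s`);
}
// failure 1: 0.8s – 1.2s
// failure 2: 1.6s – 2.4s
// ...
// failure 5+: 8.0s – 12.0s (cap; the test below allows 7.5s–12.5s for tolerance)
```

The worst case before a retry therefore drops from 60s under the old schedule to roughly 12s.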

2 files changed: +80 −4 lines

src/node/runtime/sshConnectionPool.test.ts

Lines changed: 67 additions & 0 deletions
```diff
@@ -185,6 +185,26 @@ describe("SSHConnectionPool", () => {
     expect(health?.backoffUntil).toBeDefined();
   });

+  test("backoff caps at ~10s with jitter", () => {
+    const pool = new SSHConnectionPool();
+    const config: SSHRuntimeConfig = {
+      host: "test.example.com",
+      srcBaseDir: "/work",
+    };
+
+    // Report many failures to hit the cap
+    for (let i = 0; i < 10; i++) {
+      pool.reportFailure(config, "Connection refused");
+    }
+
+    const health = pool.getConnectionHealth(config)!;
+    const backoffMs = health.backoffUntil!.getTime() - Date.now();
+
+    // Max base is 10s, jitter adds ±20%, so max is ~12s (10 * 1.2)
+    expect(backoffMs).toBeGreaterThan(7_500); // 10 * 0.8 - some tolerance
+    expect(backoffMs).toBeLessThanOrEqual(12_500); // 10 * 1.2 + some tolerance
+  });
+
   test("resetBackoff clears backoff state after failed probe", async () => {
     const pool = new SSHConnectionPool();
     const config: SSHRuntimeConfig = {
@@ -317,5 +337,52 @@ describe("SSHConnectionPool", () => {
     // Only 1 failure should be recorded (not 3) - proves singleflighting worked
     expect(pool.getConnectionHealth(config)?.consecutiveFailures).toBe(1);
   });
+
+  test("callers waking from backoff share single probe (herd only released on success)", async () => {
+    const pool = new SSHConnectionPool();
+    const config: SSHRuntimeConfig = {
+      host: "test.example.com",
+      srcBaseDir: "/work",
+    };
+
+    // Put connection in backoff
+    pool.reportFailure(config, "Initial failure");
+    expect(pool.getConnectionHealth(config)?.consecutiveFailures).toBe(1);
+
+    let probeCount = 0;
+    const sleepResolvers: Array<() => void> = [];
+
+    // Start 3 waiters - they'll all sleep through backoff
+    const waiters = [1, 2, 3].map(() =>
+      pool.acquireConnection(config, {
+        sleep: () =>
+          new Promise<void>((resolve) => {
+            sleepResolvers.push(() => {
+              // When sleep resolves, simulate recovery (mark healthy)
+              // This happens during the first probe - all waiters share it
+              if (probeCount === 0) {
+                probeCount++;
+                pool.markHealthy(config);
+              }
+              resolve();
+            });
+          }),
+      })
+    );
+
+    // Let all sleepers proceed
+    await Promise.resolve(); // Let all acquireConnection calls reach sleep
+    expect(sleepResolvers.length).toBe(3);
+
+    // Wake them all up "simultaneously"
+    sleepResolvers.forEach((resolve) => resolve());
+
+    // All should succeed
+    await Promise.all(waiters);
+
+    // Only one "probe" (markHealthy) should have happened
+    expect(probeCount).toBe(1);
+    expect(pool.getConnectionHealth(config)?.status).toBe("healthy");
+  });
 });
 });
```
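The second test added here relies on the pool's existing same-host singleflighting, which this commit leaves untouched and whose implementation is not shown in this diff. For orientation only, a hypothetical sketch of that pattern follows; the `probeInFlight` map and `probeOnce` helper are invented names, not the pool's actual code:

```ts
// Hypothetical sketch of per-host singleflighting: callers waking from backoff
// for the same host await one shared probe instead of each opening its own
// SSH connection attempt. Not the pool's actual implementation.
const probeInFlight = new Map<string, Promise<void>>();

function probeOnce(hostKey: string, probe: () => Promise<void>): Promise<void> {
  let inFlight = probeInFlight.get(hostKey);
  if (!inFlight) {
    // First caller starts the probe; later callers share the same promise.
    inFlight = probe().finally(() => probeInFlight.delete(hostKey));
    probeInFlight.set(hostKey, inFlight);
  }
  return inFlight;
}
```

In the test, that shared probe is stood in for by the injected `sleep` callback calling `pool.markHealthy(config)` at most once, which is why `probeCount` ends at 1 even though three waiters were started.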

src/node/runtime/sshConnectionPool.ts

Lines changed: 13 additions & 4 deletions
```diff
@@ -54,9 +54,18 @@ export interface ConnectionHealth {
 }

 /**
- * Backoff schedule in seconds: 1s → 5s → 10s → 20s → 40s → 60s (cap)
+ * Backoff schedule in seconds: 1s → 2s → 4s → 7s → 10s (cap)
+ * Kept short to avoid blocking user actions; thundering herd is mitigated by jitter.
  */
-const BACKOFF_SCHEDULE = [1, 5, 10, 20, 40, 60];
+const BACKOFF_SCHEDULE = [1, 2, 4, 7, 10];
+
+/**
+ * Add ±20% jitter to prevent thundering herd when multiple clients recover simultaneously.
+ */
+function withJitter(seconds: number): number {
+  const jitterFactor = 0.8 + Math.random() * 0.4; // 0.8 to 1.2
+  return seconds * jitterFactor;
+}

 /**
  * Time after which a "healthy" connection should be re-probed.
@@ -315,7 +324,7 @@ export class SSHConnectionPool {
     const current = this.health.get(key);
     const failures = (current?.consecutiveFailures ?? 0) + 1;
     const backoffIndex = Math.min(failures - 1, BACKOFF_SCHEDULE.length - 1);
-    const backoffSecs = BACKOFF_SCHEDULE[backoffIndex];
+    const backoffSecs = withJitter(BACKOFF_SCHEDULE[backoffIndex]);

     this.health.set(key, {
       status: "unhealthy",
@@ -326,7 +335,7 @@
     });

     log.warn(
-      `SSH connection failed (${failures} consecutive). Backoff for ${backoffSecs}s. Error: ${error}`
+      `SSH connection failed (${failures} consecutive). Backoff for ${backoffSecs.toFixed(1)}s. Error: ${error}`
     );
   }
```