Skip to content

Commit 1d879d5

Browse files
committed
🤖 Fix SSH exec timeout with ControlMaster socket reuse
## Problem SSH commands with timeout could run indefinitely on remote hosts despite client-side timeouts. When ControlMaster multiplexing was enabled, killing the local SSH client didn't terminate the remote command—it continued running under the shared master connection. ## Solution **Three-layer timeout protection:** 1. **Client-side timeout** - SIGKILL after timeout expires (immediate local cleanup) 2. **Remote-side timeout** - Wrap commands with `timeout -s KILL` utility 3. **SSH connection timeout** - ConnectTimeout + ServerAlive* options Remote timeout is set to `timeout + 1s` to ensure local timeout normally fires first for cleaner error handling. Uses BusyBox-compatible syntax (`-s KILL`) for Alpine Linux compatibility. **ControlMaster always enabled** - Socket reuse now works for all operations, including timed commands. This preserves 50-200ms performance savings per operation from connection pooling. ## Testing - Added timeout integration test for both local and SSH runtimes - Verifies `sleep 10` with 1s timeout completes in ~1s (not 10s) - All 88 integration tests pass - 807 unit tests pass (4 pre-existing gitService failures unrelated) ## Impact Commands with timeout now properly terminate on remote hosts while maintaining the performance benefits of SSH connection multiplexing.
1 parent f36f7a4 commit 1d879d5

File tree

2 files changed

+61
-4
lines changed

2 files changed

+61
-4
lines changed

src/runtime/SSHRuntime.ts

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,22 @@ export class SSHRuntime implements Runtime {
9696
parts.push(command);
9797

9898
// Join all parts with && to ensure each step succeeds before continuing
99-
const fullCommand = parts.join(" && ");
99+
let fullCommand = parts.join(" && ");
100+
101+
// Wrap remote command with timeout if specified
102+
// This ensures the command is killed on the remote side even if the local
103+
// SSH client is killed but the ControlMaster connection persists
104+
if (options.timeout !== undefined) {
105+
// Use timeout command with KILL signal
106+
// Set remote timeout slightly longer (+1s) than local timeout to ensure
107+
// the local timeout fires first in normal cases (for cleaner error handling)
108+
// Note: Using BusyBox-compatible syntax (-s KILL) which also works with GNU timeout
109+
const remoteTimeout = Math.ceil(options.timeout) + 1;
110+
fullCommand = `timeout -s KILL ${remoteTimeout} bash -c ${shescape.quote(fullCommand)}`;
111+
}
100112

101113
// Wrap in bash -c with shescape for safe shell execution
102-
const remoteCommand = `bash -c ${shescape.quote(fullCommand)}`;
114+
const remoteCommand = options.timeout ? fullCommand : `bash -c ${shescape.quote(fullCommand)}`;
103115

104116
// Build SSH args
105117
const sshArgs: string[] = ["-T"];
@@ -123,10 +135,32 @@ export class SSHRuntime implements Runtime {
123135
// ControlMaster=auto: Create master connection if none exists, otherwise reuse
124136
// ControlPath: Unix socket path for multiplexing
125137
// ControlPersist=60: Keep master connection alive for 60s after last session
138+
//
139+
// Socket reuse is safe even with timeouts because:
140+
// - Each SSH command gets its own channel within the multiplexed connection
141+
// - SIGKILL on the client immediately closes that channel
142+
// - Remote sshd terminates the command when the channel closes
143+
// - Multiplexing only shares the TCP connection, not command lifetime
126144
sshArgs.push("-o", "ControlMaster=auto");
127145
sshArgs.push("-o", `ControlPath=${this.controlPath}`);
128146
sshArgs.push("-o", "ControlPersist=60");
129147

148+
// Set comprehensive timeout options to ensure SSH respects the timeout
149+
// ConnectTimeout: Maximum time to wait for connection establishment (DNS, TCP handshake, SSH auth)
150+
// ServerAliveInterval: Send keepalive every 5 seconds to detect dead connections
151+
// ServerAliveCountMax: Consider connection dead after 2 missed keepalives (10 seconds total)
152+
// Together these ensure that:
153+
// 1. Connection establishment can't hang indefinitely
154+
// 2. Established connections that die are detected quickly
155+
// 3. The overall timeout is respected from the moment ssh command starts
156+
if (options.timeout !== undefined) {
157+
// Use the configured timeout for connection establishment
158+
sshArgs.push("-o", `ConnectTimeout=${Math.ceil(options.timeout)}`);
159+
// Set aggressive keepalives to detect dead connections
160+
sshArgs.push("-o", "ServerAliveInterval=5");
161+
sshArgs.push("-o", "ServerAliveCountMax=2");
162+
}
163+
130164
sshArgs.push(this.config.host, remoteCommand);
131165

132166
// Debug: log the actual SSH command being executed
@@ -173,14 +207,14 @@ export class SSHRuntime implements Runtime {
173207

174208
// Handle abort signal
175209
if (options.abortSignal) {
176-
options.abortSignal.addEventListener("abort", () => sshProcess.kill());
210+
options.abortSignal.addEventListener("abort", () => sshProcess.kill("SIGKILL"));
177211
}
178212

179213
// Handle timeout
180214
if (options.timeout !== undefined) {
181215
setTimeout(() => {
182216
timedOut = true;
183-
sshProcess.kill();
217+
sshProcess.kill("SIGKILL");
184218
}, options.timeout * 1000);
185219
}
186220

tests/runtime/runtime.test.ts

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,29 @@ describeIntegration("Runtime integration tests", () => {
142142

143143
expect(result.stdout.trim()).toContain(workspace.path);
144144
});
145+
test.concurrent(
146+
"handles timeout correctly",
147+
async () => {
148+
const runtime = createRuntime();
149+
await using workspace = await TestWorkspace.create(runtime, type);
150+
151+
// Command that sleeps longer than timeout
152+
const startTime = performance.now();
153+
const result = await execBuffered(runtime, "sleep 10", {
154+
cwd: workspace.path,
155+
timeout: 1, // 1 second timeout
156+
});
157+
const duration = performance.now() - startTime;
158+
159+
// Exit code should be EXIT_CODE_TIMEOUT (-998)
160+
expect(result.exitCode).toBe(-998);
161+
// Should complete in around 1 second, not 10 seconds
162+
// Allow some margin for overhead (especially on SSH)
163+
expect(duration).toBeLessThan(3000); // 3 seconds max
164+
expect(duration).toBeGreaterThan(500); // At least 0.5 seconds
165+
},
166+
15000
167+
); // 15 second timeout for test (includes workspace creation overhead)
145168
});
146169

147170
describe("readFile() - File reading", () => {

0 commit comments

Comments
 (0)