Skip to content

Commit ece9431

Browse files
committed
upgrade the runners to 8gb / 8cpu after eks fail
1 parent 053dba4 commit ece9431

File tree

6 files changed

+75
-16
lines changed

6 files changed

+75
-16
lines changed

sandbox-sidecar/src/runners/e2bRunner.ts

Lines changed: 58 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -94,7 +94,15 @@ export class E2BSandboxRunner implements SandboxRunner {
9494
private async runApply(job: SandboxRunRecord, appendLog?: (chunk: string) => void): Promise<RunnerOutput> {
9595
const requestedVersion = job.payload.terraformVersion || "1.5.7";
9696
const requestedEngine = job.payload.engine || "terraform";
97+
const startTime = Date.now();
9798
const { sandbox, needsInstall } = await this.createSandbox(requestedVersion, requestedEngine);
99+
100+
logger.info({
101+
sandboxId: sandbox.sandboxId,
102+
workingDir: job.payload.workingDirectory,
103+
isDestroy: job.payload.isDestroy,
104+
}, "Starting apply operation");
105+
98106
try {
99107
// Install IaC tool if using fallback template
100108
if (needsInstall) {
@@ -110,6 +118,8 @@ export class E2BSandboxRunner implements SandboxRunner {
110118

111119
// Run terraform init (with AWS creds if configured for benchmark)
112120
const metadata = job.payload.metadata;
121+
122+
logger.info({ sandboxId: sandbox.sandboxId, elapsed: Date.now() - startTime }, "Starting terraform init");
113123
await this.runTerraformCommand(
114124
sandbox,
115125
workDir,
@@ -118,9 +128,11 @@ export class E2BSandboxRunner implements SandboxRunner {
118128
streamLog,
119129
metadata,
120130
);
131+
logger.info({ sandboxId: sandbox.sandboxId, elapsed: Date.now() - startTime }, "Terraform init completed");
121132

122133
// Run terraform apply/destroy
123134
const applyCommand = job.payload.isDestroy ? "destroy" : "apply";
135+
logger.info({ sandboxId: sandbox.sandboxId, command: applyCommand, elapsed: Date.now() - startTime }, "Starting terraform apply/destroy");
124136
const applyResult = await this.runTerraformCommand(
125137
sandbox,
126138
workDir,
@@ -129,6 +141,7 @@ export class E2BSandboxRunner implements SandboxRunner {
129141
streamLog,
130142
metadata,
131143
);
144+
logger.info({ sandboxId: sandbox.sandboxId, command: applyCommand, elapsed: Date.now() - startTime }, "Terraform apply/destroy completed");
132145

133146
// Log the apply output for debugging
134147
logger.info({
@@ -173,9 +186,30 @@ export class E2BSandboxRunner implements SandboxRunner {
173186
state: stateBase64,
174187
};
175188

189+
logger.info({ sandboxId: sandbox.sandboxId, elapsed: Date.now() - startTime }, "Apply operation completed successfully");
176190
return { logs: logs.join(""), result };
191+
} catch (err) {
192+
const elapsed = Date.now() - startTime;
193+
const errorMessage = err instanceof Error ? err.message : String(err);
194+
195+
// Log detailed error info for debugging sandbox termination issues
196+
logger.error({
197+
sandboxId: sandbox.sandboxId,
198+
elapsed,
199+
elapsedSeconds: Math.round(elapsed / 1000),
200+
errorMessage,
201+
errorType: err instanceof Error ? err.constructor.name : typeof err,
202+
workingDir: job.payload.workingDirectory,
203+
}, "Apply operation failed - sandbox may have been terminated");
204+
205+
throw err;
177206
} finally {
178-
await sandbox.kill();
207+
try {
208+
await sandbox.kill();
209+
} catch (killErr) {
210+
// Sandbox may already be dead, that's fine
211+
logger.debug({ killErr }, "Failed to kill sandbox (may already be terminated)");
212+
}
179213
}
180214
}
181215

@@ -201,11 +235,19 @@ export class E2BSandboxRunner implements SandboxRunner {
201235
logger.warn({ templateId, engine, version }, "no pre-built template found, will install at runtime");
202236
}
203237

204-
// Extend sandbox lifetime to 30 minutes for long-running benchmarks
238+
// Extend sandbox lifetime to 1 hour for long-running benchmarks (EKS, etc.)
205239
// Default is 5 minutes which isn't enough for large terraform applies
206-
const sandboxTimeoutSeconds = 30 * 60; // 30 minutes
240+
// Pro tier supports up to 24 hours, Hobby up to 1 hour
241+
const sandboxTimeoutSeconds = 60 * 60; // 1 hour
242+
243+
// NOTE: CPU/memory are set at template BUILD time, not runtime
244+
// See templates/build-all.ts for resource configuration (8 CPU, 8GB RAM)
245+
246+
logger.info({
247+
templateId,
248+
timeoutSeconds: sandboxTimeoutSeconds,
249+
}, "creating E2B sandbox");
207250

208-
logger.info({ templateId, timeoutSeconds: sandboxTimeoutSeconds }, "creating E2B sandbox");
209251
const sandbox = await Sandbox.create(templateId, {
210252
apiKey: this.options.apiKey,
211253
timeoutMs: sandboxTimeoutSeconds * 1000,
@@ -380,9 +422,18 @@ export class E2BSandboxRunner implements SandboxRunner {
380422
appendLog?.(chunk);
381423
};
382424

383-
// Use long timeout for benchmarks (30 minutes) - some operations like 10k resources take a while
384-
// Set to 0 to disable timeout entirely if needed
385-
const timeoutMs = 30 * 60 * 1000; // 30 minutes
425+
// Use long timeout for benchmarks (1 hour) - EKS and large operations need this
426+
// Pro tier supports up to 24 hours, Hobby up to 1 hour
427+
const timeoutMs = 60 * 60 * 1000; // 1 hour
428+
429+
// Explicitly extend sandbox lifetime before running long commands
430+
// This ensures the sandbox won't be killed mid-operation
431+
try {
432+
await Sandbox.setTimeout(sandbox.sandboxId, timeoutMs, { apiKey: this.options.apiKey });
433+
logger.info({ sandboxId: sandbox.sandboxId, timeoutMs }, "Extended sandbox timeout before command");
434+
} catch (err) {
435+
logger.warn({ err, sandboxId: sandbox.sandboxId }, "Failed to extend sandbox timeout (continuing anyway)");
436+
}
386437

387438
const result = await sandbox.commands.run(cmdStr, {
388439
cwd,

sandbox-sidecar/src/templateRegistry.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ export interface TemplateInfo {
88
}
99

1010
// Template version - bump this when the build recipe changes
11-
const TEMPLATE_VERSION = "0.1.2";
11+
const TEMPLATE_VERSION = "0.1.3";
1212

1313
// Generate alias matching the build system
1414
function aliasFor(engine: string, version: string, tplVersion: string): string {

sandbox-sidecar/templates/build-all.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,8 +28,8 @@ async function main() {
2828

2929
await Template.build(buildTemplateObject(spec), {
3030
alias,
31-
cpuCount: 2,
32-
memoryMB: 4096,
31+
cpuCount: 8, // Max for Pro tier (was 2)
32+
memoryMB: 8192, // 8GB - Max for Pro tier (was 4GB)
3333
onBuildLogs: defaultBuildLogger(),
3434
});
3535

sandbox-sidecar/templates/build.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,8 @@ import { template } from "./test-template.ts";
55
async function main() {
66
const buildInfo = await Template.build(template, {
77
alias: "terraform-prebuilt-new", // template name / alias
8-
cpuCount: 4,
9-
memoryMB: 2048,
8+
cpuCount: 8, // Max for Pro tier
9+
memoryMB: 8192, // 8GB - Max for Pro tier
1010
onBuildLogs: defaultBuildLogger(),
1111
});
1212

sandbox-sidecar/templates/manifest.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ export interface TemplateSpec {
77
tplVersion: string;
88
}
99

10-
export const TEMPLATE_VERSION = "0.1.2"; // bump this when recipe changes
10+
export const TEMPLATE_VERSION = "0.1.3"; // bump this when recipe changes (8 CPU, 8GB RAM)
1111

1212
export const TEMPLATES: TemplateSpec[] = [
1313
{ engine: "terraform", engineVersion: "1.0.11", tplVersion: TEMPLATE_VERSION },

taco/internal/github/commands.go

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -154,6 +154,9 @@ func (e *CommandExecutor) Execute(ctx context.Context, req *ExecuteRequest) *Com
154154
logger.Warn("AWS_ACCESS_KEY_ID not set - AWS resources will fail")
155155
}
156156

157+
// Save clone timing before the switch (since result gets reassigned)
158+
cloneTime := result.Timing.Clone
159+
157160
// 8. Execute based on action
158161
switch req.Command.Action {
159162
case "plan":
@@ -164,13 +167,18 @@ func (e *CommandExecutor) Execute(ctx context.Context, req *ExecuteRequest) *Com
164167
result = e.executeApply(ctx, logger, req, runID, unitID, archive, state, tfVersion, engine, workingDir, metadata, totalStart, true)
165168
case "benchmark":
166169
result = e.executeBenchmark(ctx, logger, req, runID, unitID, archive, tfVersion, engine, workingDir, metadata, totalStart)
170+
// Restore the clone time we measured before the switch
171+
result.Timing.Clone = cloneTime
167172
default:
168173
result.Error = fmt.Sprintf("Unknown action: %s", req.Command.Action)
169174
}
170175

171-
result.Timing.Clone = time.Since(cloneStart) - result.Timing.Init - result.Timing.Execute
172-
if result.Timing.Clone < 0 {
173-
result.Timing.Clone = 0
176+
// For non-benchmark actions, recalculate Clone as total minus init/execute
177+
if req.Command.Action != "benchmark" {
178+
result.Timing.Clone = time.Since(cloneStart) - result.Timing.Init - result.Timing.Execute
179+
if result.Timing.Clone < 0 {
180+
result.Timing.Clone = 0
181+
}
174182
}
175183

176184
return result

0 commit comments

Comments
 (0)