Upgrade the runners to 8 GB / 8 CPU after EKS failure

breardon2011 · breardon2011 · commit ece943127664 · 2025-12-03T11:36:50.000-08:00
diff --git a/sandbox-sidecar/src/runners/e2bRunner.ts b/sandbox-sidecar/src/runners/e2bRunner.ts
@@ -94,7 +94,15 @@ export class E2BSandboxRunner implements SandboxRunner {
   private async runApply(job: SandboxRunRecord, appendLog?: (chunk: string) => void): Promise<RunnerOutput> {
     const requestedVersion = job.payload.terraformVersion || "1.5.7";
       const requestedEngine = job.payload.engine || "terraform";
+      const startTime = Date.now();
       const { sandbox, needsInstall } = await this.createSandbox(requestedVersion, requestedEngine);
+      
+      logger.info({ 
+        sandboxId: sandbox.sandboxId, 
+        workingDir: job.payload.workingDirectory,
+        isDestroy: job.payload.isDestroy,
+      }, "Starting apply operation");
+      
       try {
         // Install IaC tool if using fallback template
         if (needsInstall) {
@@ -110,6 +118,8 @@ export class E2BSandboxRunner implements SandboxRunner {
 
       // Run terraform init (with AWS creds if configured for benchmark)
       const metadata = job.payload.metadata;
+      
+      logger.info({ sandboxId: sandbox.sandboxId, elapsed: Date.now() - startTime }, "Starting terraform init");
       await this.runTerraformCommand(
         sandbox,
         workDir,
@@ -118,9 +128,11 @@ export class E2BSandboxRunner implements SandboxRunner {
         streamLog,
         metadata,
       );
+      logger.info({ sandboxId: sandbox.sandboxId, elapsed: Date.now() - startTime }, "Terraform init completed");
 
       // Run terraform apply/destroy
       const applyCommand = job.payload.isDestroy ? "destroy" : "apply";
+      logger.info({ sandboxId: sandbox.sandboxId, command: applyCommand, elapsed: Date.now() - startTime }, "Starting terraform apply/destroy");
       const applyResult = await this.runTerraformCommand(
         sandbox,
         workDir,
@@ -129,6 +141,7 @@ export class E2BSandboxRunner implements SandboxRunner {
         streamLog,
         metadata,
       );
+      logger.info({ sandboxId: sandbox.sandboxId, command: applyCommand, elapsed: Date.now() - startTime }, "Terraform apply/destroy completed");
 
       // Log the apply output for debugging
       logger.info({ 
@@ -173,9 +186,30 @@ export class E2BSandboxRunner implements SandboxRunner {
         state: stateBase64,
       };
 
+      logger.info({ sandboxId: sandbox.sandboxId, elapsed: Date.now() - startTime }, "Apply operation completed successfully");
       return { logs: logs.join(""), result };
+    } catch (err) {
+      const elapsed = Date.now() - startTime;
+      const errorMessage = err instanceof Error ? err.message : String(err);
+      
+      // Log detailed error info for debugging sandbox termination issues
+      logger.error({
+        sandboxId: sandbox.sandboxId,
+        elapsed,
+        elapsedSeconds: Math.round(elapsed / 1000),
+        errorMessage,
+        errorType: err instanceof Error ? err.constructor.name : typeof err,
+        workingDir: job.payload.workingDirectory,
+      }, "Apply operation failed - sandbox may have been terminated");
+      
+      throw err;
     } finally {
-      await sandbox.kill();
+      try {
+        await sandbox.kill();
+      } catch (killErr) {
+        // Sandbox may already be dead, that's fine
+        logger.debug({ killErr }, "Failed to kill sandbox (may already be terminated)");
+      }
     }
   }
 
@@ -201,11 +235,19 @@ export class E2BSandboxRunner implements SandboxRunner {
       logger.warn({ templateId, engine, version }, "no pre-built template found, will install at runtime");
     }
     
-    // Extend sandbox lifetime to 30 minutes for long-running benchmarks
+    // Extend sandbox lifetime to 1 hour for long-running benchmarks (EKS, etc.)
     // Default is 5 minutes which isn't enough for large terraform applies
-    const sandboxTimeoutSeconds = 30 * 60; // 30 minutes
+    // Pro tier supports up to 24 hours, Hobby up to 1 hour
+    const sandboxTimeoutSeconds = 60 * 60; // 1 hour
+    
+    // NOTE: CPU/memory are set at template BUILD time, not runtime
+    // See templates/build-all.ts for resource configuration (8 CPU, 8GB RAM)
+    
+    logger.info({ 
+      templateId, 
+      timeoutSeconds: sandboxTimeoutSeconds,
+    }, "creating E2B sandbox");
     
-    logger.info({ templateId, timeoutSeconds: sandboxTimeoutSeconds }, "creating E2B sandbox");
     const sandbox = await Sandbox.create(templateId, {
       apiKey: this.options.apiKey,
       timeoutMs: sandboxTimeoutSeconds * 1000,
@@ -380,9 +422,18 @@ export class E2BSandboxRunner implements SandboxRunner {
       appendLog?.(chunk);
     };
 
-    // Use long timeout for benchmarks (30 minutes) - some operations like 10k resources take a while
-    // Set to 0 to disable timeout entirely if needed
-    const timeoutMs = 30 * 60 * 1000; // 30 minutes
+    // Use long timeout for benchmarks (1 hour) - EKS and large operations need this
+    // Pro tier supports up to 24 hours, Hobby up to 1 hour
+    const timeoutMs = 60 * 60 * 1000; // 1 hour
+    
+    // Explicitly extend sandbox lifetime before running long commands
+    // This ensures the sandbox won't be killed mid-operation
+    try {
+      await Sandbox.setTimeout(sandbox.sandboxId, timeoutMs, { apiKey: this.options.apiKey });
+      logger.info({ sandboxId: sandbox.sandboxId, timeoutMs }, "Extended sandbox timeout before command");
+    } catch (err) {
+      logger.warn({ err, sandboxId: sandbox.sandboxId }, "Failed to extend sandbox timeout (continuing anyway)");
+    }
     
     const result = await sandbox.commands.run(cmdStr, {
       cwd,
diff --git a/sandbox-sidecar/src/templateRegistry.ts b/sandbox-sidecar/src/templateRegistry.ts
@@ -8,7 +8,7 @@ export interface TemplateInfo {
 }
 
 // Template version - bump this when the build recipe changes
-const TEMPLATE_VERSION = "0.1.2";
+const TEMPLATE_VERSION = "0.1.3";
 
 // Generate alias matching the build system
 function aliasFor(engine: string, version: string, tplVersion: string): string {
diff --git a/sandbox-sidecar/templates/build-all.ts b/sandbox-sidecar/templates/build-all.ts
@@ -28,8 +28,8 @@ async function main() {
 
     await Template.build(buildTemplateObject(spec), {
       alias,
-      cpuCount: 2,
-      memoryMB: 4096,
+      cpuCount: 8,      // Max for Pro tier (was 2)
+      memoryMB: 8192,   // 8GB - Max for Pro tier (was 4GB)
       onBuildLogs: defaultBuildLogger(),
     });
 
diff --git a/sandbox-sidecar/templates/build.ts b/sandbox-sidecar/templates/build.ts
@@ -5,8 +5,8 @@ import { template } from "./test-template.ts";
 async function main() {
   const buildInfo = await Template.build(template, {
     alias: "terraform-prebuilt-new",           // template name / alias
-    cpuCount: 4,
-    memoryMB: 2048,
+    cpuCount: 8,      // Max for Pro tier
+    memoryMB: 8192,   // 8GB - Max for Pro tier
     onBuildLogs: defaultBuildLogger(),
   });
 
diff --git a/sandbox-sidecar/templates/manifest.ts b/sandbox-sidecar/templates/manifest.ts
@@ -7,7 +7,7 @@ export interface TemplateSpec {
   tplVersion: string;      
 }
 
-export const TEMPLATE_VERSION = "0.1.2";  // bump this when recipe changes
+export const TEMPLATE_VERSION = "0.1.3";  // bump this when recipe changes (8 CPU, 8GB RAM)
 
 export const TEMPLATES: TemplateSpec[] = [
   { engine: "terraform", engineVersion: "1.0.11", tplVersion: TEMPLATE_VERSION },
diff --git a/taco/internal/github/commands.go b/taco/internal/github/commands.go
@@ -154,6 +154,9 @@ func (e *CommandExecutor) Execute(ctx context.Context, req *ExecuteRequest) *Com
 		logger.Warn("AWS_ACCESS_KEY_ID not set - AWS resources will fail")
 	}
 
+	// Save clone timing before the switch (since result gets reassigned)
+	cloneTime := result.Timing.Clone
+
 	// 8. Execute based on action
 	switch req.Command.Action {
 	case "plan":
@@ -164,13 +167,18 @@ func (e *CommandExecutor) Execute(ctx context.Context, req *ExecuteRequest) *Com
 		result = e.executeApply(ctx, logger, req, runID, unitID, archive, state, tfVersion, engine, workingDir, metadata, totalStart, true)
 	case "benchmark":
 		result = e.executeBenchmark(ctx, logger, req, runID, unitID, archive, tfVersion, engine, workingDir, metadata, totalStart)
+		// Restore the clone time we measured before the switch
+		result.Timing.Clone = cloneTime
 	default:
 		result.Error = fmt.Sprintf("Unknown action: %s", req.Command.Action)
 	}
 
-	result.Timing.Clone = time.Since(cloneStart) - result.Timing.Init - result.Timing.Execute
-	if result.Timing.Clone < 0 {
-		result.Timing.Clone = 0
+	// For non-benchmark actions, recalculate Clone as total minus init/execute
+	if req.Command.Action != "benchmark" {
+		result.Timing.Clone = time.Since(cloneStart) - result.Timing.Init - result.Timing.Execute
+		if result.Timing.Clone < 0 {
+			result.Timing.Clone = 0
+		}
 	}
 
 	return result

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@ export interface TemplateInfo {`
`8`	`8`	`}`
`9`	`9`
`10`	`10`	`// Template version - bump this when the build recipe changes`
`11`		`-const TEMPLATE_VERSION = "0.1.2";`
	`11`	`+const TEMPLATE_VERSION = "0.1.3";`
`12`	`12`
`13`	`13`	`// Generate alias matching the build system`
`14`	`14`	`function aliasFor(engine: string, version: string, tplVersion: string): string {`
Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ export interface TemplateSpec {`
`7`	`7`	`tplVersion: string;`
`8`	`8`	`}`
`9`	`9`
`10`		`-export const TEMPLATE_VERSION = "0.1.2"; // bump this when recipe changes`
	`10`	`+export const TEMPLATE_VERSION = "0.1.3"; // bump this when recipe changes (8 CPU, 8GB RAM)`
`11`	`11`
`12`	`12`	`export const TEMPLATES: TemplateSpec[] = [`
`13`	`13`	`{ engine: "terraform", engineVersion: "1.0.11", tplVersion: TEMPLATE_VERSION },`