More failure condition tests

matt-aitken · matt-aitken · commit c46d56059cce · 2025-03-06T18:37:41.000Z
diff --git a/internal-packages/run-engine/src/engine/tests/attemptFailures.test.ts b/internal-packages/run-engine/src/engine/tests/attemptFailures.test.ts
@@ -278,6 +278,119 @@ describe("RunEngine attempt failures", () => {
     }
   });
 
+  containerTest(
+    "Fail (not a retriable error)",
+    { timeout: 15_000 },
+    async ({ prisma, redisOptions }) => {
+      //create a PRODUCTION environment for the run to execute in
+      const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+      const engine = new RunEngine({
+        prisma,
+        worker: {
+          redis: redisOptions,
+          workers: 1,
+          tasksPerWorker: 10,
+          pollIntervalMs: 100,
+        },
+        queue: {
+          redis: redisOptions,
+        },
+        runLock: {
+          redis: redisOptions,
+        },
+        machines: {
+          defaultMachine: "small-1x",
+          machines: {
+            "small-1x": {
+              name: "small-1x" as const,
+              cpu: 0.5,
+              memory: 0.5,
+              centsPerMs: 0.0001,
+            },
+          },
+          baseCostInCents: 0.0001,
+        },
+        tracer: trace.getTracer("test", "0.0.0"),
+      });
+
+      try {
+        const taskIdentifier = "test-task";
+
+        //create background worker; the task is configured with maxAttempts: 1
+        await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier, undefined, {
+          maxAttempts: 1,
+        });
+
+        //trigger the run
+        const run = await engine.trigger(
+          {
+            number: 1,
+            friendlyId: "run_1234",
+            environment: authenticatedEnvironment,
+            taskIdentifier,
+            payload: "{}",
+            payloadType: "application/json",
+            context: {},
+            traceContext: {},
+            traceId: "t12345",
+            spanId: "s12345",
+            masterQueue: "main",
+            queueName: "task/test-task",
+            isTest: false,
+            tags: [],
+          },
+          prisma
+        );
+
+        //dequeue the run from the master queue it was triggered on
+        const dequeued = await engine.dequeueFromMasterQueue({
+          consumerId: "test_12345",
+          masterQueue: run.masterQueue,
+          maxRunCount: 10,
+        });
+
+        //create the first (and only) attempt
+        const attemptResult = await engine.startRunAttempt({
+          runId: dequeued[0].run.id,
+          snapshotId: dequeued[0].snapshot.id,
+        });
+
+        //fail the attempt with an unretriable error (a retry is still offered in the completion)
+        const error = {
+          type: "INTERNAL_ERROR" as const,
+          code: "DISK_SPACE_EXCEEDED" as const,
+        };
+        const result = await engine.completeRunAttempt({
+          runId: dequeued[0].run.id,
+          snapshotId: attemptResult.snapshot.id,
+          completion: {
+            ok: false,
+            id: dequeued[0].run.id,
+            error,
+            retry: {
+              timestamp: Date.now(),
+              delay: 0,
+            },
+          },
+        });
+        expect(result.attemptStatus).toBe("RUN_FINISHED");
+        expect(result.snapshot.executionStatus).toBe("FINISHED");
+        expect(result.run.status).toBe("CRASHED");
+
+        //the persisted state should match: finished and crashed, not queued for a retry
+        const executionData = await engine.getRunExecutionData({ runId: run.id });
+        assertNonNullable(executionData);
+        expect(executionData.snapshot.executionStatus).toBe("FINISHED");
+        //no further attempt was started, so the attempt number stays at 1
+        expect(executionData.run.attemptNumber).toBe(1);
+        expect(executionData.run.status).toBe("CRASHED");
+      } finally {
+        await engine.quit();
+      }
+    }
+  );
+
   containerTest("OOM fail", { timeout: 15_000 }, async ({ prisma, redisOptions }) => {
     //create environment
     const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
@@ -553,4 +666,170 @@ describe("RunEngine attempt failures", () => {
       }
     }
   );
+
+  containerTest(
+    "OOM fails after retrying on larger machine",
+    { timeout: 15_000 },
+    async ({ prisma, redisOptions }) => {
+      //create a PRODUCTION environment for the run to execute in
+      const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+      const engine = new RunEngine({
+        prisma,
+        worker: {
+          redis: redisOptions,
+          workers: 1,
+          tasksPerWorker: 10,
+          pollIntervalMs: 100,
+        },
+        queue: {
+          redis: redisOptions,
+        },
+        runLock: {
+          redis: redisOptions,
+        },
+        machines: {
+          defaultMachine: "small-1x",
+          machines: {
+            "small-1x": {
+              name: "small-1x" as const,
+              cpu: 0.5,
+              memory: 0.5,
+              centsPerMs: 0.0001,
+            },
+            "small-2x": {
+              name: "small-2x" as const,
+              cpu: 1,
+              memory: 1,
+              centsPerMs: 0.0002,
+            },
+          },
+          baseCostInCents: 0.0001,
+        },
+        tracer: trace.getTracer("test", "0.0.0"),
+      });
+
+      try {
+        const taskIdentifier = "test-task";
+
+        //create background worker; the task's outOfMemory machine is set to "small-2x"
+        await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier, undefined, {
+          outOfMemory: {
+            machine: "small-2x",
+          },
+        });
+
+        //trigger the run
+        const run = await engine.trigger(
+          {
+            number: 1,
+            friendlyId: "run_1234",
+            environment: authenticatedEnvironment,
+            taskIdentifier,
+            payload: "{}",
+            payloadType: "application/json",
+            context: {},
+            traceContext: {},
+            traceId: "t12345",
+            spanId: "s12345",
+            masterQueue: "main",
+            queueName: "task/test-task",
+            isTest: false,
+            tags: [],
+          },
+          prisma
+        );
+
+        //dequeue the run from the master queue it was triggered on
+        const dequeued = await engine.dequeueFromMasterQueue({
+          consumerId: "test_12345",
+          masterQueue: run.masterQueue,
+          maxRunCount: 10,
+        });
+
+        //create first attempt
+        const attemptResult = await engine.startRunAttempt({
+          runId: dequeued[0].run.id,
+          snapshotId: dequeued[0].snapshot.id,
+        });
+
+        //OOM-style error payload; the same object is reused to fail both attempts
+        const error = {
+          type: "INTERNAL_ERROR" as const,
+          code: "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE" as const,
+          message: "Process exited with code -1 after signal SIGKILL.",
+          stackTrace: "JavaScript heap out of memory",
+        };
+
+        const result = await engine.completeRunAttempt({
+          runId: dequeued[0].run.id,
+          snapshotId: attemptResult.snapshot.id,
+          completion: {
+            ok: false,
+            id: dequeued[0].run.id,
+            error,
+            retry: {
+              timestamp: Date.now(),
+              delay: 0,
+            },
+          },
+        });
+
+        // The first OOM should queue a retry (the machine used for the retry is not asserted here)
+        expect(result.attemptStatus).toBe("RETRY_QUEUED");
+        expect(result.snapshot.executionStatus).toBe("QUEUED");
+        expect(result.run.status).toBe("RETRYING_AFTER_FAILURE");
+
+        //persisted state should be queued, still on attempt 1 until the next attempt starts
+        const executionData = await engine.getRunExecutionData({ runId: run.id });
+        assertNonNullable(executionData);
+        expect(executionData.snapshot.executionStatus).toBe("QUEUED");
+        expect(executionData.run.attemptNumber).toBe(1);
+        expect(executionData.run.status).toBe("RETRYING_AFTER_FAILURE");
+
+        //dequeue the retried run
+        const dequeued2 = await engine.dequeueFromMasterQueue({
+          consumerId: "test_12345",
+          masterQueue: run.masterQueue,
+          maxRunCount: 10,
+        });
+
+        //create second attempt
+        const attemptResult2 = await engine.startRunAttempt({
+          runId: dequeued2[0].run.id,
+          snapshotId: dequeued2[0].snapshot.id,
+        });
+        expect(attemptResult2.run.attemptNumber).toBe(2);
+
+        //fail the second attempt with the same OOM error
+        const result2 = await engine.completeRunAttempt({
+          runId: dequeued2[0].run.id,
+          snapshotId: attemptResult2.snapshot.id,
+          completion: {
+            ok: false,
+            id: dequeued2[0].run.id,
+            error,
+            retry: {
+              timestamp: Date.now(),
+              delay: 0,
+            },
+          },
+        });
+
+        // The run should fail for good after the second OOM, with no further retry queued
+        expect(result2.attemptStatus).toBe("RUN_FINISHED");
+        expect(result2.snapshot.executionStatus).toBe("FINISHED");
+        expect(result2.run.status).toBe("CRASHED");
+
+        //final persisted state should be finished and crashed on attempt 2
+        const finalExecutionData = await engine.getRunExecutionData({ runId: run.id });
+        assertNonNullable(finalExecutionData);
+        expect(finalExecutionData.snapshot.executionStatus).toBe("FINISHED");
+        expect(finalExecutionData.run.attemptNumber).toBe(2);
+        expect(finalExecutionData.run.status).toBe("CRASHED");
+      } finally {
+        await engine.quit();
+      }
+    }
+  );
 });