Fix batch completion callback swallowing errors instead of re-throwing them

ericallam · ericallam · commit 8d67a4b18a7d · 2025-12-15T16:29:12.000Z
diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts
@@ -941,23 +941,15 @@ const EnvironmentSchema = z
     BATCH_TRIGGER_WORKER_REDIS_CLUSTER_MODE_ENABLED: z.string().default("0"),
 
     // BatchQueue DRR settings (Run Engine v2)
-    BATCH_QUEUE_DRR_QUANTUM: z.coerce.number().int().default(5),
-    BATCH_QUEUE_MAX_DEFICIT: z.coerce.number().int().default(50),
-    BATCH_QUEUE_CONSUMER_COUNT: z.coerce.number().int().optional(),
-    BATCH_QUEUE_CONSUMER_INTERVAL_MS: z.coerce.number().int().optional(),
+    BATCH_QUEUE_DRR_QUANTUM: z.coerce.number().int().default(25),
+    BATCH_QUEUE_MAX_DEFICIT: z.coerce.number().int().default(100),
+    BATCH_QUEUE_CONSUMER_COUNT: z.coerce.number().int().default(3),
+    BATCH_QUEUE_CONSUMER_INTERVAL_MS: z.coerce.number().int().default(50),
     // Global rate limit: max items processed per second across all consumers
     // If not set, no global rate limiting is applied
     BATCH_QUEUE_GLOBAL_RATE_LIMIT: z.coerce.number().int().positive().optional(),
-
-    // Batch rate limits and concurrency by plan type
-    // Rate limit: max items per minute for batch creation
-    BATCH_RATE_LIMIT_FREE: z.coerce.number().int().default(100), // 100 items/min for free
-    BATCH_RATE_LIMIT_PAID: z.coerce.number().int().default(10_000), // 10k items/min for paid
-    BATCH_RATE_LIMIT_ENTERPRISE: z.coerce.number().int().default(100_000), // 100k items/min for enterprise
-    // Processing concurrency: max concurrent batch items being processed
-    BATCH_CONCURRENCY_FREE: z.coerce.number().int().default(1),
-    BATCH_CONCURRENCY_PAID: z.coerce.number().int().default(10),
-    BATCH_CONCURRENCY_ENTERPRISE: z.coerce.number().int().default(50),
+    // Processing concurrency: max concurrent batch items being processed per environment
+    BATCH_CONCURRENCY_DEFAULT_CONCURRENCY: z.coerce.number().int().default(1),
 
     ADMIN_WORKER_ENABLED: z.string().default(process.env.WORKER_ENABLED ?? "true"),
     ADMIN_WORKER_CONCURRENCY_WORKERS: z.coerce.number().int().default(2),
diff --git a/apps/webapp/app/routes/api.v3.batches.ts b/apps/webapp/app/routes/api.v3.batches.ts
@@ -31,7 +31,7 @@ const { action, loader } = createActionApiRoute(
     headers: HeadersSchema,
     body: CreateBatchRequestBody,
     allowJWT: true,
-    maxContentLength: 65_536, // 64KB is plenty for the batch metadata
+    maxContentLength: 131_072, // 128KB is plenty for the batch metadata
     authorization: {
       action: "batchTrigger",
       resource: () => ({
diff --git a/apps/webapp/app/runEngine/services/createBatch.server.ts b/apps/webapp/app/runEngine/services/createBatch.server.ts
@@ -37,7 +37,7 @@ export class CreateBatchService extends WithRunEngine {
   private readonly validator: DefaultTriggerTaskValidator;
 
   constructor(protected readonly _prisma: PrismaClientOrTransaction = prisma) {
-    super({ prisma });
+    super({ prisma: _prisma });
 
     this.queueConcern = new DefaultQueueManager(this._prisma, this._engine);
     this.validator = new DefaultTriggerTaskValidator();
diff --git a/apps/webapp/app/v3/runEngine.server.ts b/apps/webapp/app/v3/runEngine.server.ts
@@ -176,7 +176,7 @@ function createRunEngine() {
       consumerIntervalMs: env.BATCH_QUEUE_CONSUMER_INTERVAL_MS,
       // Default processing concurrency when no specific limit is set
       // This is overridden per-batch based on the plan type at batch creation
-      defaultConcurrency: env.BATCH_CONCURRENCY_PAID, // Use paid plan default as baseline
+      defaultConcurrency: env.BATCH_CONCURRENCY_DEFAULT_CONCURRENCY,
       // Optional global rate limiter - limits max items/sec processed across all consumers
       globalRateLimiter: env.BATCH_QUEUE_GLOBAL_RATE_LIMIT
         ? createBatchGlobalRateLimiter(env.BATCH_QUEUE_GLOBAL_RATE_LIMIT)
diff --git a/apps/webapp/app/v3/runEngineHandlers.server.ts b/apps/webapp/app/v3/runEngineHandlers.server.ts
@@ -796,6 +796,8 @@ export function setupBatchQueueCallbacks() {
         batchId,
         error: error instanceof Error ? error.message : String(error),
       });
+      // Re-throw to preserve Redis data for retry (BatchQueue expects errors to propagate)
+      throw error;
     }
   });
 
diff --git a/internal-packages/run-engine/src/batch-queue/tests/index.test.ts b/internal-packages/run-engine/src/batch-queue/tests/index.test.ts
@@ -569,4 +569,93 @@ describe("BatchQueue", () => {
       }
     });
   });
+
+  describe("completion callback error handling", () => {
+    redisTest(
+      "should preserve Redis data when completion callback throws an error",
+      async ({ redisContainer }) => {
+        const queue = createBatchQueue(redisContainer, { startConsumers: true });
+        let callbackCallCount = 0;
+        let lastCompletionResult: CompleteBatchResult | null = null;
+
+        try {
+          queue.onProcessItem(async ({ itemIndex }) => {
+            return { success: true, runId: `run_${itemIndex}` };
+          });
+
+          queue.onBatchComplete(async (result) => {
+            callbackCallCount++;
+            lastCompletionResult = result;
+            // Only the first invocation throws, simulating a transient database failure
+            if (callbackCallCount === 1) {
+              throw new Error("Database temporarily unavailable");
+            }
+          });
+
+          await queue.initializeBatch(createInitOptions("batch1", "env1", 3));
+          await enqueueItems(queue, "batch1", "env1", createBatchItems(3));
+
+          // Wait until the completion callback has run at least once (the first run throws)
+          await vi.waitFor(
+            () => {
+              expect(callbackCallCount).toBeGreaterThanOrEqual(1);
+            },
+            { timeout: 5000 }
+          );
+
+          // The batch metadata must still be readable after the callback failure
+          const meta = await queue.getBatchMeta("batch1");
+          expect(meta).not.toBeNull();
+          expect(meta?.batchId).toBe("batch1");
+
+          // The callback must have received the full result: batch1 with all 3 runs successful
+          expect(lastCompletionResult).not.toBeNull();
+          expect(lastCompletionResult!.batchId).toBe("batch1");
+          expect(lastCompletionResult!.successfulRunCount).toBe(3);
+          expect(lastCompletionResult!.runIds).toHaveLength(3);
+        } finally {
+          await queue.close();
+        }
+      }
+    );
+
+    redisTest(
+      "should cleanup Redis data when completion callback succeeds",
+      async ({ redisContainer }) => {
+        const queue = createBatchQueue(redisContainer, { startConsumers: true });
+        let completionCalled = false;
+
+        try {
+          queue.onProcessItem(async ({ itemIndex }) => {
+            return { success: true, runId: `run_${itemIndex}` };
+          });
+
+          queue.onBatchComplete(async () => {
+            completionCalled = true;
+            // Callback succeeds - no error thrown
+          });
+
+          await queue.initializeBatch(createInitOptions("batch1", "env1", 3));
+          await enqueueItems(queue, "batch1", "env1", createBatchItems(3));
+
+          // Wait for the completion callback to fire
+          await vi.waitFor(
+            () => {
+              expect(completionCalled).toBe(true);
+            },
+            { timeout: 5000 }
+          );
+
+          // Redis data should be cleaned up after a successful callback. Poll until the
+          // batch metadata is gone instead of sleeping a fixed 100ms, which can flake.
+          await vi.waitFor(
+            async () => expect(await queue.getBatchMeta("batch1")).toBeNull(),
+            { timeout: 5000 }
+          );
+        } finally {
+          await queue.close();
+        }
+      }
+    );
+  });
 });

Original file line number	Diff line number	Diff line change
`@@ -796,6 +796,8 @@ export function setupBatchQueueCallbacks() {`
`796`	`796`	`batchId,`
`797`	`797`	`error: error instanceof Error ? error.message : String(error),`
`798`	`798`	`});`
	`799`	`+ // Re-throw to preserve Redis data for retry (BatchQueue expects errors to propagate)`
	`800`	`+ throw error;`
`799`	`801`	`}`
`800`	`802`	`});`
`801`	`803`