
Commit 6044302

Merge branch 'main' into issue/ffmpegv7
2 parents: 68955dd + fa0e2bf


177 files changed, +6742 −1413 lines


.changeset/honest-files-decide.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/sdk": patch
+---
+
+Deprecate toolTask and replace with `ai.tool(mySchemaTask)`
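
For context, a rough sketch of the replacement usage, assuming a schema task defined with the SDK (the `ai` import path and the task details are assumptions; only `ai.tool(mySchemaTask)` itself comes from the changeset):

import { schemaTask } from "@trigger.dev/sdk/v3";
import { ai } from "@trigger.dev/sdk/ai"; // assumed import path
import { z } from "zod";

// A regular schema task; its Zod schema doubles as the tool's input schema.
export const mySchemaTask = schemaTask({
  id: "summarize-text",
  schema: z.object({ text: z.string() }),
  run: async ({ text }) => ({ summary: text.slice(0, 200) }),
});

// Previously: toolTask({ ... }) — now deprecated.
// New approach: wrap the existing schema task as an AI tool.
export const summarizeTool = ai.tool(mySchemaTask);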

.changeset/nice-colts-boil.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+---
+"trigger.dev": patch
+---
+
+Improve warm start times by eagerly creating the child TaskRunProcess when a previous run has completed
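
As an illustration of the idea only (the pool and helper below are hypothetical stand-ins, not the actual CLI internals): instead of forking the child process when the next run is dequeued, it is created as soon as the previous run finishes, so a warm start can reuse an already-booted process.

// Hypothetical sketch of eager warm-start process creation.
interface TaskRunProcess {
  execute(runId: string): Promise<void>;
}

async function createTaskRunProcess(): Promise<TaskRunProcess> {
  // Stand-in for forking and booting the real child process.
  return { execute: async () => {} };
}

class WarmProcessPool {
  private pending?: Promise<TaskRunProcess>;

  // Called when a run completes: eagerly boot the next child process.
  onRunCompleted() {
    this.pending = createTaskRunProcess();
  }

  // Called when the next run is dequeued: reuse the pre-booted process if present.
  async acquire(): Promise<TaskRunProcess> {
    const proc = this.pending ? await this.pending : await createTaskRunProcess();
    this.pending = undefined;
    return proc;
  }
}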

.github/workflows/publish-worker-re2.yml

Lines changed: 9 additions & 9 deletions
@@ -18,16 +18,16 @@ permissions:
   contents: read
 
 jobs:
-  check-branch:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Fail if re2-prod-* is pushed from a non-main branch
-        if: startsWith(github.ref_name, 're2-prod-') && github.base_ref != 'main'
-        run: |
-          echo "🚫 re2-prod-* tags can only be pushed from the main branch."
-          exit 1
+  # check-branch:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - name: Fail if re2-prod-* is pushed from a non-main branch
+  #       if: startsWith(github.ref_name, 're2-prod-') && github.base_ref != 'main'
+  #       run: |
+  #         echo "🚫 re2-prod-* tags can only be pushed from the main branch."
+  #         exit 1
   build:
-    needs: check-branch
+    # needs: check-branch
     strategy:
       matrix:
         package: [supervisor]

.github/workflows/publish.yml

Lines changed: 2 additions & 2 deletions
@@ -56,14 +56,14 @@ jobs:
     secrets: inherit
 
   publish-webapp:
-    needs: [typecheck, units]
+    needs: [typecheck]
    uses: ./.github/workflows/publish-webapp.yml
     secrets: inherit
     with:
       image_tag: ${{ inputs.image_tag }}
 
   publish-worker:
-    needs: [typecheck, units]
+    needs: [typecheck]
     uses: ./.github/workflows/publish-worker.yml
     secrets: inherit
     with:

.vscode/launch.json

Lines changed: 9 additions & 1 deletion
@@ -138,7 +138,7 @@
       "type": "node-terminal",
       "request": "launch",
       "name": "Debug RunEngine tests",
-      "command": "pnpm run test ./src/engine/tests/releaseConcurrencyQueue.test.ts -t 'Should manage token bucket and queue correctly'",
+      "command": "pnpm run test ./src/engine/tests/releaseConcurrencyTokenBucketQueue.test.ts -t 'Should retrieve metrics for all queues via getQueueMetrics'",
       "cwd": "${workspaceFolder}/internal-packages/run-engine",
       "sourceMaps": true
     },
@@ -149,6 +149,14 @@
       "command": "pnpm run test ./src/run-queue/index.test.ts",
       "cwd": "${workspaceFolder}/internal-packages/run-engine",
       "sourceMaps": true
+    },
+    {
+      "type": "node-terminal",
+      "request": "launch",
+      "name": "Debug d3-demo",
+      "command": "pnpm exec trigger dev",
+      "cwd": "${workspaceFolder}/references/d3-demo",
+      "sourceMaps": true
     }
   ]
 }

apps/supervisor/package.json

Lines changed: 4 additions & 2 deletions
@@ -8,7 +8,8 @@
     "build": "tsc",
     "dev": "tsx --require dotenv/config --watch src/index.ts || (echo '!! Remember to run: nvm use'; exit 1)",
     "start": "node dist/index.js",
-    "test:watch": "vitest",
+    "test:run": "vitest --no-file-parallelism --run",
+    "test:watch": "vitest --no-file-parallelism",
     "typecheck": "tsc --noEmit"
   },
   "dependencies": {
@@ -24,6 +25,7 @@
   },
   "devDependencies": {
     "@types/dockerode": "^3.3.33",
-    "docker-api-ts": "^0.2.2"
+    "docker-api-ts": "^0.2.2",
+    "vitest": "^1.4.0"
   }
 }

apps/supervisor/src/env.ts

Lines changed: 3 additions & 0 deletions
@@ -31,10 +31,12 @@ const Env = z.object({
   // Dequeue settings (provider mode)
   TRIGGER_DEQUEUE_ENABLED: BoolEnv.default("true"),
   TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(1000),
+  TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(10),
 
   // Optional services
   TRIGGER_WARM_START_URL: z.string().optional(),
   TRIGGER_CHECKPOINT_URL: z.string().optional(),
+  TRIGGER_METADATA_URL: z.string().optional(),
 
   // Used by the workload manager, e.g docker/k8s
   DOCKER_NETWORK: z.string().default("host"),
@@ -49,6 +51,7 @@ const Env = z.object({
   // Kubernetes specific settings
   KUBERNETES_FORCE_ENABLED: BoolEnv.default(false),
   KUBERNETES_NAMESPACE: z.string().default("default"),
+  KUBERNETES_WORKER_NODETYPE_LABEL: z.string().default("v4-worker"),
   EPHEMERAL_STORAGE_SIZE_LIMIT: z.string().default("10Gi"),
   EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"),
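
For illustration, a minimal sketch of how the new dequeue setting is parsed (the schema line mirrors the diff; the standalone parsing code around it is an assumption):

import { z } from "zod";

// Mirrors the new entry in the supervisor's Env schema.
const Env = z.object({
  TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(10),
});

// Strings from the environment are coerced to integers; unset values fall back to 10.
const env = Env.parse(process.env);
console.log(env.TRIGGER_DEQUEUE_MAX_RUN_COUNT); // 10, or e.g. 25 when TRIGGER_DEQUEUE_MAX_RUN_COUNT=25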

apps/supervisor/src/index.ts

Lines changed: 13 additions & 6 deletions
@@ -61,6 +61,7 @@ class ManagedSupervisor {
       workloadApiDomain: env.TRIGGER_WORKLOAD_API_DOMAIN,
       workloadApiPort: env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL,
       warmStartUrl: this.warmStartUrl,
+      metadataUrl: env.TRIGGER_METADATA_URL,
       imagePullSecrets: env.KUBERNETES_IMAGE_PULL_SECRETS?.split(","),
       heartbeatIntervalSeconds: env.RUNNER_HEARTBEAT_INTERVAL_SECONDS,
       snapshotPollIntervalSeconds: env.RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS,
@@ -98,7 +99,10 @@ class ManagedSupervisor {
       this.logger.warn("[ManagedWorker] Failed pod handler disabled");
     }
 
-      this.resourceMonitor = new KubernetesResourceMonitor(createK8sApi(), "");
+      this.resourceMonitor = new KubernetesResourceMonitor(
+        createK8sApi(),
+        env.TRIGGER_WORKER_INSTANCE_NAME
+      );
       this.workloadManager = new KubernetesWorkloadManager(workloadManagerOptions);
     } else {
       this.resourceMonitor = new DockerResourceMonitor(new Docker());
@@ -112,10 +116,11 @@ class ManagedSupervisor {
       managedWorkerSecret: env.MANAGED_WORKER_SECRET,
       dequeueIntervalMs: env.TRIGGER_DEQUEUE_INTERVAL_MS,
       queueConsumerEnabled: env.TRIGGER_DEQUEUE_ENABLED,
+      maxRunCount: env.TRIGGER_DEQUEUE_MAX_RUN_COUNT,
       runNotificationsEnabled: env.TRIGGER_WORKLOAD_API_ENABLED,
       preDequeue: async () => {
         if (this.isKubernetes) {
-          // TODO: Test k8s resource monitor and remove this
+          // Not used in k8s for now
           return {};
         }
@@ -219,6 +224,7 @@ class ManagedSupervisor {
 
     try {
       await this.workloadManager.create({
+        dequeuedAt: message.dequeuedAt,
         envId: message.environment.id,
         envType: message.environment.type,
         image: message.image,
@@ -233,10 +239,11 @@ class ManagedSupervisor {
         snapshotFriendlyId: message.snapshot.friendlyId,
       });
 
-      this.resourceMonitor.blockResources({
-        cpu: message.run.machine.cpu,
-        memory: message.run.machine.memory,
-      });
+      // Disabled for now
+      // this.resourceMonitor.blockResources({
+      //   cpu: message.run.machine.cpu,
+      //   memory: message.run.machine.memory,
+      // });
     } catch (error) {
       this.logger.error("[ManagedWorker] Failed to create workload", { error });
     }

apps/supervisor/src/services/failedPodHandler.test.ts

Lines changed: 105 additions & 2 deletions
@@ -314,6 +314,102 @@ describe("FailedPodHandler Integration Tests", () => {
       await handler.stop();
     }
   }, 60000);
+
+  it("should handle graceful shutdown pods differently", async () => {
+    const handler = new FailedPodHandler({ namespace, k8s, register });
+
+    try {
+      // Create first batch of pods before starting handler
+      const firstBatchPodNames = await createTestPods({
+        k8sApi: k8s,
+        namespace,
+        count: 2,
+        exitCode: FailedPodHandler.GRACEFUL_SHUTDOWN_EXIT_CODE,
+      });
+
+      // Wait for pods to reach Failed state
+      await waitForPodsPhase({
+        k8sApi: k8s,
+        namespace,
+        podNames: firstBatchPodNames,
+        phase: "Failed",
+      });
+
+      // Start the handler
+      await handler.start();
+
+      // Wait for first batch to be deleted
+      await waitForPodsDeletion({
+        k8sApi: k8s,
+        namespace,
+        podNames: firstBatchPodNames,
+      });
+
+      // Create second batch of pods after handler is running
+      const secondBatchPodNames = await createTestPods({
+        k8sApi: k8s,
+        namespace,
+        count: 3,
+        exitCode: FailedPodHandler.GRACEFUL_SHUTDOWN_EXIT_CODE,
+      });
+
+      // Wait for second batch to be deleted
+      await waitForPodsDeletion({
+        k8sApi: k8s,
+        namespace,
+        podNames: secondBatchPodNames,
+      });
+
+      // Verify metrics
+      const metrics = handler.getMetrics();
+
+      // Check informer events were recorded for both batches
+      const informerEvents = await metrics.informerEventsTotal.get();
+      expect(informerEvents.values).toContainEqual(
+        expect.objectContaining({
+          labels: expect.objectContaining({
+            namespace,
+            verb: "add",
+          }),
+          value: 5, // 2 from first batch + 3 from second batch
+        })
+      );
+
+      // Check pods were processed as graceful shutdowns
+      const processedPods = await metrics.processedPodsTotal.get();
+
+      // Should not be marked as Failed
+      const failedPods = processedPods.values.find(
+        (v) => v.labels.namespace === namespace && v.labels.status === "Failed"
+      );
+      expect(failedPods).toBeUndefined();
+
+      // Should be marked as GracefulShutdown
+      const gracefulShutdowns = processedPods.values.find(
+        (v) => v.labels.namespace === namespace && v.labels.status === "GracefulShutdown"
+      );
+      expect(gracefulShutdowns).toBeDefined();
+      expect(gracefulShutdowns?.value).toBe(5); // Total from both batches
+
+      // Check pods were still deleted
+      const deletedPods = await metrics.deletedPodsTotal.get();
+      expect(deletedPods.values).toContainEqual(
+        expect.objectContaining({
+          labels: expect.objectContaining({
+            namespace,
+            status: "Failed",
+          }),
+          value: 5, // Total from both batches
+        })
+      );
+
+      // Check no deletion errors were recorded
+      const deletionErrors = await metrics.deletionErrorsTotal.get();
+      expect(deletionErrors.values).toHaveLength(0);
+    } finally {
+      await handler.stop();
+    }
+  }, 30000);
 });
 
 async function createTestPods({
@@ -325,6 +421,7 @@ async function createTestPods({
   namePrefix = "test-pod",
   command = ["/bin/sh", "-c", shouldFail ? "exit 1" : "exit 0"],
   randomizeName = true,
+  exitCode,
 }: {
   k8sApi: K8sApi;
   namespace: string;
@@ -334,9 +431,15 @@ async function createTestPods({
   namePrefix?: string;
   command?: string[];
   randomizeName?: boolean;
+  exitCode?: number;
 }) {
   const createdPods: string[] = [];
 
+  // If exitCode is specified, override the command
+  if (exitCode !== undefined) {
+    command = ["/bin/sh", "-c", `exit ${exitCode}`];
+  }
+
   for (let i = 0; i < count; i++) {
     const podName = randomizeName
       ? `${namePrefix}-${i}-${Math.random().toString(36).substring(2, 15)}`
@@ -352,7 +455,7 @@ async function createTestPods({
           restartPolicy: "Never",
           containers: [
             {
-              name: "test",
+              name: "run-controller", // Changed to match the name we check in failedPodHandler
               image: "busybox:1.37.0",
               command,
             },
@@ -470,7 +573,7 @@ async function deleteAllPodsInNamespace({
   const podNames = pods.items.map((p) => p.metadata?.name ?? "");
 
   // Delete all pods
-  await k8sApi.core.deleteCollectionNamespacedPod({ namespace });
+  await k8sApi.core.deleteCollectionNamespacedPod({ namespace, gracePeriodSeconds: 0 });
 
   // Wait for all pods to be deleted
   await waitForPodsDeletion({ k8sApi, namespace, podNames });

apps/supervisor/src/services/failedPodHandler.ts

Lines changed: 18 additions & 1 deletion
@@ -6,7 +6,7 @@ import { Counter, Registry, Histogram } from "prom-client";
 import { register } from "../metrics.js";
 import { setTimeout } from "timers/promises";
 
-type PodStatus = "Pending" | "Running" | "Succeeded" | "Failed" | "Unknown";
+type PodStatus = "Pending" | "Running" | "Succeeded" | "Failed" | "Unknown" | "GracefulShutdown";
 
 export type FailedPodHandlerOptions = {
   namespace: string;
@@ -34,6 +34,8 @@ export class FailedPodHandler {
   private readonly processingDurationSeconds: Histogram<string>;
   private readonly informerEventsTotal: Counter;
 
+  static readonly GRACEFUL_SHUTDOWN_EXIT_CODE = 200;
+
   constructor(opts: FailedPodHandlerOptions) {
     this.id = Math.random().toString(36).substring(2, 15);
     this.logger = new SimpleStructuredLogger("failed-pod-handler", LogLevel.debug, {
@@ -206,6 +208,21 @@ export class FailedPodHandler {
 
   private async processFailedPod(pod: V1Pod) {
     this.logger.info("pod-failed: processing pod", this.podSummary(pod));
+
+    const mainContainer = pod.status?.containerStatuses?.find((c) => c.name === "run-controller");
+
+    // If it's our special "graceful shutdown" exit code, don't process it further, just delete it
+    if (
+      mainContainer?.state?.terminated?.exitCode === FailedPodHandler.GRACEFUL_SHUTDOWN_EXIT_CODE
+    ) {
+      this.logger.debug("pod-failed: graceful shutdown detected", this.podSummary(pod));
+      this.processedPodsTotal.inc({
+        namespace: this.namespace,
+        status: "GracefulShutdown",
+      });
+      return;
+    }
+
     this.processedPodsTotal.inc({
       namespace: this.namespace,
       status: this.podStatus(pod),
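
To show the contract this implies for the run container, a hedged runner-side sketch (the exit code and the "run-controller" container name come from the diff; the shutdown handler itself is illustrative, not the actual runner code):

// Illustrative only: a "run-controller" container that wants its pod to be
// treated as a graceful shutdown exits with this code.
const GRACEFUL_SHUTDOWN_EXIT_CODE = 200; // matches FailedPodHandler.GRACEFUL_SHUTDOWN_EXIT_CODE

process.on("SIGTERM", () => {
  // ...finish or persist in-flight work here...
  // The pod still ends up Failed in Kubernetes (non-zero exit), but the
  // FailedPodHandler recognizes this exit code and records a GracefulShutdown
  // metric instead of counting the pod as Failed; the pod is still cleaned up.
  process.exit(GRACEFUL_SHUTDOWN_EXIT_CODE);
});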
