Skip to content

Commit b1c27c2

Browse files
committed
add failed pod handler and tests
1 parent 180bd6f commit b1c27c2

File tree

5 files changed

+796
-12
lines changed

5 files changed

+796
-12
lines changed

apps/supervisor/src/env.ts

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,10 @@ const Env = z.object({
5959
POD_CLEANER_ENABLED: BoolEnv.default(true),
6060
POD_CLEANER_INTERVAL_MS: z.coerce.number().int().default(10000),
6161
POD_CLEANER_BATCH_SIZE: z.coerce.number().int().default(500),
62+
63+
// Failed pod handler
64+
FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true),
65+
FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000),
6266
});
6367

6468
export const env = Env.parse(stdEnv);

apps/supervisor/src/index.ts

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ import { createK8sApi } from "./clients/kubernetes.js";
2222
import { collectDefaultMetrics } from "prom-client";
2323
import { register } from "./metrics.js";
2424
import { PodCleaner } from "./services/podCleaner.js";
25+
import { FailedPodHandler } from "./services/failedPodHandler.js";
2526

2627
if (env.METRICS_COLLECT_DEFAULTS) {
2728
collectDefaultMetrics({ register });
@@ -35,7 +36,9 @@ class ManagedSupervisor {
3536
private readonly logger = new SimpleStructuredLogger("managed-worker");
3637
private readonly resourceMonitor: ResourceMonitor;
3738
private readonly checkpointClient?: CheckpointClient;
39+
3840
private readonly podCleaner?: PodCleaner;
41+
private readonly failedPodHandler?: FailedPodHandler;
3942

4043
private readonly isKubernetes = isKubernetesEnvironment(env.KUBERNETES_FORCE_ENABLED);
4144
private readonly warmStartUrl = env.TRIGGER_WARM_START_URL;
@@ -53,6 +56,13 @@ class ManagedSupervisor {
5356
});
5457
}
5558

59+
if (env.FAILED_POD_HANDLER_ENABLED) {
60+
this.failedPodHandler = new FailedPodHandler({
61+
namespace: env.KUBERNETES_NAMESPACE,
62+
reconnectIntervalMs: env.FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS,
63+
});
64+
}
65+
5666
if (this.warmStartUrl) {
5767
this.logger.log("[ManagedWorker] 🔥 Warm starts enabled", {
5868
warmStartUrl: this.warmStartUrl,
@@ -293,6 +303,10 @@ class ManagedSupervisor {
293303
await this.podCleaner.start();
294304
}
295305

306+
if (this.failedPodHandler) {
307+
await this.failedPodHandler.start();
308+
}
309+
296310
if (env.TRIGGER_WORKLOAD_API_ENABLED) {
297311
this.logger.log("[ManagedWorker] Workload API enabled", {
298312
protocol: env.TRIGGER_WORKLOAD_API_PROTOCOL,
@@ -316,6 +330,10 @@ class ManagedSupervisor {
316330
if (this.podCleaner) {
317331
await this.podCleaner.stop();
318332
}
333+
334+
if (this.failedPodHandler) {
335+
await this.failedPodHandler.stop();
336+
}
319337
}
320338
}
321339

0 commit comments

Comments
 (0)