Skip to content

Commit 87bd15a

Browse files
committed
Moved isOOMRunError and added SIGABRT condition
1 parent 8b66469 commit 87bd15a

File tree

3 files changed

+71
-41
lines changed

3 files changed

+71
-41
lines changed

apps/webapp/app/v3/services/alerts/deliverAlert.server.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import {
1313
RunFailedWebhook,
1414
DeploymentFailedWebhook,
1515
DeploymentSuccessWebhook,
16+
isOOMRunError,
1617
} from "@trigger.dev/core/v3";
1718
import assertNever from "assert-never";
1819
import { subtle } from "crypto";
@@ -375,7 +376,7 @@ export class DeliverAlertService extends BaseService {
375376
idempotencyKey: alert.taskRun.idempotencyKey ?? undefined,
376377
tags: alert.taskRun.runTags,
377378
error,
378-
isOutOfMemoryError: isOOMError(error),
379+
isOutOfMemoryError: isOOMRunError(error),
379380
machine: alert.taskRun.machinePreset ?? "Unknown",
380381
dashboardUrl: `${env.APP_ORIGIN}${v3RunPath(
381382
alert.project.organization,

apps/webapp/app/v3/services/completeAttempt.server.ts

Lines changed: 2 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import {
1111
TaskRunSuccessfulExecutionResult,
1212
flattenAttributes,
1313
isManualOutOfMemoryError,
14+
isOOMRunError,
1415
sanitizeError,
1516
shouldRetryError,
1617
taskRunErrorEnhancer,
@@ -255,7 +256,7 @@ export class CompleteAttemptService extends BaseService {
255256

256257
let retriableError = shouldRetryError(taskRunErrorEnhancer(completion.error));
257258
let isOOMRetry = false;
258-
let isOOMAttempt = isOOMError(completion.error);
259+
let isOOMAttempt = isOOMRunError(completion.error);
259260
let isOnMaxOOMMachine = false;
260261
let oomMachine: MachinePresetName | undefined;
261262

@@ -738,45 +739,6 @@ async function findAttempt(prismaClient: PrismaClientOrTransaction, friendlyId:
738739
});
739740
}
740741

741-
export function isOOMError(error: TaskRunError) {
742-
if (error.type === "INTERNAL_ERROR") {
743-
if (
744-
error.code === "TASK_PROCESS_OOM_KILLED" ||
745-
error.code === "TASK_PROCESS_MAYBE_OOM_KILLED"
746-
) {
747-
return true;
748-
}
749-
750-
// For the purposes of retrying on a larger machine, we're going to treat this as an OOM error.
751-
// This is what they look like if we're executing using k8s. They then get corrected later, but it's too late.
752-
// {"code": "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE", "type": "INTERNAL_ERROR", "message": "Process exited with code -1 after signal SIGKILL."}
753-
if (
754-
error.code === "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE" &&
755-
error.message &&
756-
error.message.includes("SIGKILL") &&
757-
error.message.includes("-1")
758-
) {
759-
return true;
760-
}
761-
}
762-
763-
if (error.type === "BUILT_IN_ERROR") {
764-
// ffmpeg also does weird stuff
765-
// { "name": "Error", "type": "BUILT_IN_ERROR", "message": "ffmpeg was killed with signal SIGKILL" }
766-
if (error.message && error.message.includes("ffmpeg was killed with signal SIGKILL")) {
767-
return true;
768-
}
769-
}
770-
771-
// Special `OutOfMemoryError` for doing a manual OOM kill.
772-
// Useful if a native library does an OOM but doesn't actually crash the run and you want to manually trigger the OOM kill.
773-
if (isManualOutOfMemoryError(error)) {
774-
return true;
775-
}
776-
777-
return false;
778-
}
779-
780742
function exitRun(runId: string) {
781743
socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", {
782744
version: "v1",

packages/core/src/v3/errors.ts

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,58 @@ export function isManualOutOfMemoryError(error: TaskRunError) {
7676
return false;
7777
}
7878

79+
export function isOOMRunError(error: TaskRunError) {
80+
if (error.type === "INTERNAL_ERROR") {
81+
if (
82+
error.code === "TASK_PROCESS_OOM_KILLED" ||
83+
error.code === "TASK_PROCESS_MAYBE_OOM_KILLED"
84+
) {
85+
return true;
86+
}
87+
88+
// For the purposes of retrying on a larger machine, we're going to treat this as an OOM error.
89+
// This is what they look like if we're executing using k8s. They then get corrected later, but it's too late.
90+
// {"code": "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE", "type": "INTERNAL_ERROR", "message": "Process exited with code -1 after signal SIGKILL."}
91+
if (
92+
error.code === "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE" &&
93+
error.message &&
94+
error.message.includes("-1")
95+
) {
96+
if (error.message.includes("SIGKILL")) {
97+
return true;
98+
}
99+
100+
if (error.message.includes("SIGABRT") && error.stackTrace) {
101+
const oomIndicators = [
102+
"JavaScript heap out of memory",
103+
"Reached heap limit",
104+
"FATAL ERROR: Reached heap limit Allocation failed",
105+
];
106+
107+
if (oomIndicators.some((indicator) => error.stackTrace!.includes(indicator))) {
108+
return true;
109+
}
110+
}
111+
}
112+
}
113+
114+
if (error.type === "BUILT_IN_ERROR") {
115+
// ffmpeg also does weird stuff
116+
// { "name": "Error", "type": "BUILT_IN_ERROR", "message": "ffmpeg was killed with signal SIGKILL" }
117+
if (error.message && error.message.includes("ffmpeg was killed with signal SIGKILL")) {
118+
return true;
119+
}
120+
}
121+
122+
// Special `OutOfMemoryError` for doing a manual OOM kill.
123+
// Useful if a native library does an OOM but doesn't actually crash the run and you want to manually trigger the OOM kill.
124+
if (isManualOutOfMemoryError(error)) {
125+
return true;
126+
}
127+
128+
return false;
129+
}
130+
79131
export class TaskPayloadParsedError extends Error {
80132
public readonly cause: unknown;
81133

@@ -562,6 +614,8 @@ const findSignalInMessage = (message?: string, truncateLength = 100) => {
562614
return "SIGSEGV";
563615
} else if (trunc.includes("SIGKILL")) {
564616
return "SIGKILL";
617+
} else if (trunc.includes("SIGABRT")) {
618+
return "SIGABRT";
565619
} else {
566620
return;
567621
}
@@ -587,6 +641,10 @@ export function taskRunErrorEnhancer(error: TaskRunError): EnhanceError<TaskRunE
587641
return {
588642
...getPrettyTaskRunError("TASK_PROCESS_MAYBE_OOM_KILLED"),
589643
};
644+
case "SIGABRT":
645+
return {
646+
...getPrettyTaskRunError("TASK_PROCESS_MAYBE_OOM_KILLED"),
647+
};
590648
default:
591649
return {
592650
...getPrettyTaskRunError("TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE"),
@@ -636,6 +694,10 @@ export function taskRunErrorEnhancer(error: TaskRunError): EnhanceError<TaskRunE
636694
return {
637695
...getPrettyTaskRunError("TASK_PROCESS_MAYBE_OOM_KILLED"),
638696
};
697+
case "SIGABRT":
698+
return {
699+
...getPrettyTaskRunError("TASK_PROCESS_MAYBE_OOM_KILLED"),
700+
};
639701
default: {
640702
return {
641703
...getPrettyTaskRunError("TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE"),
@@ -689,6 +751,11 @@ export function exceptionEventEnhancer(
689751
...exception,
690752
...getPrettyExceptionEvent("TASK_PROCESS_MAYBE_OOM_KILLED"),
691753
};
754+
case "SIGABRT":
755+
return {
756+
...exception,
757+
...getPrettyExceptionEvent("TASK_PROCESS_MAYBE_OOM_KILLED"),
758+
};
692759
default:
693760
return exception;
694761
}

0 commit comments

Comments
 (0)