Skip to content

Commit 8cc9cd2

Browse files
reza-mahdiani-mReza Mahdiani
andauthored
[RAPTOR-14422] Drum fast fail when config file has issues (#1688)
* fast fails when config file is faulty * fast fail when config file can not be loaded * refactoring * rebased --------- Co-authored-by: Reza Mahdiani <reza.mahdiani@datarobot.com>
1 parent 0eaab02 commit 8cc9cd2

File tree

4 files changed

+58
-11
lines changed

4 files changed

+58
-11
lines changed

custom_model_runner/datarobot_drum/drum/drum.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
This is proprietary source code of DataRobot, Inc. and its affiliates.
55
Released under the terms of DataRobot Tool and Utility Agreement.
66
"""
7+
78
import contextlib
89
import copy
910
import json
@@ -775,12 +776,16 @@ def _prepare_prediction_server_or_batch_pipeline(self, run_language):
775776
"triton_grpc_port": int(options.triton_grpc_port),
776777
"api_token": options.api_token,
777778
"allow_dr_api_access": options.allow_dr_api_access,
778-
"query_params": '"{}"'.format(options.query)
779-
if getattr(options, "query", None) is not None
780-
else "null",
781-
"content_type": '"{}"'.format(options.content_type)
782-
if getattr(options, "content_type", None) is not None
783-
else "null",
779+
"query_params": (
780+
'"{}"'.format(options.query)
781+
if getattr(options, "query", None) is not None
782+
else "null"
783+
),
784+
"content_type": (
785+
'"{}"'.format(options.content_type)
786+
if getattr(options, "content_type", None) is not None
787+
else "null"
788+
),
784789
"target_type": self.target_type.value,
785790
"user_secrets_mount_path": getattr(options, "user_secrets_mount_path", None),
786791
"user_secrets_prefix": getattr(options, "user_secrets_prefix", None),
@@ -806,9 +811,11 @@ def _prepare_prediction_server_or_batch_pipeline(self, run_language):
806811
"engine_type": "Generic",
807812
"component_type": "prediction_server",
808813
"processes": options.max_workers if getattr(options, "max_workers") else "null",
809-
"deployment_config": '"{}"'.format(options.deployment_config)
810-
if getattr(options, "deployment_config", None) is not None
811-
else "null",
814+
"deployment_config": (
815+
'"{}"'.format(options.deployment_config)
816+
if getattr(options, "deployment_config", None) is not None
817+
else "null"
818+
),
812819
}
813820
)
814821

custom_model_runner/datarobot_drum/drum/exceptions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,11 @@ class DrumSerializationError(DrumException):
4444

4545
class DrumRootComponentException(DrumException):
4646
"""Raised when there is an issue specific to root components."""
47+
48+
49+
class UnrecoverableError(DrumException):
    """Base class for fatal errors; when one is raised, the main runner should terminate immediately."""
51+
52+
53+
class UnrecoverableConfigurationError(UnrecoverableError):
    """Raised when a configuration file cannot be parsed or fails validation."""

custom_model_runner/datarobot_drum/drum/gpu_predictors/vllm_predictor.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,14 @@
44
This is proprietary source code of DataRobot, Inc. and its affiliates.
55
Released under the terms of DataRobot Tool and Utility Agreement.
66
"""
7+
78
import json
89
import os
910
import subprocess
1011
from pathlib import Path
1112

1213
from datarobot_drum.drum.enum import CustomHooks
13-
from datarobot_drum.drum.exceptions import DrumCommonException
14+
from datarobot_drum.drum.exceptions import DrumCommonException, UnrecoverableConfigurationError
1415
from datarobot_drum.drum.gpu_predictors.base import BaseOpenAiGpuPredictor
1516

1617

@@ -60,8 +61,19 @@ def download_and_serve_model(self):
6061
# For advanced users, allow them to specify arbitrary CLI options that we haven't exposed
6162
# via runtime parameters.
6263
if engine_config_file.is_file():
63-
config = json.loads(engine_config_file.read_text())
64+
try:
65+
config = json.loads(engine_config_file.read_text())
66+
except Exception as e:
67+
# Any failure to read or parse the file (I/O error, bad encoding, invalid JSON) is fatal
68+
raise UnrecoverableConfigurationError(
69+
f"Failed to read or parse critical config file engine_config.json: {e}"
70+
) from e
6471
if "args" in config:
72+
cli_args = config["args"]
73+
if not isinstance(cli_args, list):
74+
raise UnrecoverableConfigurationError(
75+
f"Invalid configuration in engine_config.json: 'args' must be a list, but found type '{type(cli_args).__name__}'."
76+
)
6577
self.logger.info(f"Loading CLI args from config file: {engine_config_file}...")
6678
cmd.extend(config["args"])
6779

custom_model_runner/datarobot_drum/drum/main.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,15 @@
44
This is proprietary source code of DataRobot, Inc. and its affiliates.
55
Released under the terms of DataRobot Tool and Utility Agreement.
66
"""
7+
8+
import logging
9+
import threading
10+
711
from flask import Flask
812
from datarobot_drum.drum.gunicorn.context import WorkerCtx
913

14+
from datarobot_drum.drum.exceptions import UnrecoverableError
15+
1016
#!/usr/bin/env python3
1117

1218
"""
@@ -142,5 +148,19 @@ def signal_handler(sig, frame):
142148
sys.exit(ExitCodes.SCHEMA_VALIDATION_ERROR.value)
143149

144150

151+
def _handle_thread_exception(args):
    """
    Global ``threading.excepthook`` handler for exceptions that escape a thread.

    An exception derived from ``UnrecoverableError`` is logged at CRITICAL level and the
    whole process is terminated with exit code 1. Any other exception is handed to the
    interpreter's default thread hook, so it is still reported instead of being dropped.

    Parameters
    ----------
    args
        The object ``threading`` passes to its excepthook; this function reads its
        ``exc_type``, ``exc_value``, ``exc_traceback`` and ``thread`` attributes.
    """
    if not issubclass(args.exc_type, UnrecoverableError):
        # Installing a custom threading.excepthook replaces the default traceback
        # printing, so delegate explicitly to keep ordinary thread failures visible.
        default_hook = getattr(threading, "__excepthook__", None)  # Python 3.10+
        if default_hook is not None:
            default_hook(args)
        elif not issubclass(args.exc_type, SystemExit):
            # Older interpreters do not expose the original hook; fall back to
            # sys.excepthook. SystemExit is skipped, as the default thread hook does.
            sys.excepthook(args.exc_type, args.exc_value, args.exc_traceback)
        return

    # The threading documentation allows ``args.thread`` to be None.
    thread_name = args.thread.name if args.thread is not None else "<unknown>"
    logging.critical(
        "CRITICAL: An unrecoverable error occurred in thread '%s': %s. "
        "Terminating process immediately.",
        thread_name,
        args.exc_value,
        exc_info=(args.exc_type, args.exc_value, args.exc_traceback),
    )
    # sys.exit() in a non-main thread would only end that thread; os._exit ends the
    # whole process without running interpreter cleanup.
    os._exit(1)
161+
162+
163+
# Install the handler process-wide at import time: threading calls this hook for any
# exception that escapes Thread.run().
threading.excepthook = _handle_thread_exception
164+
145165
if __name__ == "__main__":
146166
main()

0 commit comments

Comments
 (0)