@@ -685,10 +685,9 @@ def get_metric_result(
685685 )
686686 except Exception as e : # pylint: disable=broad-exception-caught
687687 logger .error (
688- "Error processing metric %s for case %s: %s " ,
688+ "Error processing metric %s for case %s. " ,
689689 metric_name ,
690690 eval_case .eval_case_id ,
691- e ,
692691 exc_info = True ,
693692 )
694693 return types .EvalCaseMetricResult (
@@ -1099,7 +1098,147 @@ def aggregate(
10991098 )
11001099
11011100
class CustomCodeExecutionMetricHandler(MetricHandler):
    """Metric handler for custom code execution metrics.

    For each (eval case, response) pair the handler packages the prompt, the
    candidate response and the optional reference into a
    `types.EvaluationInstance` and passes it, together with the metric, to the
    owning module's `_evaluate_instances` call. Client errors with code 429
    are retried with exponential backoff for up to `_MAX_RETRIES` attempts.
    """

    def __init__(self, module: "evals.Evals", metric: types.Metric):
        """Initializes the handler.

        Args:
            module: The evals module whose `_evaluate_instances` is called.
            metric: The metric to evaluate; its `remote_custom_function` must
                be set.

        Raises:
            ValueError: If `metric.remote_custom_function` is not set.
        """
        super().__init__(module=module, metric=metric)

        if not self.metric.remote_custom_function:
            raise ValueError(
                f"CustomCodeExecutionMetricHandler for '{self.metric.name} ' needs "
                " Metric.remote_custom_function to be set."
            )

    def _build_request_payload(
        self, eval_case: types.EvalCase, response_index: int
    ) -> dict[str, Any]:
        """Builds the request parameters for evaluate instances request.

        Args:
            eval_case: The evaluation case supplying prompt, responses and the
                optional reference.
            response_index: Index into `eval_case.responses` of the candidate
                response to evaluate.

        Returns:
            A dict with a single "instance" key holding the
            `types.EvaluationInstance` built from the eval case.

        Raises:
            IndexError: If the eval case has no responses or `response_index`
                is past the end of `eval_case.responses`.
            ValueError: If the selected candidate has no response content.
        """
        if not eval_case.responses or response_index >= len(eval_case.responses):
            raise IndexError(f"response_index {response_index} is out of bounds.")

        response_content = eval_case.responses[response_index].response
        if not response_content:
            raise ValueError(
                f"Response content missing for candidate {response_index} ."
            )

        # The reference is optional; it is left as None when the case has none.
        reference_instance_data = None
        if eval_case.reference:
            reference_instance_data = PredefinedMetricHandler._content_to_instance_data(
                eval_case.reference.response
            )

        prompt_instance_data = PredefinedMetricHandler._content_to_instance_data(
            eval_case.prompt
        )

        instance_payload = types.EvaluationInstance(
            prompt=prompt_instance_data,
            response=PredefinedMetricHandler._content_to_instance_data(
                response_content
            ),
            reference=reference_instance_data,
        )

        return {
            "instance": instance_payload,
        }

    @override
    def get_metric_result(
        self, eval_case: types.EvalCase, response_index: int
    ) -> types.EvalCaseMetricResult:
        """Processes a single evaluation case for a specific custom code execution metric.

        Args:
            eval_case: The evaluation case to score.
            response_index: Index of the candidate response within
                `eval_case.responses`.

        Returns:
            A `types.EvalCaseMetricResult`. On success it carries the score
            and explanation of the first metric result in the API response.
            On any failure (payload construction, exhausted retries, missing
            metric results, or any other exception) it carries an
            `error_message` instead; this method does not raise.
        """
        metric_name = self.metric.name
        try:
            payload = self._build_request_payload(eval_case, response_index)

            # Bound up front so the checks below are well-defined even if the
            # retry loop body never runs (e.g. _MAX_RETRIES <= 0).
            api_response = None
            for attempt in range(_MAX_RETRIES):
                try:
                    api_response = self.module._evaluate_instances(
                        metrics=[self.metric],
                        instance=payload.get("instance"),
                    )
                    break
                except genai_errors.ClientError as e:
                    # Only 429 is retried; every other client error goes to
                    # the outer handler.
                    if e.code != 429:
                        raise

                    if attempt == _MAX_RETRIES - 1:
                        # Last attempt: report exhaustion rather than
                        # announcing a retry that will never happen.
                        logger.warning(
                            "Resource Exhausted error on attempt %d/%d: %s."
                            " No retries left.",
                            attempt + 1,
                            _MAX_RETRIES,
                            e,
                        )
                        return types.EvalCaseMetricResult(
                            metric_name=metric_name,
                            error_message=f"Resource exhausted after {_MAX_RETRIES} retries: {e} ",
                        )

                    # Exponential backoff: 1s, 2s, 4s, ...
                    delay_seconds = 2**attempt
                    logger.warning(
                        "Resource Exhausted error on attempt %d/%d: %s. Retrying in %s"
                        " seconds...",
                        attempt + 1,
                        _MAX_RETRIES,
                        e,
                        delay_seconds,
                    )
                    time.sleep(delay_seconds)

            metric_results = (
                getattr(api_response, "metric_results", None)
                if api_response
                else None
            )
            if not metric_results:
                logger.error(
                    "Metric results missing in API response for metric '%s'."
                    " API response: %s",
                    metric_name,
                    (
                        api_response.model_dump_json(exclude_none=True)
                        if api_response
                        else "None"
                    ),
                )
                return types.EvalCaseMetricResult(
                    metric_name=metric_name,
                    error_message="Metric results missing in API response.",
                )

            # Only the first metric result is used.
            result_data = metric_results[0]

            error_message = None
            # The default keeps an error object without a `code` attribute
            # from raising AttributeError here.
            if result_data.error and getattr(result_data.error, "code", None):
                error_message = f"Error in metric result: {result_data.error} "
            return types.EvalCaseMetricResult(
                metric_name=metric_name,
                score=result_data.score,
                explanation=result_data.explanation,
                error_message=error_message,
            )
        except Exception as e:  # pylint: disable=broad-exception-caught
            logger.error(
                "Error processing metric %s for case %s",
                metric_name,
                eval_case.eval_case_id,
                exc_info=True,
            )
            return types.EvalCaseMetricResult(
                metric_name=metric_name, error_message=str(e)
            )

    @override
    def aggregate(
        self, eval_case_metric_results: list[types.EvalCaseMetricResult]
    ) -> types.AggregatedMetricResult:
        """Aggregates the metric results for a custom code execution metric.

        Args:
            eval_case_metric_results: Per-case results produced by
                `get_metric_result`.

        Returns:
            The result of `_default_aggregate_scores` for this metric, called
            with `calculate_pass_rate=True`.
        """
        logger.debug(
            "Aggregating results for custom code execution metric: %s", self.metric.name
        )
        return _default_aggregate_scores(
            self.metric.name, eval_case_metric_results, calculate_pass_rate=True
        )
1235+
1236+
11021237_METRIC_HANDLER_MAPPING = [
1238+ (
1239+ lambda m : hasattr (m , "remote_custom_function" ) and m .remote_custom_function ,
1240+ CustomCodeExecutionMetricHandler ,
1241+ ),
11031242 (
11041243 lambda m : m .custom_function and isinstance (m .custom_function , Callable ),
11051244 CustomMetricHandler ,
@@ -1125,6 +1264,7 @@ def aggregate(
11251264 TranslationMetricHandler ,
11261265 LLMMetricHandler ,
11271266 CustomMetricHandler ,
1267+ CustomCodeExecutionMetricHandler ,
11281268 PredefinedMetricHandler ,
11291269)
11301270
0 commit comments