Skip to content

Commit d11ce89

Browse files
salma-elshafey (Salma Elshafey)
and authored
[Hot fix] Ensure query is not None for tool-based evaluators (#43909)
* Ensure query exists for tool-based evaluators
* Remove unnecessary condition in test
* Fix condition
* Fix spelling mistake

---------

Co-authored-by: Salma Elshafey <selshafey@microsoft.com>
1 parent 866e0d7 commit d11ce89

File tree

6 files changed

+148
-8
lines changed

6 files changed

+148
-8
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,15 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
205205
:return: The evaluation result.
206206
:rtype: Dict
207207
"""
208+
if eval_input.get("query") is None:
209+
raise EvaluationException(
210+
message=("Query is a required input to the Tool Call Accuracy evaluator."),
211+
internal_message=("Query is a required input to the Tool Call Accuracy evaluator."),
212+
blame=ErrorBlame.USER_ERROR,
213+
category=ErrorCategory.INVALID_VALUE,
214+
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
215+
)
216+
208217
# Single LLM call for all tool calls
209218
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
210219
llm_output = prompty_output_dict.get("llm_output", {})

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -153,12 +153,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
153153
:return: A dictionary containing the result of the evaluation.
154154
:rtype: Dict[str, Union[str, float]]
155155
"""
156-
# Format conversation history for cleaner evaluation
157-
if "query" in eval_input:
158-
eval_input["query"] = reformat_conversation_history(
159-
eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
156+
if eval_input.get("query") is None:
157+
raise EvaluationException(
158+
message=("Query is a required input to " "the Tool Input Accuracy evaluator."),
159+
internal_message=("Query is a required input " "to the Tool Input Accuracy evaluator."),
160+
blame=ErrorBlame.USER_ERROR,
161+
category=ErrorCategory.INVALID_VALUE,
162+
target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
160163
)
161164

165+
# Format conversation history for cleaner evaluation
166+
eval_input["query"] = reformat_conversation_history(
167+
eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
168+
)
169+
162170
# Call the LLM to evaluate
163171
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
164172
llm_output = prompty_output_dict.get("llm_output", {})

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -175,12 +175,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
175175
:return: A dictionary containing the result of the evaluation.
176176
:rtype: Dict[str, Union[str, float]]
177177
"""
178-
# Format conversation history for cleaner evaluation
179-
if "query" in eval_input:
180-
eval_input["query"] = reformat_conversation_history(
181-
eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
178+
if eval_input.get("query") is None:
179+
raise EvaluationException(
180+
message=("Query is a required input to the Tool Selection evaluator."),
181+
internal_message=("Query is a required input to the Tool Selection evaluator."),
182+
blame=ErrorBlame.USER_ERROR,
183+
category=ErrorCategory.INVALID_VALUE,
184+
target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
182185
)
183186

187+
# Format conversation history for cleaner evaluation
188+
eval_input["query"] = reformat_conversation_history(
189+
eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
190+
)
191+
184192
# Call the LLM to evaluate
185193
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
186194
llm_output = prompty_output_dict.get("llm_output", {})

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -688,3 +688,41 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config):
688688
assert result is not None
689689
assert result[key] == 5.0
690690
assert result[f"{key}_result"] == "pass"
691+
692+
def test_evaluate_missing_query(self, mock_model_config):
693+
"""Test that evaluator raises exception when query is None or missing."""
694+
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
695+
evaluator._flow = MagicMock(side_effect=flow_side_effect)
696+
697+
tool_calls = [
698+
{
699+
"type": "tool_call",
700+
"tool_call_id": "call_good",
701+
"name": "get_weather",
702+
"arguments": {"location": "Paris"},
703+
}
704+
]
705+
tool_definitions = [
706+
{
707+
"name": "get_weather",
708+
"type": "function",
709+
"description": "Get weather information",
710+
"parameters": {
711+
"type": "object",
712+
"properties": {"location": {"type": "string", "description": "The location"}},
713+
"required": ["location"],
714+
},
715+
}
716+
]
717+
718+
# Test with query=None
719+
with pytest.raises(EvaluationException) as exc_info:
720+
evaluator(query=None, tool_calls=tool_calls, tool_definitions=tool_definitions)
721+
722+
assert "Query is a required input" in str(exc_info.value)
723+
724+
# Test with query not provided at all
725+
with pytest.raises(EvaluationException) as exc_info:
726+
evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions)
727+
728+
assert "Query is a required input" in str(exc_info.value)

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,3 +652,46 @@ def test_evaluate_with_single_tool_definition(self, mock_model_config):
652652
assert result is not None
653653
assert result[key] == 1
654654
assert result[f"{key}_result"] == "pass"
655+
656+
def test_evaluate_missing_query(self, mock_model_config):
657+
"""Test that evaluator raises exception when query is None or missing."""
658+
evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config)
659+
evaluator._flow = MagicMock(side_effect=flow_side_effect)
660+
661+
response = [
662+
{
663+
"role": "assistant",
664+
"content": [
665+
{
666+
"type": "tool_call",
667+
"tool_call_id": "call_123",
668+
"name": "get_weather",
669+
"arguments": {"location": "Paris"},
670+
}
671+
],
672+
}
673+
]
674+
tool_definitions = [
675+
{
676+
"name": "get_weather",
677+
"type": "function",
678+
"description": "Get weather information for a location",
679+
"parameters": {
680+
"type": "object",
681+
"properties": {"location": {"type": "string", "description": "The location to get weather for"}},
682+
"required": ["location"],
683+
},
684+
}
685+
]
686+
687+
# Test with query=None
688+
with pytest.raises(EvaluationException) as exc_info:
689+
evaluator(query=None, response=response, tool_definitions=tool_definitions)
690+
691+
assert "Query is a required input" in str(exc_info.value)
692+
693+
# Test with query not provided at all
694+
with pytest.raises(EvaluationException) as exc_info:
695+
evaluator(response=response, tool_definitions=tool_definitions)
696+
697+
assert "Query is a required input" in str(exc_info.value)

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,3 +284,37 @@ def test_evaluate_tool_selection_exception_invalid_score(self, mock_model_config
284284
evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)
285285

286286
assert "Invalid score value" in str(exc_info.value)
287+
288+
def test_evaluate_tool_selection_missing_query(self, mock_model_config):
289+
"""Test that evaluator raises exception when query is None or missing."""
290+
evaluator = _ToolSelectionEvaluator(model_config=mock_model_config)
291+
evaluator._flow = MagicMock(side_effect=tool_selection_flow_side_effect)
292+
293+
tool_calls = [
294+
{
295+
"type": "tool_call",
296+
"tool_call_id": "call_weather",
297+
"name": "get_weather",
298+
"arguments": {"location": "current"},
299+
}
300+
]
301+
tool_definitions = [
302+
{
303+
"name": "get_weather",
304+
"type": "function",
305+
"description": "Get weather information",
306+
"parameters": {"type": "object", "properties": {"location": {"type": "string"}}},
307+
}
308+
]
309+
310+
# Test with query=None
311+
with pytest.raises(EvaluationException) as exc_info:
312+
evaluator(query=None, tool_calls=tool_calls, tool_definitions=tool_definitions)
313+
314+
assert "Query is a required input" in str(exc_info.value)
315+
316+
# Test with query not provided at all
317+
with pytest.raises(EvaluationException) as exc_info:
318+
evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions)
319+
320+
assert "Query is a required input" in str(exc_info.value)

0 commit comments

Comments (0)