Skip to content

Commit d11ce89

Browse files
salma-elshafey (Salma Elshafey)
and authored
[Hot fix] Ensure query is not None for tool-based evaluators (#43909)
* Ensure query exists for tool-based evaluators
* Remove unnecessary condition in test
* Fix condition
* Fix spelling mistake

---------

Co-authored-by: Salma Elshafey <selshafey@microsoft.com>
1 parent 866e0d7 commit d11ce89

File tree

6 files changed

+148
-8
lines changed

6 files changed

+148
-8
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,15 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
205205
:return: The evaluation result.
206206
:rtype: Dict
207207
"""
208+
if eval_input.get("query") is None:
209+
raise EvaluationException(
210+
message=("Query is a required input to the Tool Call Accuracy evaluator."),
211+
internal_message=("Query is a required input to the Tool Call Accuracy evaluator."),
212+
blame=ErrorBlame.USER_ERROR,
213+
category=ErrorCategory.INVALID_VALUE,
214+
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
215+
)
216+
208217
# Single LLM call for all tool calls
209218
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
210219
llm_output = prompty_output_dict.get("llm_output", {})

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -153,12 +153,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
153153
:return: A dictionary containing the result of the evaluation.
154154
:rtype: Dict[str, Union[str, float]]
155155
"""
156-
# Format conversation history for cleaner evaluation
157-
if "query" in eval_input:
158-
eval_input["query"] = reformat_conversation_history(
159-
eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
156+
if eval_input.get("query") is None:
157+
raise EvaluationException(
158+
message=("Query is a required input to " "the Tool Input Accuracy evaluator."),
159+
internal_message=("Query is a required input " "to the Tool Input Accuracy evaluator."),
160+
blame=ErrorBlame.USER_ERROR,
161+
category=ErrorCategory.INVALID_VALUE,
162+
target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
160163
)
161164

165+
# Format conversation history for cleaner evaluation
166+
eval_input["query"] = reformat_conversation_history(
167+
eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
168+
)
169+
162170
# Call the LLM to evaluate
163171
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
164172
llm_output = prompty_output_dict.get("llm_output", {})

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -175,12 +175,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
175175
:return: A dictionary containing the result of the evaluation.
176176
:rtype: Dict[str, Union[str, float]]
177177
"""
178-
# Format conversation history for cleaner evaluation
179-
if "query" in eval_input:
180-
eval_input["query"] = reformat_conversation_history(
181-
eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
178+
if eval_input.get("query") is None:
179+
raise EvaluationException(
180+
message=("Query is a required input to the Tool Selection evaluator."),
181+
internal_message=("Query is a required input to the Tool Selection evaluator."),
182+
blame=ErrorBlame.USER_ERROR,
183+
category=ErrorCategory.INVALID_VALUE,
184+
target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
182185
)
183186

187+
# Format conversation history for cleaner evaluation
188+
eval_input["query"] = reformat_conversation_history(
189+
eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
190+
)
191+
184192
# Call the LLM to evaluate
185193
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
186194
llm_output = prompty_output_dict.get("llm_output", {})

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -688,3 +688,41 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config):
688688
assert result is not None
689689
assert result[key] == 5.0
690690
assert result[f"{key}_result"] == "pass"
691+
692+
def test_evaluate_missing_query(self, mock_model_config):
693+
"""Test that evaluator raises exception when query is None or missing."""
694+
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
695+
evaluator._flow = MagicMock(side_effect=flow_side_effect)
696+
697+
tool_calls = [
698+
{
699+
"type": "tool_call",
700+
"tool_call_id": "call_good",
701+
"name": "get_weather",
702+
"arguments": {"location": "Paris"},
703+
}
704+
]
705+
tool_definitions = [
706+
{
707+
"name": "get_weather",
708+
"type": "function",
709+
"description": "Get weather information",
710+
"parameters": {
711+
"type": "object",
712+
"properties": {"location": {"type": "string", "description": "The location"}},
713+
"required": ["location"],
714+
},
715+
}
716+
]
717+
718+
# Test with query=None
719+
with pytest.raises(EvaluationException) as exc_info:
720+
evaluator(query=None, tool_calls=tool_calls, tool_definitions=tool_definitions)
721+
722+
assert "Query is a required input" in str(exc_info.value)
723+
724+
# Test with query not provided at all
725+
with pytest.raises(EvaluationException) as exc_info:
726+
evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions)
727+
728+
assert "Query is a required input" in str(exc_info.value)

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,3 +652,46 @@ def test_evaluate_with_single_tool_definition(self, mock_model_config):
652652
assert result is not None
653653
assert result[key] == 1
654654
assert result[f"{key}_result"] == "pass"
655+
656+
def test_evaluate_missing_query(self, mock_model_config):
657+
"""Test that evaluator raises exception when query is None or missing."""
658+
evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config)
659+
evaluator._flow = MagicMock(side_effect=flow_side_effect)
660+
661+
response = [
662+
{
663+
"role": "assistant",
664+
"content": [
665+
{
666+
"type": "tool_call",
667+
"tool_call_id": "call_123",
668+
"name": "get_weather",
669+
"arguments": {"location": "Paris"},
670+
}
671+
],
672+
}
673+
]
674+
tool_definitions = [
675+
{
676+
"name": "get_weather",
677+
"type": "function",
678+
"description": "Get weather information for a location",
679+
"parameters": {
680+
"type": "object",
681+
"properties": {"location": {"type": "string", "description": "The location to get weather for"}},
682+
"required": ["location"],
683+
},
684+
}
685+
]
686+
687+
# Test with query=None
688+
with pytest.raises(EvaluationException) as exc_info:
689+
evaluator(query=None, response=response, tool_definitions=tool_definitions)
690+
691+
assert "Query is a required input" in str(exc_info.value)
692+
693+
# Test with query not provided at all
694+
with pytest.raises(EvaluationException) as exc_info:
695+
evaluator(response=response, tool_definitions=tool_definitions)
696+
697+
assert "Query is a required input" in str(exc_info.value)

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,3 +284,37 @@ def test_evaluate_tool_selection_exception_invalid_score(self, mock_model_config
284284
evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)
285285

286286
assert "Invalid score value" in str(exc_info.value)
287+
288+
def test_evaluate_tool_selection_missing_query(self, mock_model_config):
289+
"""Test that evaluator raises exception when query is None or missing."""
290+
evaluator = _ToolSelectionEvaluator(model_config=mock_model_config)
291+
evaluator._flow = MagicMock(side_effect=tool_selection_flow_side_effect)
292+
293+
tool_calls = [
294+
{
295+
"type": "tool_call",
296+
"tool_call_id": "call_weather",
297+
"name": "get_weather",
298+
"arguments": {"location": "current"},
299+
}
300+
]
301+
tool_definitions = [
302+
{
303+
"name": "get_weather",
304+
"type": "function",
305+
"description": "Get weather information",
306+
"parameters": {"type": "object", "properties": {"location": {"type": "string"}}},
307+
}
308+
]
309+
310+
# Test with query=None
311+
with pytest.raises(EvaluationException) as exc_info:
312+
evaluator(query=None, tool_calls=tool_calls, tool_definitions=tool_definitions)
313+
314+
assert "Query is a required input" in str(exc_info.value)
315+
316+
# Test with query not provided at all
317+
with pytest.raises(EvaluationException) as exc_info:
318+
evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions)
319+
320+
assert "Query is a required input" in str(exc_info.value)

0 commit comments

Comments (0)