diff --git a/pyproject.toml b/pyproject.toml index da1cf62eb..9d9e3a14a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.4.6" +version = "2.4.7" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/src/uipath/_cli/_evals/_evaluator_factory.py b/src/uipath/_cli/_evals/_evaluator_factory.py index 253ce36d4..851c1aa0f 100644 --- a/src/uipath/_cli/_evals/_evaluator_factory.py +++ b/src/uipath/_cli/_evals/_evaluator_factory.py @@ -22,7 +22,9 @@ from uipath.eval.evaluators import ( BaseEvaluator, LegacyBaseEvaluator, + LegacyContextPrecisionEvaluator, LegacyExactMatchEvaluator, + LegacyFaithfulnessEvaluator, LegacyJsonSimilarityEvaluator, LegacyLlmAsAJudgeEvaluator, LegacyTrajectoryEvaluator, @@ -68,6 +70,7 @@ ToolCallOutputEvaluator, ToolCallOutputEvaluatorConfig, ) +from uipath.eval.models import LegacyEvaluatorType logger = logging.getLogger(__name__) @@ -428,11 +431,8 @@ def _create_legacy_json_similarity_evaluator( def _create_legacy_llm_as_judge_evaluator( params: LLMEvaluatorParams, agent_model: str | None = None, - ) -> LegacyLlmAsAJudgeEvaluator: - """Create an LLM-as-a-judge evaluator.""" - if not params.prompt: - raise ValueError("LLM evaluator must include 'prompt' field") - + ) -> LegacyBaseEvaluator[Any]: + """Create an LLM-as-a-judge evaluator or context precision evaluator based on type.""" if not params.model: raise ValueError("LLM evaluator must include 'model' field") @@ -449,10 +449,16 @@ def _create_legacy_llm_as_judge_evaluator( ) params = params.model_copy(update={"model": agent_model}) - logger.info( - f"Creating LLM-as-judge evaluator '{params.name}' with model: {params.model}" - ) - return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={}) + # Check evaluator type to determine which evaluator to create + if params.evaluator_type == LegacyEvaluatorType.ContextPrecision: + return LegacyContextPrecisionEvaluator(**params.model_dump(), config={}) + elif params.evaluator_type == LegacyEvaluatorType.Faithfulness: + return LegacyFaithfulnessEvaluator(**params.model_dump(), config={}) + else: + if not params.prompt: + raise ValueError("LLM evaluator must include 'prompt' field") + + return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={}) @staticmethod def _create_legacy_trajectory_evaluator( diff --git a/src/uipath/eval/evaluators/__init__.py b/src/uipath/eval/evaluators/__init__.py index bc79e071d..248b5d571 100644 --- a/src/uipath/eval/evaluators/__init__.py +++ b/src/uipath/eval/evaluators/__init__.py @@ -10,7 +10,9 @@ # Legacy evaluators from .legacy_base_evaluator import LegacyBaseEvaluator +from .legacy_context_precision_evaluator import LegacyContextPrecisionEvaluator from .legacy_exact_match_evaluator import LegacyExactMatchEvaluator +from .legacy_faithfulness_evaluator import LegacyFaithfulnessEvaluator from .legacy_json_similarity_evaluator import LegacyJsonSimilarityEvaluator from .legacy_llm_as_judge_evaluator import LegacyLlmAsAJudgeEvaluator from .legacy_trajectory_evaluator import LegacyTrajectoryEvaluator @@ -46,7 +48,9 @@ __all__ = [ # Legacy evaluators "LegacyBaseEvaluator", + "LegacyContextPrecisionEvaluator", "LegacyExactMatchEvaluator", + "LegacyFaithfulnessEvaluator", "LegacyJsonSimilarityEvaluator", "LegacyLlmAsAJudgeEvaluator", "LegacyTrajectoryEvaluator", diff --git 
a/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py b/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py new file mode 100644 index 000000000..88667408c --- /dev/null +++ b/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py @@ -0,0 +1,349 @@ +"""Legacy Context Precision evaluator for assessing the relevance of context chunks to queries.""" + +import ast +import json +from typing import Any, Optional + +from uipath.eval.models import NumericEvaluationResult + +from ...platform.chat import UiPathLlmChatService +from ..models.models import AgentExecution, EvaluationResult +from .legacy_base_evaluator import ( + LegacyBaseEvaluator, + LegacyEvaluationCriteria, + LegacyEvaluatorConfig, + track_evaluation_metrics, +) +from .legacy_evaluator_utils import clean_model_name, serialize_object + + +class LegacyContextPrecisionEvaluatorConfig(LegacyEvaluatorConfig): + """Configuration for legacy context precision evaluators.""" + + name: str = "LegacyContextPrecisionEvaluator" + model: str = "" + prompt: str = """You are an expert evaluator assessing the relevance of context chunks to a given query. + +TASK: Evaluate how relevant each provided context chunk is to answering the query. +Your scoring should be deterministic - the same chunk-query pair should always receive the same score. + +EVALUATION CRITERIA: +Score each chunk using the HIGHEST applicable range (if multiple apply, use the highest): + +- HIGHLY RELEVANT (80-100) - Directly answers or addresses the query: + * 95-100: Contains the exact, complete answer to the query + * 85-94: Directly addresses the query with comprehensive information (but not the complete answer) + * 80-84: Provides a direct but partial answer to the query + +- MODERATELY RELEVANT (50-79) - Provides useful supporting information: + * 70-79: Contains substantial supporting information that helps understand the topic + * 60-69: Provides relevant context or background information + * 50-59: Has some connection to the query but limited usefulness + +- SLIGHTLY RELEVANT (20-49) - Contains tangentially related information: + * 35-49: Mentions related concepts, terms, or entities from the query + * 20-34: Very indirect connection to the query topic + +- NOT RELEVANT (0-19) - Has no meaningful connection to the query: + * 10-19: Contains some keywords from the query but no meaningful connection + * 0-9: Completely unrelated to the query or empty/malformed content + +IMPORTANT INSTRUCTIONS: +1. Evaluate EACH chunk independently - do not let one chunk influence another's score +2. Base relevance ONLY on how well the chunk helps answer the specific query +3. Consider semantic meaning, not just keyword matches +4. If a chunk is empty or malformed, assign a score of 0 +5. Scores must be integers between 0 and 100 inclusive +6. Be consistent: similar content should receive similar scores +7. Use the specific sub-ranges above to guide precise scoring +8. HIERARCHY RULE: If a chunk meets criteria for multiple ranges, always assign the HIGHEST applicable score + +OUTPUT FORMAT: +You MUST respond using the provided tool with a JSON object containing: +- A "relevancies" field that is an array +- Each array element must be an object with "relevancy_score" (integer 0-100) +- The array must have the same number of elements as context chunks provided +- Order matters: the first score corresponds to the first chunk, etc. 
+ +EXAMPLE STRUCTURE (do not copy values, this is just format): +{ + "relevancies": [ + {"relevancy_score": 85}, + {"relevancy_score": 45}, + {"relevancy_score": 0} + ] +} + + +{{Query}} + + + +{{Chunks}} + + +Evaluate each chunk's relevance to the query and respond with the structured output.""" + + +class LegacyContextPrecisionEvaluator( + LegacyBaseEvaluator[LegacyContextPrecisionEvaluatorConfig] +): + """Legacy evaluator that assesses context precision using an LLM. + + This evaluator extracts context grounding spans from agent execution traces + and uses an LLM to score the relevance of each chunk to its corresponding query. + The final score is the mean of all chunk relevancy scores (0-100 scale). + """ + + model: str + query_placeholder: str = "{{Query}}" + chunks_placeholder: str = "{{Chunks}}" + llm: Optional[UiPathLlmChatService] = None + + def model_post_init(self, __context: Any): + """Initialize the LLM service after model creation.""" + super().model_post_init(__context) + self._initialize_llm() + + def _initialize_llm(self): + """Initialize the LLM used for evaluation.""" + from uipath.platform import UiPath + + uipath = UiPath() + self.llm = uipath.llm + + @track_evaluation_metrics + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: LegacyEvaluationCriteria, + ) -> EvaluationResult: + """Evaluate context precision from agent execution traces. + + Args: + agent_execution: The execution details containing agent_trace with spans + evaluation_criteria: Legacy evaluation criteria (unused for context precision) + + Returns: + NumericEvaluationResult with normalized score (0-100) and detailed justification + """ + # Extract context grounding spans from the trace + context_groundings = self._extract_context_groundings( + agent_execution.agent_trace + ) + + if not context_groundings: + return NumericEvaluationResult( + score=0.0, + details="No context grounding tool calls found in the agent execution trace.", + ) + + # Evaluate each context grounding call + all_scores = [] + evaluation_details = [] + + for idx, grounding in enumerate(context_groundings, 1): + query = grounding.get("query", "") + chunks = grounding.get("chunks", []) + + if not query or not chunks: + evaluation_details.append( + f"{idx}. Query: (empty) - SKIPPED (no query or chunks)" + ) + continue + + scores = await self._evaluate_context_grounding(query, chunks) + + if scores: + mean_score = sum(scores) / len(scores) + all_scores.append(mean_score) + + # Format score summaries for this grounding + score_summaries = [f"Relevancy: {s:d}/100" for s in scores] + evaluation_details.append( + f'{idx}. Query: "{query}"\n' + f"\tAvg. Score: {mean_score:.1f}/100 ({len(scores)} chunks). " + f"Chunk Relevancies: [{', '.join(score_summaries)}]."
+ ) + + if not all_scores: + return NumericEvaluationResult( + score=0.0, + details="No valid context chunks were found for evaluation.", + ) + + # Calculate overall mean score (0-100 range) + overall_mean = sum(all_scores) / len(all_scores) + overall_mean = max(0, min(100, overall_mean)) + + # Build justification + justification = f"Overall Context Precision: {overall_mean:.1f}/100 ({len(context_groundings)} Context Tool Call(s) evaluated).\n" + if evaluation_details: + justification += "---\nPer-Context Tool Call Details:\n\n" + justification += "\n\n".join(evaluation_details) + + return NumericEvaluationResult( + score=overall_mean, + details=justification, + ) + + def _parse_span_value(self, value_str: str) -> Any: + """Parse span value that could be JSON or Python literal syntax. + + Args: + value_str: String that could be JSON or Python literal (dict/list) + + Returns: + Parsed Python object (dict, list, etc.) + + Raises: + ValueError: If string cannot be parsed as JSON or literal + """ + try: + # Try JSON first (most common) + return json.loads(value_str) + except json.JSONDecodeError: + try: + # Fall back to Python literal_eval for Python syntax + return ast.literal_eval(value_str) + except (ValueError, SyntaxError) as e: + raise ValueError(f"Cannot parse value: {value_str}") from e + + def _extract_context_groundings( + self, agent_trace: list[Any] + ) -> list[dict[str, Any]]: + """Extract context groundings from agent execution trace. + + Looks for spans with input.value and output.value attributes that represent + context grounding tool calls. + """ + context_groundings = [] + + for span in agent_trace: + if not hasattr(span, "attributes") or span.attributes is None: + continue + + attrs = span.attributes + + if attrs.get("openinference.span.kind", None) != "RETRIEVER": + # NOTE: all tool calls can be extracted using this approach + continue + + # Look for spans with input.value and output.value (context grounding calls) + query = attrs.get("input.value") + try: + chunks = self._normalize_chunks( + json.loads(attrs.get("output.value")).get("documents") + ) + + if chunks: + context_groundings.append( + { + "query": str(query), + "chunks": chunks, + } + ) + except (ValueError, KeyError, TypeError): + # Skip spans that don't have the expected structure + continue + + return context_groundings + + def _normalize_chunks(self, results: Any) -> list[str]: + """Normalize various chunk representations to a list of strings.""" + if isinstance(results, list): + return [self._serialize_chunk(chunk) for chunk in results] + elif isinstance(results, dict): + # Handle dict representations of chunks + return [self._serialize_chunk(results)] + elif isinstance(results, str): + return [results] + else: + return [str(results)] + + def _serialize_chunk(self, chunk: Any) -> str: + """Serialize a single chunk to string format.""" + return serialize_object(chunk, sort_keys=True) + + async def _evaluate_context_grounding( + self, query: str, chunks: list[str] + ) -> list[int]: + """Evaluate the relevance of chunks to a query using the LLM. 
+ + Args: + query: The query string + chunks: List of context chunks to evaluate + + Returns: + List of relevancy scores (0-100) for each chunk + """ + # Create evaluation prompt + chunks_text = "\n".join(chunks) + prompt = self.evaluator_config.prompt.replace( + self.query_placeholder, query + ).replace(self.chunks_placeholder, chunks_text) + + # Get LLM response + response_obj = await self._get_structured_llm_response(prompt) + + # Extract relevancy scores from response + relevancies = response_obj.get("relevancies", []) + if not relevancies: + raise ValueError("No relevancies found in LLM response") + + scores = [] + for rel in relevancies: + if isinstance(rel, dict) and "relevancy_score" in rel: + score = rel["relevancy_score"] + # Clamp score to 0-100 + score = max(0, min(100, int(score))) + scores.append(score) + + return scores + + async def _get_structured_llm_response( + self, evaluation_prompt: str + ) -> dict[str, Any]: + """Get structured LLM response using the context precision schema.""" + # Remove community-agents suffix from llm model name + model = clean_model_name(self.model) + + # Prepare the request + request_data = { + "model": model, + "messages": [ + {"role": "system", "content": "Context Precision Evaluation"}, + {"role": "user", "content": evaluation_prompt}, + ], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "context_precision_evaluation", + "schema": { + "type": "object", + "properties": { + "relevancies": { + "type": "array", + "items": { + "type": "object", + "properties": { + "relevancy_score": { + "type": "number", + "description": "Relevancy score for the chunk (0-100).", + } + }, + "required": ["relevancy_score"], + }, + "description": "List of relevancy scores for each context chunk", + } + }, + "required": ["relevancies"], + }, + }, + }, + } + + assert self.llm, "LLM should be initialized before calling this method." + response = await self.llm.chat_completions(**request_data) + content = response.choices[-1].message.content or "{}" + return json.loads(content) diff --git a/src/uipath/eval/evaluators/legacy_evaluator_utils.py b/src/uipath/eval/evaluators/legacy_evaluator_utils.py new file mode 100644 index 000000000..b8c20f372 --- /dev/null +++ b/src/uipath/eval/evaluators/legacy_evaluator_utils.py @@ -0,0 +1,75 @@ +"""Utility functions for legacy evaluators.""" + +import json +from typing import Any, Optional + +from ..._utils.constants import COMMUNITY_agents_SUFFIX + + +def clean_model_name(model: str) -> str: + """Remove community-agents suffix from model name. + + Args: + model: Model name that may have the community suffix + + Returns: + Model name without the community suffix + """ + if model.endswith(COMMUNITY_agents_SUFFIX): + return model.replace(COMMUNITY_agents_SUFFIX, "") + return model + + +def serialize_object( + content: Any, + sort_keys: bool = False, +) -> str: + """Serialize content to string format. + + Args: + content: Content to serialize (str, dict, list, etc.) + sort_keys: Whether to sort dict keys (default: False) + + Returns: + Serialized string representation + """ + if isinstance(content, str): + return content + elif isinstance(content, dict): + if sort_keys: + content = dict(sorted(content.items())) + return json.dumps(content, default=str, separators=(",", ":")) + else: + return json.dumps(content, default=str, separators=(",", ":")) + + +def safe_get_span_attributes(span: Any) -> Optional[dict[str, Any]]: + """Safely extract attributes from a span. 
+ + Args: + span: The span object + + Returns: + Span attributes dict, or None if not available + """ + if not hasattr(span, "attributes") or span.attributes is None: + return None + return span.attributes + + +def parse_json_value(value: str) -> Any: + """Safely parse a JSON string value. + + Args: + value: JSON string to parse + + Returns: + Parsed JSON object + + Raises: + ValueError: If string cannot be parsed as JSON + """ + try: + return json.loads(value) + except json.JSONDecodeError as e: + raise ValueError(f"Cannot parse JSON value: {value}") from e diff --git a/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py b/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py new file mode 100644 index 000000000..ac0373384 --- /dev/null +++ b/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py @@ -0,0 +1,513 @@ +"""Legacy Faithfulness evaluator for assessing whether agent output claims are grounded in context.""" + +import json +from typing import Any, Optional + +from uipath.eval.models import NumericEvaluationResult +from uipath.platform.chat import UiPathLlmChatService + +from ..models.models import AgentExecution, EvaluationResult +from .legacy_base_evaluator import ( + LegacyBaseEvaluator, + LegacyEvaluationCriteria, + LegacyEvaluatorConfig, + track_evaluation_metrics, +) +from .legacy_evaluator_utils import ( + clean_model_name, + serialize_object, +) + + +class LegacyFaithfulnessEvaluatorConfig(LegacyEvaluatorConfig): + """Configuration for legacy faithfulness evaluators.""" + + name: str = "LegacyFaithfulnessEvaluator" + model: str = "" + + +class LegacyFaithfulnessEvaluator( + LegacyBaseEvaluator[LegacyFaithfulnessEvaluatorConfig] +): + """Legacy evaluator that assesses faithfulness using an LLM. + + This evaluator extracts claims from agent output using a 3-stage pipeline + (selection, disambiguation, decomposition) and evaluates whether each claim + is grounded in the available context sources extracted from agent traces. + The final score is the percentage of claims that are grounded. + """ + + model: str + llm: Optional[UiPathLlmChatService] = None + + def model_post_init(self, __context: Any): + """Initialize the LLM service after model creation.""" + super().model_post_init(__context) + self._initialize_llm() + + def _initialize_llm(self): + """Initialize the LLM used for evaluation.""" + from uipath.platform import UiPath + + uipath = UiPath() + self.llm = uipath.llm + + @track_evaluation_metrics + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: LegacyEvaluationCriteria, + ) -> EvaluationResult: + """Evaluate faithfulness of agent output against available context. 
+ + Args: + agent_execution: The execution details containing agent_trace with spans + evaluation_criteria: Legacy evaluation criteria containing expected_output + + Returns: + NumericEvaluationResult with normalized score (0-100) and detailed justification + """ + # Extract agent output + agent_output = str(evaluation_criteria.expected_output or "") + if not agent_output or not agent_output.strip(): + return NumericEvaluationResult( + score=0.0, + details="No agent output provided for faithfulness evaluation.", + ) + + # Extract context sources from traces + context_sources = self._extract_context_sources(agent_execution.agent_trace) + + if not context_sources: + return NumericEvaluationResult( + score=0.0, + details="No context sources found in the agent execution trace.", + ) + + # Stage 1: Extract verifiable claims from agent output + claims = await self._extract_claims(agent_output) + + if not claims: + return NumericEvaluationResult( + score=100.0, + details="No verifiable claims found in agent output.", + ) + + # Stage 2: Evaluate each claim against context sources + claim_evaluations = await self._evaluate_claims_against_context( + claims, context_sources + ) + + # Calculate score + grounded_claims = [c for c in claim_evaluations if c["is_grounded"]] + score = ( + (len(grounded_claims) / len(claim_evaluations)) * 100 + if claim_evaluations + else 0.0 + ) + score = max(0, min(100, score)) + + # Build justification + justification = self._format_justification(score, claim_evaluations) + + return NumericEvaluationResult( + score=score, + details=justification, + ) + + def _extract_context_sources(self, agent_trace: list[Any]) -> list[dict[str, str]]: + """Extract context sources from agent execution trace. + + Looks for tool call outputs and context grounding spans that provide context. + + Returns: + List of context source dicts with 'content' and 'source' keys + """ + context_sources = [] + + for span in agent_trace: + if not hasattr(span, "attributes") or span.attributes is None: + continue + + attrs = span.attributes + + tool_name = attrs.get("openinference.span.kind") + if not tool_name or tool_name == "UNKNOWN": + continue + + output_value = attrs.get("output.value") + if not output_value: + continue + + try: + output_data = ( + json.loads(output_value) + if isinstance(output_value, str) + else output_value + ) + + # For RETRIEVER spans, extract individual documents + if tool_name == "RETRIEVER": + documents = output_data.get("documents", []) + if documents: + for doc in documents: + content = self._serialize_content(doc) + context_sources.append( + {"content": content, "source": "Context Grounding"} + ) + else: + # For other tool calls, extract the full output + content = self._serialize_content(output_data) + context_sources.append({"content": content, "source": tool_name}) + except (ValueError, TypeError): + continue + + return context_sources + + def _serialize_content(self, content: Any) -> str: + """Serialize content to string format.""" + return serialize_object(content, sort_keys=False) + + async def _extract_claims(self, agent_output: str) -> list[dict[str, str]]: + """Extract verifiable claims from agent output using 3-stage pipeline. + + Stages: + 1. Selection: Filter to verifiable sentences + 2. Disambiguation: Resolve internal ambiguities + 3. 
Decomposition: Extract standalone claims + + Returns: + List of claim dicts with 'text' and 'original_sentence' keys + """ + # Stage 1: Selection + verifiable_sentences = await self._select_verifiable_sentences(agent_output) + if not verifiable_sentences: + return [] + + # Stage 2: Disambiguation + disambiguated_sentences = await self._disambiguate_sentences( + verifiable_sentences, agent_output + ) + if not disambiguated_sentences: + return [] + + # Stage 3: Decomposition + claims = await self._decompose_to_claims(disambiguated_sentences, agent_output) + return claims + + async def _select_verifiable_sentences(self, agent_output: str) -> list[str]: + """Stage 1: Filter agent output to verifiable sentences.""" + prompt = f"""You are an expert evaluator identifying verifiable claims. + +TASK: Identify sentences in the agent output that contain verifiable, factual claims. +Filter out subjective opinions, instructions, questions, and meta-commentary. + +OUTPUT FORMAT: Return a JSON object with a "sentences" field containing an array of strings. +Each string should be a complete sentence from the original output. + + +{agent_output} + + +Identify and return only the verifiable sentences.""" + + response_obj = await self._get_structured_llm_response( + prompt, + schema_name="claim_selection", + schema={ + "type": "object", + "properties": { + "sentences": { + "type": "array", + "items": {"type": "string"}, + "description": "List of verifiable sentences from agent output", + } + }, + "required": ["sentences"], + }, + ) + + return response_obj.get("sentences", []) + + async def _disambiguate_sentences( + self, sentences: list[str], full_output: str + ) -> list[dict[str, str]]: + """Stage 2: Resolve ambiguities in sentences.""" + if not sentences: + return [] + + sentences_text = "\n".join(f"{i + 1}. {s}" for i, s in enumerate(sentences)) + + prompt = f"""You are an expert at disambiguating claims. + +TASK: For each sentence, resolve any internal ambiguities using the full agent output as context. +Replace pronouns, references, and implicit information with explicit, standalone versions. + + +{full_output} + + + +{sentences_text} + + +OUTPUT FORMAT: Return a JSON object with a "disambiguated" field containing an array of objects. +Each object must have: +- "original": the original sentence +- "disambiguated": the disambiguated version""" + + response_obj = await self._get_structured_llm_response( + prompt, + schema_name="claim_disambiguation", + schema={ + "type": "object", + "properties": { + "disambiguated": { + "type": "array", + "items": { + "type": "object", + "properties": { + "original": {"type": "string"}, + "disambiguated": {"type": "string"}, + }, + "required": ["original", "disambiguated"], + }, + "description": "List of disambiguated sentences", + } + }, + "required": ["disambiguated"], + }, + ) + + return response_obj.get("disambiguated", []) + + async def _decompose_to_claims( + self, disambiguated: list[dict[str, str]], full_output: str + ) -> list[dict[str, str]]: + """Stage 3: Decompose sentences into standalone verifiable claims.""" + if not disambiguated: + return [] + + sentences_text = "\n".join( + f"{i + 1}. {item.get('disambiguated', '')}" + for i, item in enumerate(disambiguated) + ) + + prompt = f"""You are an expert at claim decomposition. + +TASK: Break down each sentence into standalone, atomic claims that can be independently verified. +Each claim should be self-contained and not depend on other claims for context. 
+ + +{sentences_text} + + + +{full_output} + + +OUTPUT FORMAT: Return a JSON object with a "claims" field containing an array of objects. +Each object must have: +- "claim": the standalone claim +- "original_sentence": which sentence it came from (number)""" + + response_obj = await self._get_structured_llm_response( + prompt, + schema_name="claim_decomposition", + schema={ + "type": "object", + "properties": { + "claims": { + "type": "array", + "items": { + "type": "object", + "properties": { + "claim": {"type": "string"}, + "original_sentence": {"type": "string"}, + }, + "required": ["claim", "original_sentence"], + }, + "description": "List of decomposed claims", + } + }, + "required": ["claims"], + }, + ) + + claims_data = response_obj.get("claims", []) + return [ + { + "text": c.get("claim", ""), + "original_sentence": c.get("original_sentence", ""), + } + for c in claims_data + if c.get("claim", "").strip() + ] + + async def _evaluate_claims_against_context( + self, + claims: list[dict[str, str]], + context_sources: list[dict[str, str]], + ) -> list[dict[str, Any]]: + """Evaluate each claim against context sources. + + Returns: + List of claim evaluations with grounding status and source attribution + """ + claim_evaluations = [] + + for claim in claims: + claim_text = claim.get("text", "") + if not claim_text.strip(): + continue + + supporting_sources = [] + contradicting_sources = [] + + # Evaluate claim against each context source + for source in context_sources: + source_content = source.get("content", "") + source_name = source.get("source", "Unknown") + + stance = await self._evaluate_claim_stance(claim_text, source_content) + + if stance == "SUPPORTS": + supporting_sources.append(source_name) + elif stance == "CONTRADICTS": + contradicting_sources.append(source_name) + + # A claim is grounded if it has supporting sources and no contradicting ones + is_grounded = ( + len(supporting_sources) > 0 and len(contradicting_sources) == 0 + ) + + claim_evaluations.append( + { + "claim": claim_text, + "original_sentence": claim.get("original_sentence", ""), + "is_grounded": is_grounded, + "supporting_sources": supporting_sources, + "contradicting_sources": contradicting_sources, + } + ) + + return claim_evaluations + + async def _evaluate_claim_stance(self, claim: str, context: str) -> str: + """Evaluate whether a context source supports, contradicts, or is irrelevant to a claim. + + Returns: + One of: "SUPPORTS", "CONTRADICTS", "IRRELEVANT" + """ + prompt = f"""You are an expert evaluator assessing the relationship between claims and sources. + +TASK: Determine if the source supports, contradicts, or is irrelevant to the claim. + +DEFINITION: +- SUPPORTS: The source provides evidence that makes the claim more likely to be true +- CONTRADICTS: The source provides evidence that makes the claim false or less likely +- IRRELEVANT: The source does not address the claim at all + + +{claim} + + + +{context} + + +OUTPUT FORMAT: Return a JSON object with a "stance" field. 
+The stance must be exactly one of: "SUPPORTS", "CONTRADICTS", or "IRRELEVANT".""" + + response_obj = await self._get_structured_llm_response( + prompt, + schema_name="claim_stance_evaluation", + schema={ + "type": "object", + "properties": { + "stance": { + "type": "string", + "enum": ["SUPPORTS", "CONTRADICTS", "IRRELEVANT"], + "description": "Stance of the source relative to the claim", + } + }, + "required": ["stance"], + }, + ) + + stance = response_obj.get("stance", "IRRELEVANT").upper() + if stance not in ["SUPPORTS", "CONTRADICTS", "IRRELEVANT"]: + stance = "IRRELEVANT" + + return stance + + def _format_justification( + self, score: float, claim_evaluations: list[dict[str, Any]] + ) -> str: + """Format detailed justification with claim breakdown.""" + grounded_claims = [c for c in claim_evaluations if c["is_grounded"]] + ungrounded_claims = [c for c in claim_evaluations if not c["is_grounded"]] + + justification = ( + f"Overall Faithfulness: {score:.1f}/100 " + f"({len(grounded_claims)}/{len(claim_evaluations)} claims grounded).\n" + ) + + if claim_evaluations: + justification += "---\n" + + if grounded_claims: + justification += "\n✓ GROUNDED CLAIMS:\n\n" + for i, eval_item in enumerate(grounded_claims, 1): + justification += f'{i}. "{eval_item["claim"]}"\n' + if eval_item["supporting_sources"]: + sources_str = ", ".join(eval_item["supporting_sources"]) + justification += f" Supporting Sources: {sources_str}\n" + justification += "\n" + + if ungrounded_claims: + justification += "\n✗ UNGROUNDED CLAIMS:\n\n" + for i, eval_item in enumerate(ungrounded_claims, 1): + justification += f'{i}. "{eval_item["claim"]}"\n' + if eval_item["contradicting_sources"]: + sources_str = ", ".join(eval_item["contradicting_sources"]) + justification += f" Contradicting Sources: {sources_str}\n" + if ( + not eval_item["supporting_sources"] + and not eval_item["contradicting_sources"] + ): + justification += " No supporting sources found in context.\n" + justification += "\n" + + return justification.rstrip() + + async def _get_structured_llm_response( + self, + evaluation_prompt: str, + schema_name: str, + schema: dict[str, Any], + ) -> dict[str, Any]: + """Get structured LLM response using JSON schema.""" + # Remove community-agents suffix from llm model name + model = clean_model_name(self.model) + + # Prepare the request + request_data = { + "model": model, + "messages": [ + {"role": "system", "content": "Faithfulness Evaluation"}, + {"role": "user", "content": evaluation_prompt}, + ], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": schema_name, + "schema": schema, + }, + }, + } + + assert self.llm, "LLM should be initialized before calling this method." + response = await self.llm.chat_completions(**request_data) + content = response.choices[-1].message.content or "{}" + return json.loads(content) diff --git a/tests/cli/evaluators/test_legacy_context_precision_evaluator.py b/tests/cli/evaluators/test_legacy_context_precision_evaluator.py new file mode 100644 index 000000000..761facad3 --- /dev/null +++ b/tests/cli/evaluators/test_legacy_context_precision_evaluator.py @@ -0,0 +1,313 @@ +"""Tests for LegacyContextPrecisionEvaluator. + +Tests span extraction, chunk normalization, and LLM evaluation. 
+""" + +import json +from types import MappingProxyType +from unittest.mock import AsyncMock, patch + +import pytest + +from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams +from uipath.eval.evaluators import LegacyContextPrecisionEvaluator +from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria +from uipath.eval.models.models import ( + AgentExecution, + LegacyEvaluatorCategory, + LegacyEvaluatorType, +) + + +def _make_base_params() -> EvaluatorBaseParams: + """Create base parameters for context precision evaluator.""" + return EvaluatorBaseParams( + id="context-precision", + category=LegacyEvaluatorCategory.LlmAsAJudge, + evaluator_type=LegacyEvaluatorType.ContextPrecision, + name="Context Precision", + description="Evaluates context chunk relevance", + created_at="2025-01-01T00:00:00Z", + updated_at="2025-01-01T00:00:00Z", + target_output_key="*", + ) + + +@pytest.fixture +def evaluator_with_mocked_llm(): + """Fixture to create evaluator with mocked LLM service.""" + with patch("uipath.platform.UiPath"): + evaluator = LegacyContextPrecisionEvaluator( + **_make_base_params().model_dump(), + config={}, + model="gpt-4.1-2025-04-14", + ) + return evaluator + + +def _make_mock_span(input_query: str, output_chunks: list[str]): + """Create a mock span with context grounding data.""" + + class MockSpan: + def __init__(self): + self.attributes = MappingProxyType( + { + "openinference.span.kind": "RETRIEVER", + "input.mime_type": "text/plain", + "input.value": input_query, + "output.value": json.dumps( + { + "documents": [ + {"id": str(i), "text": chunk} + for i, chunk in enumerate(output_chunks) + ] + } + ), + "output.mime_type": "application/json", + } + ) + + return MockSpan() + + +class TestLegacyContextPrecisionEvaluator: + """Test suite for LegacyContextPrecisionEvaluator.""" + + @pytest.mark.asyncio + async def test_span_extraction_with_valid_data( + self, evaluator_with_mocked_llm + ) -> None: + """Test extraction of context groundings from spans.""" + evaluator = evaluator_with_mocked_llm + + # Create mock span with context grounding data + span = _make_mock_span( + input_query="construction industry", + output_chunks=["Building materials", "Safety codes", "Project management"], + ) + + # Extract context groundings + groundings = evaluator._extract_context_groundings([span]) + + assert len(groundings) == 1 + assert groundings[0]["query"] == "construction industry" + assert len(groundings[0]["chunks"]) == 3 + # Chunks are JSON-serialized because they come from the output + assert any("Building materials" in chunk for chunk in groundings[0]["chunks"]) + + @pytest.mark.asyncio + async def test_span_extraction_skips_invalid_spans( + self, evaluator_with_mocked_llm + ) -> None: + """Test that spans without proper structure are skipped.""" + evaluator = evaluator_with_mocked_llm + + # Create spans: one valid, one invalid + valid_span = _make_mock_span( + input_query="test query", + output_chunks=["chunk1"], + ) + + class InvalidSpan: + attributes = MappingProxyType( + { + "openinference.span.kind": "RETRIEVER", + # Missing input.value and output.value + } + ) + + groundings = evaluator._extract_context_groundings([valid_span, InvalidSpan()]) + + assert len(groundings) == 1 + assert groundings[0]["query"] == "test query" + + @pytest.mark.asyncio + async def test_chunk_normalization(self, evaluator_with_mocked_llm) -> None: + """Test normalization of various chunk formats.""" + evaluator = evaluator_with_mocked_llm + + # Test list of 
strings + chunks = evaluator._normalize_chunks(["chunk1", "chunk2"]) + assert len(chunks) == 2 + assert all(isinstance(c, str) for c in chunks) + + # Test list of dicts + chunks = evaluator._normalize_chunks( + [ + {"id": "1", "text": "content1"}, + {"id": "2", "text": "content2"}, + ] + ) + assert len(chunks) == 2 + assert all(isinstance(c, str) for c in chunks) + + # Test single string + chunks = evaluator._normalize_chunks("single chunk") + assert len(chunks) == 1 + assert chunks[0] == "single chunk" + + @pytest.mark.asyncio + async def test_evaluation_with_mocked_llm(self, evaluator_with_mocked_llm) -> None: + """Test evaluation logic with mocked LLM.""" + evaluator = evaluator_with_mocked_llm + + # Create mock spans + span = _make_mock_span( + input_query="python programming", + output_chunks=[ + "Python syntax guide", + "Python libraries overview", + "JavaScript fundamentals", + ], + ) + + # Extract context groundings from the span + groundings = evaluator._extract_context_groundings([span]) + assert len(groundings) == 1 + assert groundings[0]["query"] == "python programming" + assert len(groundings[0]["chunks"]) == 3 + + # Test the grounding evaluation with mocked LLM response + mock_llm_response = { + "relevancies": [ + {"relevancy_score": 95}, + {"relevancy_score": 75}, + {"relevancy_score": 45}, + ] + } + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_llm_response + + # Evaluate the context grounding + scores = await evaluator._evaluate_context_grounding( + groundings[0]["query"], groundings[0]["chunks"] + ) + + assert scores == [95, 75, 45] + assert abs(sum(scores) / len(scores) - 71.66666667) < 0.01 + + @pytest.mark.asyncio + async def test_evaluation_with_no_context_groundings( + self, evaluator_with_mocked_llm + ) -> None: + """Test evaluation when no context groundings are found.""" + evaluator = evaluator_with_mocked_llm + + # Create empty agent execution (no spans) + agent_execution = AgentExecution( + agent_input={}, + agent_trace=[], + agent_output="", + ) + + result = await evaluator.evaluate( + agent_execution, + evaluation_criteria=LegacyEvaluationCriteria( + expected_output="", + expected_agent_behavior="", + ), + ) + + assert result.score == 0.0 + assert "no context grounding" in result.details.lower() + + @pytest.mark.asyncio + async def test_evaluation_multiple_context_calls( + self, evaluator_with_mocked_llm + ) -> None: + """Test evaluation logic with multiple context grounding calls.""" + evaluator = evaluator_with_mocked_llm + + # Create two mock spans + span1 = _make_mock_span( + input_query="query 1", + output_chunks=["chunk1a", "chunk1b"], + ) + span2 = _make_mock_span( + input_query="query 2", + output_chunks=["chunk2a", "chunk2b", "chunk2c"], + ) + + # Extract context groundings from the spans + groundings = evaluator._extract_context_groundings([span1, span2]) + assert len(groundings) == 2 + assert groundings[0]["query"] == "query 1" + assert groundings[1]["query"] == "query 2" + + # Mock the LLM responses + mock_llm_response_1 = { + "relevancies": [{"relevancy_score": 90}, {"relevancy_score": 80}] + } + mock_llm_response_2 = { + "relevancies": [ + {"relevancy_score": 85}, + {"relevancy_score": 75}, + {"relevancy_score": 65}, + ] + } + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + # Return different responses for each call + mock_llm.side_effect = [mock_llm_response_1, mock_llm_response_2] + + # Evaluate 
both context groundings + scores1 = await evaluator._evaluate_context_grounding( + groundings[0]["query"], groundings[0]["chunks"] + ) + scores2 = await evaluator._evaluate_context_grounding( + groundings[1]["query"], groundings[1]["chunks"] + ) + + # Verify individual scores + assert scores1 == [90, 80] + assert scores2 == [85, 75, 65] + + # Verify means + mean1 = sum(scores1) / len(scores1) # 85 + mean2 = sum(scores2) / len(scores2) # 75 + overall_mean = (mean1 + mean2) / 2 # 80 + assert overall_mean == 80.0 + + @pytest.mark.asyncio + async def test_span_extraction_handles_json_parse_errors( + self, evaluator_with_mocked_llm + ) -> None: + """Test that spans with invalid JSON are skipped.""" + evaluator = evaluator_with_mocked_llm + + class BadJsonSpan: + attributes = MappingProxyType( + { + "openinference.span.kind": "RETRIEVER", + "input.value": "test query", + "output.value": "not valid json", + } + ) + + # Should not raise, should skip the span + groundings = evaluator._extract_context_groundings([BadJsonSpan()]) + assert len(groundings) == 0 + + @pytest.mark.asyncio + async def test_serialization_of_dict_chunks( + self, evaluator_with_mocked_llm + ) -> None: + """Test that dict chunks are properly serialized.""" + evaluator = evaluator_with_mocked_llm + + chunks = evaluator._normalize_chunks( + [ + {"title": "Document 1", "content": "Some content"}, + {"title": "Document 2", "content": "More content"}, + ] + ) + + assert len(chunks) == 2 + assert all(isinstance(c, str) for c in chunks) + # Should be JSON serialized + assert '"title"' in chunks[0] or '"content"' in chunks[0] diff --git a/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py b/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py new file mode 100644 index 000000000..c72ad4f6b --- /dev/null +++ b/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py @@ -0,0 +1,634 @@ +"""Tests for LegacyFaithfulnessEvaluator. + +Tests span extraction, claim extraction (3-stage pipeline), and claim evaluation. 
+""" + +import json +from types import MappingProxyType +from typing import Any +from unittest.mock import AsyncMock, patch + +import pytest + +from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams +from uipath.eval.evaluators import LegacyFaithfulnessEvaluator +from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria +from uipath.eval.models.models import ( + AgentExecution, + LegacyEvaluatorCategory, + LegacyEvaluatorType, +) + + +def _make_base_params() -> EvaluatorBaseParams: + """Create base parameters for faithfulness evaluator.""" + return EvaluatorBaseParams( + id="faithfulness", + category=LegacyEvaluatorCategory.LlmAsAJudge, + evaluator_type=LegacyEvaluatorType.Faithfulness, + name="Faithfulness", + description="Evaluates faithfulness of claims against context", + created_at="2025-01-01T00:00:00Z", + updated_at="2025-01-01T00:00:00Z", + target_output_key="*", + ) + + +@pytest.fixture +def evaluator_with_mocked_llm(): + """Fixture to create evaluator with mocked LLM service.""" + with patch("uipath.platform.UiPath"): + evaluator = LegacyFaithfulnessEvaluator( + **_make_base_params().model_dump(), + config={}, + model="gpt-4.1-2025-04-14", + ) + return evaluator + + +def _make_mock_span(tool_name: str, output_data: dict[str, Any]): + """Create a mock span with tool call data.""" + + class MockSpan: + def __init__(self): + self.attributes = MappingProxyType( + { + "openinference.span.kind": tool_name, + "output.value": json.dumps(output_data), + } + ) + + return MockSpan() + + +def _make_retriever_span(query: str, documents: list[str]): + """Create a mock RETRIEVER span with context grounding data.""" + return _make_mock_span( + "RETRIEVER", + {"documents": [{"id": str(i), "text": doc} for i, doc in enumerate(documents)]}, + ) + + +class TestLegacyFaithfulnessEvaluator: + """Test suite for LegacyFaithfulnessEvaluator.""" + + @pytest.mark.asyncio + async def test_context_source_extraction_from_tool_calls( + self, evaluator_with_mocked_llm + ) -> None: + """Test extraction of context sources from tool call spans.""" + evaluator = evaluator_with_mocked_llm + + # Create mock spans with tool outputs + span1 = _make_mock_span("TOOL_CALL", {"result": "Tool output 1"}) + span2 = _make_mock_span("TOOL_CALL", {"result": "Tool output 2"}) + + # Extract context sources + sources = evaluator._extract_context_sources([span1, span2]) + + assert len(sources) == 2 + assert all("content" in s and "source" in s for s in sources) + + @pytest.mark.asyncio + async def test_context_source_extraction_from_retriever( + self, evaluator_with_mocked_llm + ) -> None: + """Test extraction of context sources from RETRIEVER spans.""" + evaluator = evaluator_with_mocked_llm + + # Create mock RETRIEVER span with documents + span = _make_retriever_span( + "construction", ["Building materials info", "Safety codes"] + ) + + # Extract context sources (should extract each document individually) + sources = evaluator._extract_context_sources([span]) + + assert len(sources) == 2 + assert all(s["source"] == "Context Grounding" for s in sources) + # Check that we have both documents + contents = [s["content"] for s in sources] + assert any("Building materials" in c for c in contents) + assert any("Safety codes" in c for c in contents) + + @pytest.mark.asyncio + async def test_context_source_extraction_skips_invalid_spans( + self, evaluator_with_mocked_llm + ) -> None: + """Test that spans without proper structure are skipped.""" + evaluator = evaluator_with_mocked_llm + + 
class InvalidSpan: + attributes = MappingProxyType( + { + "openinference.span.kind": "TOOL_CALL", + # Missing output.value + } + ) + + # Should skip invalid span + sources = evaluator._extract_context_sources([InvalidSpan()]) + assert len(sources) == 0 + + @pytest.mark.asyncio + async def test_select_verifiable_sentences(self, evaluator_with_mocked_llm) -> None: + """Test Stage 1: Selection of verifiable sentences.""" + evaluator = evaluator_with_mocked_llm + + agent_output = ( + "The capital of France is Paris. Do you agree? This is important." + ) + + mock_response = {"sentences": ["The capital of France is Paris."]} + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + sentences = await evaluator._select_verifiable_sentences(agent_output) + + assert len(sentences) == 1 + assert "capital of France" in sentences[0] + + @pytest.mark.asyncio + async def test_disambiguate_sentences(self, evaluator_with_mocked_llm) -> None: + """Test Stage 2: Disambiguation of sentences.""" + evaluator = evaluator_with_mocked_llm + + verifiable_sentences = ["It is located in Western Europe."] + full_output = "France is a country. It is located in Western Europe." + + mock_response = { + "disambiguated": [ + { + "original": "It is located in Western Europe.", + "disambiguated": "France is located in Western Europe.", + } + ] + } + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + result = await evaluator._disambiguate_sentences( + verifiable_sentences, full_output + ) + + assert len(result) == 1 + assert result[0]["disambiguated"] == "France is located in Western Europe." + + @pytest.mark.asyncio + async def test_decompose_to_claims(self, evaluator_with_mocked_llm) -> None: + """Test Stage 3: Decomposition into standalone claims.""" + evaluator = evaluator_with_mocked_llm + + disambiguated = [ + { + "original": "Paris and Lyon have populations over 1 million.", + "disambiguated": "Paris and Lyon have populations over 1 million.", + } + ] + full_output = ( + "France has major cities. Paris and Lyon have populations over 1 million." + ) + + mock_response = { + "claims": [ + { + "claim": "Paris has a population over 1 million", + "original_sentence": "1", + }, + { + "claim": "Lyon has a population over 1 million", + "original_sentence": "1", + }, + ] + } + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + claims = await evaluator._decompose_to_claims(disambiguated, full_output) + + assert len(claims) == 2 + assert any("Paris" in c["text"] for c in claims) + assert any("Lyon" in c["text"] for c in claims) + + @pytest.mark.asyncio + async def test_evaluate_claim_stance_supports( + self, evaluator_with_mocked_llm + ) -> None: + """Test claim stance evaluation when source supports claim.""" + evaluator = evaluator_with_mocked_llm + + claim = "Paris is the capital of France" + context = "The capital of France is Paris, a major European city." 
+ + mock_response = {"stance": "SUPPORTS"} + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + stance = await evaluator._evaluate_claim_stance(claim, context) + + assert stance == "SUPPORTS" + + @pytest.mark.asyncio + async def test_evaluate_claim_stance_contradicts( + self, evaluator_with_mocked_llm + ) -> None: + """Test claim stance evaluation when source contradicts claim.""" + evaluator = evaluator_with_mocked_llm + + claim = "Paris is in Germany" + context = "Paris is a city in France, not Germany." + + mock_response = {"stance": "CONTRADICTS"} + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + stance = await evaluator._evaluate_claim_stance(claim, context) + + assert stance == "CONTRADICTS" + + @pytest.mark.asyncio + async def test_evaluate_claim_stance_irrelevant( + self, evaluator_with_mocked_llm + ) -> None: + """Test claim stance evaluation when source is irrelevant.""" + evaluator = evaluator_with_mocked_llm + + claim = "Paris is the capital of France" + context = "The Eiffel Tower is made of iron." + + mock_response = {"stance": "IRRELEVANT"} + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + stance = await evaluator._evaluate_claim_stance(claim, context) + + assert stance == "IRRELEVANT" + + @pytest.mark.asyncio + async def test_evaluate_claims_against_context( + self, evaluator_with_mocked_llm + ) -> None: + """Test evaluation of claims against multiple context sources.""" + evaluator = evaluator_with_mocked_llm + + claims = [ + {"text": "Paris is in France", "original_sentence": "1"}, + {"text": "Tokyo is in Japan", "original_sentence": "2"}, + ] + context_sources = [ + {"content": "Paris is the capital of France", "source": "Source 1"}, + {"content": "Tokyo is the capital of Japan", "source": "Source 2"}, + ] + + # Mock stance evaluations + with patch.object( + evaluator, "_evaluate_claim_stance", new_callable=AsyncMock + ) as mock_stance: + # Return SUPPORTS for both claims + mock_stance.side_effect = ["SUPPORTS", "SUPPORTS", "SUPPORTS", "SUPPORTS"] + + evaluations = await evaluator._evaluate_claims_against_context( + claims, context_sources + ) + + assert len(evaluations) == 2 + assert all(e["is_grounded"] for e in evaluations) + assert len(evaluations[0]["supporting_sources"]) == 2 + assert len(evaluations[1]["supporting_sources"]) == 2 + + @pytest.mark.asyncio + async def test_claim_grounding_with_contradicting_source( + self, evaluator_with_mocked_llm + ) -> None: + """Test that claims with contradicting sources are not grounded.""" + evaluator = evaluator_with_mocked_llm + + claims = [ + {"text": "The Earth is flat", "original_sentence": "1"}, + ] + context_sources = [ + {"content": "The Earth is spherical", "source": "Science Source"}, + ] + + with patch.object( + evaluator, "_evaluate_claim_stance", new_callable=AsyncMock + ) as mock_stance: + mock_stance.return_value = "CONTRADICTS" + + evaluations = await evaluator._evaluate_claims_against_context( + claims, context_sources + ) + + assert len(evaluations) == 1 + assert not evaluations[0]["is_grounded"] + assert len(evaluations[0]["contradicting_sources"]) == 1 + + @pytest.mark.asyncio + async def test_full_evaluation_with_no_agent_output( + self, evaluator_with_mocked_llm + ) -> None: + """Test evaluation when no 
agent output is provided.""" + evaluator = evaluator_with_mocked_llm + + agent_execution = AgentExecution( + agent_input={}, + agent_trace=[], + agent_output="", + ) + + result = await evaluator.evaluate( + agent_execution, + evaluation_criteria=LegacyEvaluationCriteria( + expected_output="", + expected_agent_behavior="", + ), + ) + + assert result.score == 0.0 + assert "no agent output" in result.details.lower() + + @pytest.mark.asyncio + async def test_full_evaluation_with_no_context_sources( + self, evaluator_with_mocked_llm + ) -> None: + """Test evaluation when no context sources are available.""" + evaluator = evaluator_with_mocked_llm + + # Create a span without output (no context source) + class NoOutputSpan: + attributes = MappingProxyType( + { + "openinference.span.kind": "TOOL_CALL", + # No output.value + } + ) + + agent_execution = AgentExecution( + agent_input={}, + agent_trace=[], + agent_output="The sky is blue.", + ) + + with patch.object(evaluator, "_extract_context_sources", return_value=[]): + result = await evaluator.evaluate( + agent_execution, + evaluation_criteria=LegacyEvaluationCriteria( + expected_output="The sky is blue.", + expected_agent_behavior="", + ), + ) + + assert result.score == 0.0 + assert "no context sources" in result.details.lower() + + @pytest.mark.asyncio + async def test_full_evaluation_with_no_verifiable_claims( + self, evaluator_with_mocked_llm + ) -> None: + """Test evaluation when no verifiable claims are found.""" + evaluator = evaluator_with_mocked_llm + + agent_execution = AgentExecution( + agent_input={}, + agent_trace=[], + agent_output="Just a greeting.", + ) + + with ( + patch.object( + evaluator, + "_extract_context_sources", + return_value=[{"content": "Some context", "source": "Test"}], + ), + patch.object( + evaluator, "_extract_claims", new_callable=AsyncMock + ) as mock_claims, + ): + mock_claims.return_value = [] + + result = await evaluator.evaluate( + agent_execution, + evaluation_criteria=LegacyEvaluationCriteria( + expected_output="Just a greeting.", + expected_agent_behavior="", + ), + ) + + assert result.score == 100.0 + assert "no verifiable claims" in result.details.lower() + + @pytest.mark.asyncio + async def test_full_evaluation_with_grounded_claims( + self, evaluator_with_mocked_llm + ) -> None: + """Test full evaluation flow with grounded claims.""" + evaluator = evaluator_with_mocked_llm + + agent_execution = AgentExecution( + agent_input={}, + agent_trace=[], + agent_output="Paris is in France.", + ) + + # Mock the extraction and evaluation steps + with ( + patch.object( + evaluator, + "_extract_context_sources", + return_value=[ + { + "content": "Paris is the capital of France", + "source": "Context Grounding", + } + ], + ), + patch.object( + evaluator, "_extract_claims", new_callable=AsyncMock + ) as mock_claims, + patch.object( + evaluator, "_evaluate_claims_against_context", new_callable=AsyncMock + ) as mock_eval, + ): + mock_claims.return_value = [ + {"text": "Paris is in France", "original_sentence": "1"} + ] + mock_eval.return_value = [ + { + "claim": "Paris is in France", + "original_sentence": "1", + "is_grounded": True, + "supporting_sources": ["Context Grounding"], + "contradicting_sources": [], + } + ] + + result = await evaluator.evaluate( + agent_execution, + evaluation_criteria=LegacyEvaluationCriteria( + expected_output="Paris is in France.", + expected_agent_behavior="", + ), + ) + + assert result.score == 100.0 + assert "GROUNDED CLAIMS" in result.details + assert "Paris is in France" in 
result.details + + @pytest.mark.asyncio + async def test_full_evaluation_with_mixed_claims( + self, evaluator_with_mocked_llm + ) -> None: + """Test full evaluation with both grounded and ungrounded claims.""" + evaluator = evaluator_with_mocked_llm + + agent_execution = AgentExecution( + agent_input={}, + agent_trace=[], + agent_output="Paris is in France. The sky is green.", + ) + + # Mock the extraction and evaluation steps + with ( + patch.object( + evaluator, + "_extract_context_sources", + return_value=[ + { + "content": "Paris is the capital of France", + "source": "Context Grounding", + } + ], + ), + patch.object( + evaluator, "_extract_claims", new_callable=AsyncMock + ) as mock_claims, + patch.object( + evaluator, "_evaluate_claims_against_context", new_callable=AsyncMock + ) as mock_eval, + ): + mock_claims.return_value = [ + {"text": "Paris is in France", "original_sentence": "1"}, + {"text": "The sky is green", "original_sentence": "2"}, + ] + mock_eval.return_value = [ + { + "claim": "Paris is in France", + "original_sentence": "1", + "is_grounded": True, + "supporting_sources": ["Context Grounding"], + "contradicting_sources": [], + }, + { + "claim": "The sky is green", + "original_sentence": "2", + "is_grounded": False, + "supporting_sources": [], + "contradicting_sources": [], + }, + ] + + result = await evaluator.evaluate( + agent_execution, + evaluation_criteria=LegacyEvaluationCriteria( + expected_output="Paris is in France. The sky is green.", + expected_agent_behavior="", + ), + ) + + assert result.score == 50.0 + assert "1/2" in result.details + assert "GROUNDED CLAIMS" in result.details + assert "UNGROUNDED CLAIMS" in result.details + + @pytest.mark.asyncio + async def test_serialize_content_handles_various_types( + self, evaluator_with_mocked_llm + ) -> None: + """Test serialization of various content types.""" + evaluator = evaluator_with_mocked_llm + + # Test string + result = evaluator._serialize_content("simple string") + assert result == "simple string" + + # Test dict + result = evaluator._serialize_content({"key": "value"}) + assert "key" in result and "value" in result + + # Test list + result = evaluator._serialize_content(["item1", "item2"]) + assert "item1" in result and "item2" in result + + @pytest.mark.asyncio + async def test_evaluate_claim_stance_with_invalid_response( + self, evaluator_with_mocked_llm + ) -> None: + """Test that invalid stance responses default to IRRELEVANT.""" + evaluator = evaluator_with_mocked_llm + + claim = "Test claim" + context = "Test context" + + mock_response = {"stance": "INVALID_STANCE"} + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + stance = await evaluator._evaluate_claim_stance(claim, context) + + assert stance == "IRRELEVANT" + + @pytest.mark.asyncio + async def test_format_justification_structure( + self, evaluator_with_mocked_llm + ) -> None: + """Test the structure of formatted justification.""" + evaluator = evaluator_with_mocked_llm + + claim_evaluations = [ + { + "claim": "Grounded claim 1", + "original_sentence": "1", + "is_grounded": True, + "supporting_sources": ["Source A", "Source B"], + "contradicting_sources": [], + }, + { + "claim": "Ungrounded claim", + "original_sentence": "2", + "is_grounded": False, + "supporting_sources": [], + "contradicting_sources": ["Source C"], + }, + ] + + justification = evaluator._format_justification(50.0, claim_evaluations) + + assert "Overall Faithfulness: 50.0/100" 
in justification + assert "1/2" in justification + assert "GROUNDED CLAIMS" in justification + assert "UNGROUNDED CLAIMS" in justification + assert "Grounded claim 1" in justification + assert "Ungrounded claim" in justification + assert "Source A" in justification + assert "Source C" in justification diff --git a/uv.lock b/uv.lock index 3f0c5cc09..c4fb23541 100644 --- a/uv.lock +++ b/uv.lock @@ -2486,7 +2486,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.4.6" +version = "2.4.7" source = { editable = "." } dependencies = [ { name = "applicationinsights" },
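The tests above construct the new evaluators directly; the short sketch below shows that same pattern end to end. It is an illustrative example, not part of the patch: the id, name, timestamps, and model value mirror the test fixtures, and it assumes UiPath platform credentials are configured in the environment, since the evaluator instantiates the UiPath LLM client when it is created.

import asyncio

from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams
from uipath.eval.evaluators import LegacyContextPrecisionEvaluator
from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria
from uipath.eval.models.models import (
    AgentExecution,
    LegacyEvaluatorCategory,
    LegacyEvaluatorType,
)

# Illustrative parameters; in practice the evaluator factory builds these from
# the evaluation configuration.
params = EvaluatorBaseParams(
    id="context-precision",
    category=LegacyEvaluatorCategory.LlmAsAJudge,
    evaluator_type=LegacyEvaluatorType.ContextPrecision,
    name="Context Precision",
    description="Evaluates context chunk relevance",
    created_at="2025-01-01T00:00:00Z",
    updated_at="2025-01-01T00:00:00Z",
    target_output_key="*",
)

# Requires UiPath credentials: the evaluator creates the platform LLM client on init.
evaluator = LegacyContextPrecisionEvaluator(
    **params.model_dump(), config={}, model="gpt-4.1-2025-04-14"
)

async def main() -> None:
    # An empty trace yields score 0.0 with a "no context grounding" explanation;
    # real runs pass the RETRIEVER spans captured during agent execution.
    execution = AgentExecution(agent_input={}, agent_trace=[], agent_output="")
    result = await evaluator.evaluate(
        execution,
        evaluation_criteria=LegacyEvaluationCriteria(
            expected_output="", expected_agent_behavior=""
        ),
    )
    print(result.score, result.details)

asyncio.run(main())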