diff --git a/pyproject.toml b/pyproject.toml
index da1cf62eb..9d9e3a14a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "uipath"
-version = "2.4.6"
+version = "2.4.7"
description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.11"
diff --git a/src/uipath/_cli/_evals/_evaluator_factory.py b/src/uipath/_cli/_evals/_evaluator_factory.py
index 253ce36d4..851c1aa0f 100644
--- a/src/uipath/_cli/_evals/_evaluator_factory.py
+++ b/src/uipath/_cli/_evals/_evaluator_factory.py
@@ -22,7 +22,9 @@
from uipath.eval.evaluators import (
BaseEvaluator,
LegacyBaseEvaluator,
+ LegacyContextPrecisionEvaluator,
LegacyExactMatchEvaluator,
+ LegacyFaithfulnessEvaluator,
LegacyJsonSimilarityEvaluator,
LegacyLlmAsAJudgeEvaluator,
LegacyTrajectoryEvaluator,
@@ -68,6 +70,7 @@
ToolCallOutputEvaluator,
ToolCallOutputEvaluatorConfig,
)
+from uipath.eval.models import LegacyEvaluatorType
logger = logging.getLogger(__name__)
@@ -428,11 +431,8 @@ def _create_legacy_json_similarity_evaluator(
def _create_legacy_llm_as_judge_evaluator(
params: LLMEvaluatorParams,
agent_model: str | None = None,
- ) -> LegacyLlmAsAJudgeEvaluator:
- """Create an LLM-as-a-judge evaluator."""
- if not params.prompt:
- raise ValueError("LLM evaluator must include 'prompt' field")
-
+ ) -> LegacyBaseEvaluator[Any]:
+ """Create an LLM-as-a-judge evaluator or context precision evaluator based on type."""
if not params.model:
raise ValueError("LLM evaluator must include 'model' field")
@@ -449,10 +449,16 @@ def _create_legacy_llm_as_judge_evaluator(
)
params = params.model_copy(update={"model": agent_model})
- logger.info(
- f"Creating LLM-as-judge evaluator '{params.name}' with model: {params.model}"
- )
- return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={})
+ # Check evaluator type to determine which evaluator to create
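+ # Context precision and faithfulness evaluators supply their own prompts, so 'prompt' is only required for plain LLM-as-a-judge.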
+ if params.evaluator_type == LegacyEvaluatorType.ContextPrecision:
+ return LegacyContextPrecisionEvaluator(**params.model_dump(), config={})
+ elif params.evaluator_type == LegacyEvaluatorType.Faithfulness:
+ return LegacyFaithfulnessEvaluator(**params.model_dump(), config={})
+ else:
+ if not params.prompt:
+ raise ValueError("LLM evaluator must include 'prompt' field")
+
+ return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={})
@staticmethod
def _create_legacy_trajectory_evaluator(
diff --git a/src/uipath/eval/evaluators/__init__.py b/src/uipath/eval/evaluators/__init__.py
index bc79e071d..248b5d571 100644
--- a/src/uipath/eval/evaluators/__init__.py
+++ b/src/uipath/eval/evaluators/__init__.py
@@ -10,7 +10,9 @@
# Legacy evaluators
from .legacy_base_evaluator import LegacyBaseEvaluator
+from .legacy_context_precision_evaluator import LegacyContextPrecisionEvaluator
from .legacy_exact_match_evaluator import LegacyExactMatchEvaluator
+from .legacy_faithfulness_evaluator import LegacyFaithfulnessEvaluator
from .legacy_json_similarity_evaluator import LegacyJsonSimilarityEvaluator
from .legacy_llm_as_judge_evaluator import LegacyLlmAsAJudgeEvaluator
from .legacy_trajectory_evaluator import LegacyTrajectoryEvaluator
@@ -46,7 +48,9 @@
__all__ = [
# Legacy evaluators
"LegacyBaseEvaluator",
+ "LegacyContextPrecisionEvaluator",
"LegacyExactMatchEvaluator",
+ "LegacyFaithfulnessEvaluator",
"LegacyJsonSimilarityEvaluator",
"LegacyLlmAsAJudgeEvaluator",
"LegacyTrajectoryEvaluator",
diff --git a/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py b/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py
new file mode 100644
index 000000000..88667408c
--- /dev/null
+++ b/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py
@@ -0,0 +1,349 @@
+"""Legacy Context Precision evaluator for assessing the relevance of context chunks to queries."""
+
+import ast
+import json
+from typing import Any, Optional
+
+from uipath.eval.models import NumericEvaluationResult
+
+from ...platform.chat import UiPathLlmChatService
+from ..models.models import AgentExecution, EvaluationResult
+from .legacy_base_evaluator import (
+ LegacyBaseEvaluator,
+ LegacyEvaluationCriteria,
+ LegacyEvaluatorConfig,
+ track_evaluation_metrics,
+)
+from .legacy_evaluator_utils import clean_model_name, serialize_object
+
+
+class LegacyContextPrecisionEvaluatorConfig(LegacyEvaluatorConfig):
+ """Configuration for legacy context precision evaluators."""
+
+ name: str = "LegacyContextPrecisionEvaluator"
+ model: str = ""
+ prompt: str = """You are an expert evaluator assessing the relevance of context chunks to a given query.
+
+TASK: Evaluate how relevant each provided context chunk is to answering the query.
+Your scoring should be deterministic - the same chunk-query pair should always receive the same score.
+
+EVALUATION CRITERIA:
+Score each chunk using the HIGHEST applicable range (if multiple apply, use the highest):
+
+- HIGHLY RELEVANT (80-100) - Directly answers or addresses the query:
+ * 95-100: Contains the exact, complete answer to the query
+ * 85-94: Directly addresses the query with comprehensive information (but not the complete answer)
+ * 80-84: Provides a direct but partial answer to the query
+
+- MODERATELY RELEVANT (50-79) - Provides useful supporting information:
+ * 70-79: Contains substantial supporting information that helps understand the topic
+ * 60-69: Provides relevant context or background information
+ * 50-59: Has some connection to the query but limited usefulness
+
+- SLIGHTLY RELEVANT (20-49) - Contains tangentially related information:
+ * 35-49: Mentions related concepts, terms, or entities from the query
+ * 20-34: Very indirect connection to the query topic
+
+- NOT RELEVANT (0-19) - Has no meaningful connection to the query:
+ * 10-19: Contains some keywords from the query but no meaningful connection
+ * 0-9: Completely unrelated to the query or empty/malformed content
+
+IMPORTANT INSTRUCTIONS:
+1. Evaluate EACH chunk independently - do not let one chunk influence another's score
+2. Base relevance ONLY on how well the chunk helps answer the specific query
+3. Consider semantic meaning, not just keyword matches
+4. If a chunk is empty or malformed, assign a score of 0
+5. Scores must be integers between 0 and 100 inclusive
+6. Be consistent: similar content should receive similar scores
+7. Use the specific sub-ranges above to guide precise scoring
+8. HIERARCHY RULE: If a chunk meets criteria for multiple ranges, always assign the HIGHEST applicable score
+
+OUTPUT FORMAT:
+You MUST respond using the provided tool with a JSON object containing:
+- A "relevancies" field that is an array
+- Each array element must be an object with "relevancy_score" (integer 0-100)
+- The array must have the same number of elements as context chunks provided
+- Order matters: the first score corresponds to the first chunk, etc.
+
+EXAMPLE STRUCTURE (do not copy values, this is just format):
+{
+ "relevancies": [
+ {"relevancy_score": 85},
+ {"relevancy_score": 45},
+ {"relevancy_score": 0}
+ ]
+}
+
+
+{{Query}}
+
+
+
+{{Chunks}}
+
+
+Evaluate each chunk's relevance to the query and respond with the structured output."""
+
+
+class LegacyContextPrecisionEvaluator(
+ LegacyBaseEvaluator[LegacyContextPrecisionEvaluatorConfig]
+):
+ """Legacy evaluator that assesses context precision using an LLM.
+
+ This evaluator extracts context grounding spans from agent execution traces
+ and uses an LLM to score the relevance of each chunk to its corresponding query.
+ The final score is the mean of all chunk relevancy scores, reported on a 0-100 scale.
+ """
+
+ model: str
+ query_placeholder: str = "{{Query}}"
+ chunks_placeholder: str = "{{Chunks}}"
+ llm: Optional[UiPathLlmChatService] = None
+
+ def model_post_init(self, __context: Any):
+ """Initialize the LLM service after model creation."""
+ super().model_post_init(__context)
+ self._initialize_llm()
+
+ def _initialize_llm(self):
+ """Initialize the LLM used for evaluation."""
+ from uipath.platform import UiPath
+
+ uipath = UiPath()
+ self.llm = uipath.llm
+
+ @track_evaluation_metrics
+ async def evaluate(
+ self,
+ agent_execution: AgentExecution,
+ evaluation_criteria: LegacyEvaluationCriteria,
+ ) -> EvaluationResult:
+ """Evaluate context precision from agent execution traces.
+
+ Args:
+ agent_execution: The execution details containing agent_trace with spans
+ evaluation_criteria: Legacy evaluation criteria (unused for context precision)
+
+ Returns:
+ NumericEvaluationResult with a score on a 0-100 scale and detailed justification
+ """
+ # Extract context grounding spans from the trace
+ context_groundings = self._extract_context_groundings(
+ agent_execution.agent_trace
+ )
+
+ if not context_groundings:
+ return NumericEvaluationResult(
+ score=0.0,
+ details="No context grounding tool calls found in the agent execution trace.",
+ )
+
+ # Evaluate each context grounding call
+ all_scores = []
+ evaluation_details = []
+
+ for idx, grounding in enumerate(context_groundings, 1):
+ query = grounding.get("query", "")
+ chunks = grounding.get("chunks", [])
+
+ if not query or not chunks:
+ evaluation_details.append(
+ f"{idx}. Query: (empty) - SKIPPED (no query or chunks)"
+ )
+ continue
+
+ scores = await self._evaluate_context_grounding(query, chunks)
+
+ if scores:
+ mean_score = sum(scores) / len(scores)
+ all_scores.append(mean_score)
+
+ # Format score summaries for this grounding
+ score_summaries = [f"Relevancy: {s:d}/100" for s in scores]
+ evaluation_details.append(
+ f'{idx}. Query: "{query}"\n'
+ f"\tAvg. Score: {mean_score:.1f}/100 ({len(scores)} chunks). "
+ f"Chunk Relevancies: [{', '.join(score_summaries)}]."
+ )
+
+ if not all_scores:
+ return NumericEvaluationResult(
+ score=0.0,
+ details="No valid context chunks were found for evaluation.",
+ )
+
+ # Calculate overall mean score (0-100 range)
+ overall_mean = sum(all_scores) / len(all_scores)
+ overall_mean = max(0, min(100, overall_mean))
+
+ # Build justification
+ justification = f"Overall Context Precision: {overall_mean:.1f}/100 ({len(context_groundings)} Context Tool Call(s) evaluated).\n"
+ if evaluation_details:
+ justification += "---\nPer-Context Tool Call Details:\n\n"
+ justification += "\n\n".join(evaluation_details)
+
+ return NumericEvaluationResult(
+ score=overall_mean,
+ details=justification,
+ )
+
+ def _parse_span_value(self, value_str: str) -> Any:
+ """Parse span value that could be JSON or Python literal syntax.
+
+ Args:
+ value_str: String that could be JSON or Python literal (dict/list)
+
+ Returns:
+ Parsed Python object (dict, list, etc.)
+
+ Raises:
+ ValueError: If string cannot be parsed as JSON or literal
+ """
+ try:
+ # Try JSON first (most common)
+ return json.loads(value_str)
+ except json.JSONDecodeError:
+ try:
+ # Fall back to Python literal_eval for Python syntax
+ return ast.literal_eval(value_str)
+ except (ValueError, SyntaxError) as e:
+ raise ValueError(f"Cannot parse value: {value_str}") from e
+
+ def _extract_context_groundings(
+ self, agent_trace: list[Any]
+ ) -> list[dict[str, Any]]:
+ """Extract context groundings from agent execution trace.
+
+ Looks for spans with input.value and output.value attributes that represent
+ context grounding tool calls.
+ """
+ context_groundings = []
+
+ for span in agent_trace:
+ if not hasattr(span, "attributes") or span.attributes is None:
+ continue
+
+ attrs = span.attributes
+
+ if attrs.get("openinference.span.kind", None) != "RETRIEVER":
+ # NOTE: the same approach could extract any tool call, not just RETRIEVER spans
+ continue
+
+ # Look for spans with input.value and output.value (context grounding calls)
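+ # output.value is expected to hold a JSON payload with a "documents" list; spans that don't match are skipped below.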
+ query = attrs.get("input.value")
+ try:
+ chunks = self._normalize_chunks(
+ json.loads(attrs.get("output.value")).get("documents")
+ )
+
+ if chunks:
+ context_groundings.append(
+ {
+ "query": str(query),
+ "chunks": chunks,
+ }
+ )
+ except (ValueError, KeyError, TypeError):
+ # Skip spans that don't have the expected structure
+ continue
+
+ return context_groundings
+
+ def _normalize_chunks(self, results: Any) -> list[str]:
+ """Normalize various chunk representations to a list of strings."""
+ if isinstance(results, list):
+ return [self._serialize_chunk(chunk) for chunk in results]
+ elif isinstance(results, dict):
+ # Handle dict representations of chunks
+ return [self._serialize_chunk(results)]
+ elif isinstance(results, str):
+ return [results]
+ else:
+ return [str(results)]
+
+ def _serialize_chunk(self, chunk: Any) -> str:
+ """Serialize a single chunk to string format."""
+ return serialize_object(chunk, sort_keys=True)
+
+ async def _evaluate_context_grounding(
+ self, query: str, chunks: list[str]
+ ) -> list[int]:
+ """Evaluate the relevance of chunks to a query using the LLM.
+
+ Args:
+ query: The query string
+ chunks: List of context chunks to evaluate
+
+ Returns:
+ List of relevancy scores (0-100) for each chunk
+ """
+ # Create evaluation prompt
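+ # Chunks are joined in order so the LLM's relevancy array can be mapped back to chunks positionally.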
+ chunks_text = "\n".join(chunks)
+ prompt = self.evaluator_config.prompt.replace(
+ self.query_placeholder, query
+ ).replace(self.chunks_placeholder, chunks_text)
+
+ # Get LLM response
+ response_obj = await self._get_structured_llm_response(prompt)
+
+ # Extract relevancy scores from response
+ relevancies = response_obj.get("relevancies", [])
+ if not relevancies:
+ raise ValueError("No relevancies found in LLM response")
+
+ scores = []
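+ # Entries without a "relevancy_score" key are skipped; a well-formed response has one score per chunk.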
+ for rel in relevancies:
+ if isinstance(rel, dict) and "relevancy_score" in rel:
+ score = rel["relevancy_score"]
+ # Clamp score to 0-100
+ score = max(0, min(100, int(score)))
+ scores.append(score)
+
+ return scores
+
+ async def _get_structured_llm_response(
+ self, evaluation_prompt: str
+ ) -> dict[str, Any]:
+ """Get structured LLM response using the context precision schema."""
+ # Remove community-agents suffix from llm model name
+ model = clean_model_name(self.model)
+
+ # Prepare the request
+ request_data = {
+ "model": model,
+ "messages": [
+ {"role": "system", "content": "Context Precision Evaluation"},
+ {"role": "user", "content": evaluation_prompt},
+ ],
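+ # Request structured JSON output conforming to the relevancy schema defined below.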
+ "response_format": {
+ "type": "json_schema",
+ "json_schema": {
+ "name": "context_precision_evaluation",
+ "schema": {
+ "type": "object",
+ "properties": {
+ "relevancies": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "relevancy_score": {
+ "type": "number",
+ "description": "Relevancy score for the chunk (0-100).",
+ }
+ },
+ "required": ["relevancy_score"],
+ },
+ "description": "List of relevancy scores for each context chunk",
+ }
+ },
+ "required": ["relevancies"],
+ },
+ },
+ },
+ }
+
+ assert self.llm, "LLM should be initialized before calling this method."
+ response = await self.llm.chat_completions(**request_data)
+ content = response.choices[-1].message.content or "{}"
+ return json.loads(content)
diff --git a/src/uipath/eval/evaluators/legacy_evaluator_utils.py b/src/uipath/eval/evaluators/legacy_evaluator_utils.py
new file mode 100644
index 000000000..b8c20f372
--- /dev/null
+++ b/src/uipath/eval/evaluators/legacy_evaluator_utils.py
@@ -0,0 +1,75 @@
+"""Utility functions for legacy evaluators."""
+
+import json
+from typing import Any, Optional
+
+from ..._utils.constants import COMMUNITY_agents_SUFFIX
+
+
+def clean_model_name(model: str) -> str:
+ """Remove community-agents suffix from model name.
+
+ Args:
+ model: Model name that may have the community suffix
+
+ Returns:
+ Model name without the community suffix
+ """
+ if model.endswith(COMMUNITY_agents_SUFFIX):
+ return model.replace(COMMUNITY_agents_SUFFIX, "")
+ return model
+
+
+def serialize_object(
+ content: Any,
+ sort_keys: bool = False,
+) -> str:
+ """Serialize content to string format.
+
+ Args:
+ content: Content to serialize (str, dict, list, etc.)
+ sort_keys: Whether to sort dict keys (default: False)
+
+ Returns:
+ Serialized string representation
+ """
+ if isinstance(content, str):
+ return content
+ elif isinstance(content, dict):
+ if sort_keys:
+ content = dict(sorted(content.items()))
+ return json.dumps(content, default=str, separators=(",", ":"))
+ else:
+ return json.dumps(content, default=str, separators=(",", ":"))
+
+
+def safe_get_span_attributes(span: Any) -> Optional[dict[str, Any]]:
+ """Safely extract attributes from a span.
+
+ Args:
+ span: The span object
+
+ Returns:
+ Span attributes dict, or None if not available
+ """
+ if not hasattr(span, "attributes") or span.attributes is None:
+ return None
+ return span.attributes
+
+
+def parse_json_value(value: str) -> Any:
+ """Safely parse a JSON string value.
+
+ Args:
+ value: JSON string to parse
+
+ Returns:
+ Parsed JSON object
+
+ Raises:
+ ValueError: If string cannot be parsed as JSON
+ """
+ try:
+ return json.loads(value)
+ except json.JSONDecodeError as e:
+ raise ValueError(f"Cannot parse JSON value: {value}") from e
diff --git a/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py b/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py
new file mode 100644
index 000000000..ac0373384
--- /dev/null
+++ b/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py
@@ -0,0 +1,513 @@
+"""Legacy Faithfulness evaluator for assessing whether agent output claims are grounded in context."""
+
+import json
+from typing import Any, Optional
+
+from uipath.eval.models import NumericEvaluationResult
+from uipath.platform.chat import UiPathLlmChatService
+
+from ..models.models import AgentExecution, EvaluationResult
+from .legacy_base_evaluator import (
+ LegacyBaseEvaluator,
+ LegacyEvaluationCriteria,
+ LegacyEvaluatorConfig,
+ track_evaluation_metrics,
+)
+from .legacy_evaluator_utils import (
+ clean_model_name,
+ serialize_object,
+)
+
+
+class LegacyFaithfulnessEvaluatorConfig(LegacyEvaluatorConfig):
+ """Configuration for legacy faithfulness evaluators."""
+
+ name: str = "LegacyFaithfulnessEvaluator"
+ model: str = ""
+
+
+class LegacyFaithfulnessEvaluator(
+ LegacyBaseEvaluator[LegacyFaithfulnessEvaluatorConfig]
+):
+ """Legacy evaluator that assesses faithfulness using an LLM.
+
+ This evaluator extracts claims from agent output using a 3-stage pipeline
+ (selection, disambiguation, decomposition) and evaluates whether each claim
+ is grounded in the available context sources extracted from agent traces.
+ The final score is the percentage of claims that are grounded.
+ """
+
+ model: str
+ llm: Optional[UiPathLlmChatService] = None
+
+ def model_post_init(self, __context: Any):
+ """Initialize the LLM service after model creation."""
+ super().model_post_init(__context)
+ self._initialize_llm()
+
+ def _initialize_llm(self):
+ """Initialize the LLM used for evaluation."""
+ from uipath.platform import UiPath
+
+ uipath = UiPath()
+ self.llm = uipath.llm
+
+ @track_evaluation_metrics
+ async def evaluate(
+ self,
+ agent_execution: AgentExecution,
+ evaluation_criteria: LegacyEvaluationCriteria,
+ ) -> EvaluationResult:
+ """Evaluate faithfulness of agent output against available context.
+
+ Args:
+ agent_execution: The execution details containing agent_trace with spans
+ evaluation_criteria: Legacy evaluation criteria containing expected_output
+
+ Returns:
+ NumericEvaluationResult with normalized score (0-100) and detailed justification
+ """
+ # The text under evaluation comes from the evaluation criteria's expected_output field
+ agent_output = str(evaluation_criteria.expected_output or "")
+ if not agent_output or not agent_output.strip():
+ return NumericEvaluationResult(
+ score=0.0,
+ details="No agent output provided for faithfulness evaluation.",
+ )
+
+ # Extract context sources from traces
+ context_sources = self._extract_context_sources(agent_execution.agent_trace)
+
+ if not context_sources:
+ return NumericEvaluationResult(
+ score=0.0,
+ details="No context sources found in the agent execution trace.",
+ )
+
+ # Extract verifiable claims from the output (3-stage selection/disambiguation/decomposition pipeline)
+ claims = await self._extract_claims(agent_output)
+
+ if not claims:
+ return NumericEvaluationResult(
+ score=100.0,
+ details="No verifiable claims found in agent output.",
+ )
+
+ # Evaluate each extracted claim against the collected context sources
+ claim_evaluations = await self._evaluate_claims_against_context(
+ claims, context_sources
+ )
+
+ # Calculate score
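+ # Score = percentage of claims that are grounded (supported by at least one source and contradicted by none).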
+ grounded_claims = [c for c in claim_evaluations if c["is_grounded"]]
+ score = (
+ (len(grounded_claims) / len(claim_evaluations)) * 100
+ if claim_evaluations
+ else 0.0
+ )
+ score = max(0, min(100, score))
+
+ # Build justification
+ justification = self._format_justification(score, claim_evaluations)
+
+ return NumericEvaluationResult(
+ score=score,
+ details=justification,
+ )
+
+ def _extract_context_sources(self, agent_trace: list[Any]) -> list[dict[str, str]]:
+ """Extract context sources from agent execution trace.
+
+ Looks for tool call outputs and context grounding spans that provide context.
+
+ Returns:
+ List of context source dicts with 'content' and 'source' keys
+ """
+ context_sources = []
+
+ for span in agent_trace:
+ if not hasattr(span, "attributes") or span.attributes is None:
+ continue
+
+ attrs = span.attributes
+
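+ # The span kind doubles as the source label for non-RETRIEVER spans; RETRIEVER spans are labeled "Context Grounding".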
+ tool_name = attrs.get("openinference.span.kind")
+ if not tool_name or tool_name == "UNKNOWN":
+ continue
+
+ output_value = attrs.get("output.value")
+ if not output_value:
+ continue
+
+ try:
+ output_data = (
+ json.loads(output_value)
+ if isinstance(output_value, str)
+ else output_value
+ )
+
+ # For RETRIEVER spans, extract individual documents
+ if tool_name == "RETRIEVER":
+ documents = output_data.get("documents", [])
+ if documents:
+ for doc in documents:
+ content = self._serialize_content(doc)
+ context_sources.append(
+ {"content": content, "source": "Context Grounding"}
+ )
+ else:
+ # For other tool calls, extract the full output
+ content = self._serialize_content(output_data)
+ context_sources.append({"content": content, "source": tool_name})
+ except (ValueError, TypeError):
+ continue
+
+ return context_sources
+
+ def _serialize_content(self, content: Any) -> str:
+ """Serialize content to string format."""
+ return serialize_object(content, sort_keys=False)
+
+ async def _extract_claims(self, agent_output: str) -> list[dict[str, str]]:
+ """Extract verifiable claims from agent output using 3-stage pipeline.
+
+ Stages:
+ 1. Selection: Filter to verifiable sentences
+ 2. Disambiguation: Resolve internal ambiguities
+ 3. Decomposition: Extract standalone claims
+
+ Returns:
+ List of claim dicts with 'text' and 'original_sentence' keys
+ """
+ # Stage 1: Selection
+ verifiable_sentences = await self._select_verifiable_sentences(agent_output)
+ if not verifiable_sentences:
+ return []
+
+ # Stage 2: Disambiguation
+ disambiguated_sentences = await self._disambiguate_sentences(
+ verifiable_sentences, agent_output
+ )
+ if not disambiguated_sentences:
+ return []
+
+ # Stage 3: Decomposition
+ claims = await self._decompose_to_claims(disambiguated_sentences, agent_output)
+ return claims
+
+ async def _select_verifiable_sentences(self, agent_output: str) -> list[str]:
+ """Stage 1: Filter agent output to verifiable sentences."""
+ prompt = f"""You are an expert evaluator identifying verifiable claims.
+
+TASK: Identify sentences in the agent output that contain verifiable, factual claims.
+Filter out subjective opinions, instructions, questions, and meta-commentary.
+
+OUTPUT FORMAT: Return a JSON object with a "sentences" field containing an array of strings.
+Each string should be a complete sentence from the original output.
+
+
+{agent_output}
+
+
+Identify and return only the verifiable sentences."""
+
+ response_obj = await self._get_structured_llm_response(
+ prompt,
+ schema_name="claim_selection",
+ schema={
+ "type": "object",
+ "properties": {
+ "sentences": {
+ "type": "array",
+ "items": {"type": "string"},
+ "description": "List of verifiable sentences from agent output",
+ }
+ },
+ "required": ["sentences"],
+ },
+ )
+
+ return response_obj.get("sentences", [])
+
+ async def _disambiguate_sentences(
+ self, sentences: list[str], full_output: str
+ ) -> list[dict[str, str]]:
+ """Stage 2: Resolve ambiguities in sentences."""
+ if not sentences:
+ return []
+
+ sentences_text = "\n".join(f"{i + 1}. {s}" for i, s in enumerate(sentences))
+
+ prompt = f"""You are an expert at disambiguating claims.
+
+TASK: For each sentence, resolve any internal ambiguities using the full agent output as context.
+Replace pronouns, references, and implicit information with explicit, standalone versions.
+
+
+{full_output}
+
+
+
+{sentences_text}
+
+
+OUTPUT FORMAT: Return a JSON object with a "disambiguated" field containing an array of objects.
+Each object must have:
+- "original": the original sentence
+- "disambiguated": the disambiguated version"""
+
+ response_obj = await self._get_structured_llm_response(
+ prompt,
+ schema_name="claim_disambiguation",
+ schema={
+ "type": "object",
+ "properties": {
+ "disambiguated": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "original": {"type": "string"},
+ "disambiguated": {"type": "string"},
+ },
+ "required": ["original", "disambiguated"],
+ },
+ "description": "List of disambiguated sentences",
+ }
+ },
+ "required": ["disambiguated"],
+ },
+ )
+
+ return response_obj.get("disambiguated", [])
+
+ async def _decompose_to_claims(
+ self, disambiguated: list[dict[str, str]], full_output: str
+ ) -> list[dict[str, str]]:
+ """Stage 3: Decompose sentences into standalone verifiable claims."""
+ if not disambiguated:
+ return []
+
+ sentences_text = "\n".join(
+ f"{i + 1}. {item.get('disambiguated', '')}"
+ for i, item in enumerate(disambiguated)
+ )
+
+ prompt = f"""You are an expert at claim decomposition.
+
+TASK: Break down each sentence into standalone, atomic claims that can be independently verified.
+Each claim should be self-contained and not depend on other claims for context.
+
+
+{sentences_text}
+
+
+
+{full_output}
+
+
+OUTPUT FORMAT: Return a JSON object with a "claims" field containing an array of objects.
+Each object must have:
+- "claim": the standalone claim
+- "original_sentence": which sentence it came from (number)"""
+
+ response_obj = await self._get_structured_llm_response(
+ prompt,
+ schema_name="claim_decomposition",
+ schema={
+ "type": "object",
+ "properties": {
+ "claims": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "claim": {"type": "string"},
+ "original_sentence": {"type": "string"},
+ },
+ "required": ["claim", "original_sentence"],
+ },
+ "description": "List of decomposed claims",
+ }
+ },
+ "required": ["claims"],
+ },
+ )
+
+ claims_data = response_obj.get("claims", [])
+ return [
+ {
+ "text": c.get("claim", ""),
+ "original_sentence": c.get("original_sentence", ""),
+ }
+ for c in claims_data
+ if c.get("claim", "").strip()
+ ]
+
+ async def _evaluate_claims_against_context(
+ self,
+ claims: list[dict[str, str]],
+ context_sources: list[dict[str, str]],
+ ) -> list[dict[str, Any]]:
+ """Evaluate each claim against context sources.
+
+ Returns:
+ List of claim evaluations with grounding status and source attribution
+ """
+ claim_evaluations = []
+
+ for claim in claims:
+ claim_text = claim.get("text", "")
+ if not claim_text.strip():
+ continue
+
+ supporting_sources = []
+ contradicting_sources = []
+
+ # Evaluate claim against each context source
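+ # Note: this issues one LLM call per (claim, context source) pair.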
+ for source in context_sources:
+ source_content = source.get("content", "")
+ source_name = source.get("source", "Unknown")
+
+ stance = await self._evaluate_claim_stance(claim_text, source_content)
+
+ if stance == "SUPPORTS":
+ supporting_sources.append(source_name)
+ elif stance == "CONTRADICTS":
+ contradicting_sources.append(source_name)
+
+ # A claim is grounded if it has supporting sources and no contradicting ones
+ is_grounded = (
+ len(supporting_sources) > 0 and len(contradicting_sources) == 0
+ )
+
+ claim_evaluations.append(
+ {
+ "claim": claim_text,
+ "original_sentence": claim.get("original_sentence", ""),
+ "is_grounded": is_grounded,
+ "supporting_sources": supporting_sources,
+ "contradicting_sources": contradicting_sources,
+ }
+ )
+
+ return claim_evaluations
+
+ async def _evaluate_claim_stance(self, claim: str, context: str) -> str:
+ """Evaluate whether a context source supports, contradicts, or is irrelevant to a claim.
+
+ Returns:
+ One of: "SUPPORTS", "CONTRADICTS", "IRRELEVANT"
+ """
+ prompt = f"""You are an expert evaluator assessing the relationship between claims and sources.
+
+TASK: Determine if the source supports, contradicts, or is irrelevant to the claim.
+
+DEFINITION:
+- SUPPORTS: The source provides evidence that makes the claim more likely to be true
+- CONTRADICTS: The source provides evidence that makes the claim false or less likely
+- IRRELEVANT: The source does not address the claim at all
+
+
+{claim}
+
+
+
+{context}
+
+
+OUTPUT FORMAT: Return a JSON object with a "stance" field.
+The stance must be exactly one of: "SUPPORTS", "CONTRADICTS", or "IRRELEVANT"."""
+
+ response_obj = await self._get_structured_llm_response(
+ prompt,
+ schema_name="claim_stance_evaluation",
+ schema={
+ "type": "object",
+ "properties": {
+ "stance": {
+ "type": "string",
+ "enum": ["SUPPORTS", "CONTRADICTS", "IRRELEVANT"],
+ "description": "Stance of the source relative to the claim",
+ }
+ },
+ "required": ["stance"],
+ },
+ )
+
+ stance = response_obj.get("stance", "IRRELEVANT").upper()
+ if stance not in ["SUPPORTS", "CONTRADICTS", "IRRELEVANT"]:
+ stance = "IRRELEVANT"
+
+ return stance
+
+ def _format_justification(
+ self, score: float, claim_evaluations: list[dict[str, Any]]
+ ) -> str:
+ """Format detailed justification with claim breakdown."""
+ grounded_claims = [c for c in claim_evaluations if c["is_grounded"]]
+ ungrounded_claims = [c for c in claim_evaluations if not c["is_grounded"]]
+
+ justification = (
+ f"Overall Faithfulness: {score:.1f}/100 "
+ f"({len(grounded_claims)}/{len(claim_evaluations)} claims grounded).\n"
+ )
+
+ if claim_evaluations:
+ justification += "---\n"
+
+ if grounded_claims:
+ justification += "\n✓ GROUNDED CLAIMS:\n\n"
+ for i, eval_item in enumerate(grounded_claims, 1):
+ justification += f'{i}. "{eval_item["claim"]}"\n'
+ if eval_item["supporting_sources"]:
+ sources_str = ", ".join(eval_item["supporting_sources"])
+ justification += f" Supporting Sources: {sources_str}\n"
+ justification += "\n"
+
+ if ungrounded_claims:
+ justification += "\n✗ UNGROUNDED CLAIMS:\n\n"
+ for i, eval_item in enumerate(ungrounded_claims, 1):
+ justification += f'{i}. "{eval_item["claim"]}"\n'
+ if eval_item["contradicting_sources"]:
+ sources_str = ", ".join(eval_item["contradicting_sources"])
+ justification += f" Contradicting Sources: {sources_str}\n"
+ if (
+ not eval_item["supporting_sources"]
+ and not eval_item["contradicting_sources"]
+ ):
+ justification += " No supporting sources found in context.\n"
+ justification += "\n"
+
+ return justification.rstrip()
+
+ async def _get_structured_llm_response(
+ self,
+ evaluation_prompt: str,
+ schema_name: str,
+ schema: dict[str, Any],
+ ) -> dict[str, Any]:
+ """Get structured LLM response using JSON schema."""
+ # Remove community-agents suffix from llm model name
+ model = clean_model_name(self.model)
+
+ # Prepare the request
+ request_data = {
+ "model": model,
+ "messages": [
+ {"role": "system", "content": "Faithfulness Evaluation"},
+ {"role": "user", "content": evaluation_prompt},
+ ],
+ "response_format": {
+ "type": "json_schema",
+ "json_schema": {
+ "name": schema_name,
+ "schema": schema,
+ },
+ },
+ }
+
+ assert self.llm, "LLM should be initialized before calling this method."
+ response = await self.llm.chat_completions(**request_data)
+ content = response.choices[-1].message.content or "{}"
+ return json.loads(content)
diff --git a/tests/cli/evaluators/test_legacy_context_precision_evaluator.py b/tests/cli/evaluators/test_legacy_context_precision_evaluator.py
new file mode 100644
index 000000000..761facad3
--- /dev/null
+++ b/tests/cli/evaluators/test_legacy_context_precision_evaluator.py
@@ -0,0 +1,313 @@
+"""Tests for LegacyContextPrecisionEvaluator.
+
+Tests span extraction, chunk normalization, and LLM evaluation.
+"""
+
+import json
+from types import MappingProxyType
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams
+from uipath.eval.evaluators import LegacyContextPrecisionEvaluator
+from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria
+from uipath.eval.models.models import (
+ AgentExecution,
+ LegacyEvaluatorCategory,
+ LegacyEvaluatorType,
+)
+
+
+def _make_base_params() -> EvaluatorBaseParams:
+ """Create base parameters for context precision evaluator."""
+ return EvaluatorBaseParams(
+ id="context-precision",
+ category=LegacyEvaluatorCategory.LlmAsAJudge,
+ evaluator_type=LegacyEvaluatorType.ContextPrecision,
+ name="Context Precision",
+ description="Evaluates context chunk relevance",
+ created_at="2025-01-01T00:00:00Z",
+ updated_at="2025-01-01T00:00:00Z",
+ target_output_key="*",
+ )
+
+
+@pytest.fixture
+def evaluator_with_mocked_llm():
+ """Fixture to create evaluator with mocked LLM service."""
+ with patch("uipath.platform.UiPath"):
+ evaluator = LegacyContextPrecisionEvaluator(
+ **_make_base_params().model_dump(),
+ config={},
+ model="gpt-4.1-2025-04-14",
+ )
+ return evaluator
+
+
+def _make_mock_span(input_query: str, output_chunks: list[str]):
+ """Create a mock span with context grounding data."""
+
+ class MockSpan:
+ def __init__(self):
+ self.attributes = MappingProxyType(
+ {
+ "openinference.span.kind": "RETRIEVER",
+ "input.mime_type": "text/plain",
+ "input.value": input_query,
+ "output.value": json.dumps(
+ {
+ "documents": [
+ {"id": str(i), "text": chunk}
+ for i, chunk in enumerate(output_chunks)
+ ]
+ }
+ ),
+ "output.mime_type": "application/json",
+ }
+ )
+
+ return MockSpan()
+
+
+class TestLegacyContextPrecisionEvaluator:
+ """Test suite for LegacyContextPrecisionEvaluator."""
+
+ @pytest.mark.asyncio
+ async def test_span_extraction_with_valid_data(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test extraction of context groundings from spans."""
+ evaluator = evaluator_with_mocked_llm
+
+ # Create mock span with context grounding data
+ span = _make_mock_span(
+ input_query="construction industry",
+ output_chunks=["Building materials", "Safety codes", "Project management"],
+ )
+
+ # Extract context groundings
+ groundings = evaluator._extract_context_groundings([span])
+
+ assert len(groundings) == 1
+ assert groundings[0]["query"] == "construction industry"
+ assert len(groundings[0]["chunks"]) == 3
+ # Chunks are JSON-serialized because they come from the output
+ assert any("Building materials" in chunk for chunk in groundings[0]["chunks"])
+
+ @pytest.mark.asyncio
+ async def test_span_extraction_skips_invalid_spans(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test that spans without proper structure are skipped."""
+ evaluator = evaluator_with_mocked_llm
+
+ # Create spans: one valid, one invalid
+ valid_span = _make_mock_span(
+ input_query="test query",
+ output_chunks=["chunk1"],
+ )
+
+ class InvalidSpan:
+ attributes = MappingProxyType(
+ {
+ "openinference.span.kind": "RETRIEVER",
+ # Missing input.value and output.value
+ }
+ )
+
+ groundings = evaluator._extract_context_groundings([valid_span, InvalidSpan()])
+
+ assert len(groundings) == 1
+ assert groundings[0]["query"] == "test query"
+
+ @pytest.mark.asyncio
+ async def test_chunk_normalization(self, evaluator_with_mocked_llm) -> None:
+ """Test normalization of various chunk formats."""
+ evaluator = evaluator_with_mocked_llm
+
+ # Test list of strings
+ chunks = evaluator._normalize_chunks(["chunk1", "chunk2"])
+ assert len(chunks) == 2
+ assert all(isinstance(c, str) for c in chunks)
+
+ # Test list of dicts
+ chunks = evaluator._normalize_chunks(
+ [
+ {"id": "1", "text": "content1"},
+ {"id": "2", "text": "content2"},
+ ]
+ )
+ assert len(chunks) == 2
+ assert all(isinstance(c, str) for c in chunks)
+
+ # Test single string
+ chunks = evaluator._normalize_chunks("single chunk")
+ assert len(chunks) == 1
+ assert chunks[0] == "single chunk"
+
+ @pytest.mark.asyncio
+ async def test_evaluation_with_mocked_llm(self, evaluator_with_mocked_llm) -> None:
+ """Test evaluation logic with mocked LLM."""
+ evaluator = evaluator_with_mocked_llm
+
+ # Create mock spans
+ span = _make_mock_span(
+ input_query="python programming",
+ output_chunks=[
+ "Python syntax guide",
+ "Python libraries overview",
+ "JavaScript fundamentals",
+ ],
+ )
+
+ # Extract context groundings from the span
+ groundings = evaluator._extract_context_groundings([span])
+ assert len(groundings) == 1
+ assert groundings[0]["query"] == "python programming"
+ assert len(groundings[0]["chunks"]) == 3
+
+ # Test the grounding evaluation with mocked LLM response
+ mock_llm_response = {
+ "relevancies": [
+ {"relevancy_score": 95},
+ {"relevancy_score": 75},
+ {"relevancy_score": 45},
+ ]
+ }
+
+ with patch.object(
+ evaluator, "_get_structured_llm_response", new_callable=AsyncMock
+ ) as mock_llm:
+ mock_llm.return_value = mock_llm_response
+
+ # Evaluate the context grounding
+ scores = await evaluator._evaluate_context_grounding(
+ groundings[0]["query"], groundings[0]["chunks"]
+ )
+
+ assert scores == [95, 75, 45]
+ assert abs(sum(scores) / len(scores) - 71.66666667) < 0.01
+
+ @pytest.mark.asyncio
+ async def test_evaluation_with_no_context_groundings(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test evaluation when no context groundings are found."""
+ evaluator = evaluator_with_mocked_llm
+
+ # Create empty agent execution (no spans)
+ agent_execution = AgentExecution(
+ agent_input={},
+ agent_trace=[],
+ agent_output="",
+ )
+
+ result = await evaluator.evaluate(
+ agent_execution,
+ evaluation_criteria=LegacyEvaluationCriteria(
+ expected_output="",
+ expected_agent_behavior="",
+ ),
+ )
+
+ assert result.score == 0.0
+ assert "no context grounding" in result.details.lower()
+
+ @pytest.mark.asyncio
+ async def test_evaluation_multiple_context_calls(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test evaluation logic with multiple context grounding calls."""
+ evaluator = evaluator_with_mocked_llm
+
+ # Create two mock spans
+ span1 = _make_mock_span(
+ input_query="query 1",
+ output_chunks=["chunk1a", "chunk1b"],
+ )
+ span2 = _make_mock_span(
+ input_query="query 2",
+ output_chunks=["chunk2a", "chunk2b", "chunk2c"],
+ )
+
+ # Extract context groundings from the spans
+ groundings = evaluator._extract_context_groundings([span1, span2])
+ assert len(groundings) == 2
+ assert groundings[0]["query"] == "query 1"
+ assert groundings[1]["query"] == "query 2"
+
+ # Mock the LLM responses
+ mock_llm_response_1 = {
+ "relevancies": [{"relevancy_score": 90}, {"relevancy_score": 80}]
+ }
+ mock_llm_response_2 = {
+ "relevancies": [
+ {"relevancy_score": 85},
+ {"relevancy_score": 75},
+ {"relevancy_score": 65},
+ ]
+ }
+
+ with patch.object(
+ evaluator, "_get_structured_llm_response", new_callable=AsyncMock
+ ) as mock_llm:
+ # Return different responses for each call
+ mock_llm.side_effect = [mock_llm_response_1, mock_llm_response_2]
+
+ # Evaluate both context groundings
+ scores1 = await evaluator._evaluate_context_grounding(
+ groundings[0]["query"], groundings[0]["chunks"]
+ )
+ scores2 = await evaluator._evaluate_context_grounding(
+ groundings[1]["query"], groundings[1]["chunks"]
+ )
+
+ # Verify individual scores
+ assert scores1 == [90, 80]
+ assert scores2 == [85, 75, 65]
+
+ # Verify means
+ mean1 = sum(scores1) / len(scores1) # 85
+ mean2 = sum(scores2) / len(scores2) # 75
+ overall_mean = (mean1 + mean2) / 2 # 80
+ assert overall_mean == 80.0
+
+ @pytest.mark.asyncio
+ async def test_span_extraction_handles_json_parse_errors(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test that spans with invalid JSON are skipped."""
+ evaluator = evaluator_with_mocked_llm
+
+ class BadJsonSpan:
+ attributes = MappingProxyType(
+ {
+ "openinference.span.kind": "RETRIEVER",
+ "input.value": "test query",
+ "output.value": "not valid json",
+ }
+ )
+
+ # Should not raise, should skip the span
+ groundings = evaluator._extract_context_groundings([BadJsonSpan()])
+ assert len(groundings) == 0
+
+ @pytest.mark.asyncio
+ async def test_serialization_of_dict_chunks(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test that dict chunks are properly serialized."""
+ evaluator = evaluator_with_mocked_llm
+
+ chunks = evaluator._normalize_chunks(
+ [
+ {"title": "Document 1", "content": "Some content"},
+ {"title": "Document 2", "content": "More content"},
+ ]
+ )
+
+ assert len(chunks) == 2
+ assert all(isinstance(c, str) for c in chunks)
+ # Should be JSON serialized
+ assert '"title"' in chunks[0] or '"content"' in chunks[0]
diff --git a/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py b/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py
new file mode 100644
index 000000000..c72ad4f6b
--- /dev/null
+++ b/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py
@@ -0,0 +1,634 @@
+"""Tests for LegacyFaithfulnessEvaluator.
+
+Tests span extraction, claim extraction (3-stage pipeline), and claim evaluation.
+"""
+
+import json
+from types import MappingProxyType
+from typing import Any
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams
+from uipath.eval.evaluators import LegacyFaithfulnessEvaluator
+from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria
+from uipath.eval.models.models import (
+ AgentExecution,
+ LegacyEvaluatorCategory,
+ LegacyEvaluatorType,
+)
+
+
+def _make_base_params() -> EvaluatorBaseParams:
+ """Create base parameters for faithfulness evaluator."""
+ return EvaluatorBaseParams(
+ id="faithfulness",
+ category=LegacyEvaluatorCategory.LlmAsAJudge,
+ evaluator_type=LegacyEvaluatorType.Faithfulness,
+ name="Faithfulness",
+ description="Evaluates faithfulness of claims against context",
+ created_at="2025-01-01T00:00:00Z",
+ updated_at="2025-01-01T00:00:00Z",
+ target_output_key="*",
+ )
+
+
+@pytest.fixture
+def evaluator_with_mocked_llm():
+ """Fixture to create evaluator with mocked LLM service."""
+ with patch("uipath.platform.UiPath"):
+ evaluator = LegacyFaithfulnessEvaluator(
+ **_make_base_params().model_dump(),
+ config={},
+ model="gpt-4.1-2025-04-14",
+ )
+ return evaluator
+
+
+def _make_mock_span(tool_name: str, output_data: dict[str, Any]):
+ """Create a mock span with tool call data."""
+
+ class MockSpan:
+ def __init__(self):
+ self.attributes = MappingProxyType(
+ {
+ "openinference.span.kind": tool_name,
+ "output.value": json.dumps(output_data),
+ }
+ )
+
+ return MockSpan()
+
+
+def _make_retriever_span(query: str, documents: list[str]):
+ """Create a mock RETRIEVER span with context grounding data."""
+ return _make_mock_span(
+ "RETRIEVER",
+ {"documents": [{"id": str(i), "text": doc} for i, doc in enumerate(documents)]},
+ )
+
+
+class TestLegacyFaithfulnessEvaluator:
+ """Test suite for LegacyFaithfulnessEvaluator."""
+
+ @pytest.mark.asyncio
+ async def test_context_source_extraction_from_tool_calls(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test extraction of context sources from tool call spans."""
+ evaluator = evaluator_with_mocked_llm
+
+ # Create mock spans with tool outputs
+ span1 = _make_mock_span("TOOL_CALL", {"result": "Tool output 1"})
+ span2 = _make_mock_span("TOOL_CALL", {"result": "Tool output 2"})
+
+ # Extract context sources
+ sources = evaluator._extract_context_sources([span1, span2])
+
+ assert len(sources) == 2
+ assert all("content" in s and "source" in s for s in sources)
+
+ @pytest.mark.asyncio
+ async def test_context_source_extraction_from_retriever(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test extraction of context sources from RETRIEVER spans."""
+ evaluator = evaluator_with_mocked_llm
+
+ # Create mock RETRIEVER span with documents
+ span = _make_retriever_span(
+ "construction", ["Building materials info", "Safety codes"]
+ )
+
+ # Extract context sources (should extract each document individually)
+ sources = evaluator._extract_context_sources([span])
+
+ assert len(sources) == 2
+ assert all(s["source"] == "Context Grounding" for s in sources)
+ # Check that we have both documents
+ contents = [s["content"] for s in sources]
+ assert any("Building materials" in c for c in contents)
+ assert any("Safety codes" in c for c in contents)
+
+ @pytest.mark.asyncio
+ async def test_context_source_extraction_skips_invalid_spans(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test that spans without proper structure are skipped."""
+ evaluator = evaluator_with_mocked_llm
+
+ class InvalidSpan:
+ attributes = MappingProxyType(
+ {
+ "openinference.span.kind": "TOOL_CALL",
+ # Missing output.value
+ }
+ )
+
+ # Should skip invalid span
+ sources = evaluator._extract_context_sources([InvalidSpan()])
+ assert len(sources) == 0
+
+ @pytest.mark.asyncio
+ async def test_select_verifiable_sentences(self, evaluator_with_mocked_llm) -> None:
+ """Test Stage 1: Selection of verifiable sentences."""
+ evaluator = evaluator_with_mocked_llm
+
+ agent_output = (
+ "The capital of France is Paris. Do you agree? This is important."
+ )
+
+ mock_response = {"sentences": ["The capital of France is Paris."]}
+
+ with patch.object(
+ evaluator, "_get_structured_llm_response", new_callable=AsyncMock
+ ) as mock_llm:
+ mock_llm.return_value = mock_response
+
+ sentences = await evaluator._select_verifiable_sentences(agent_output)
+
+ assert len(sentences) == 1
+ assert "capital of France" in sentences[0]
+
+ @pytest.mark.asyncio
+ async def test_disambiguate_sentences(self, evaluator_with_mocked_llm) -> None:
+ """Test Stage 2: Disambiguation of sentences."""
+ evaluator = evaluator_with_mocked_llm
+
+ verifiable_sentences = ["It is located in Western Europe."]
+ full_output = "France is a country. It is located in Western Europe."
+
+ mock_response = {
+ "disambiguated": [
+ {
+ "original": "It is located in Western Europe.",
+ "disambiguated": "France is located in Western Europe.",
+ }
+ ]
+ }
+
+ with patch.object(
+ evaluator, "_get_structured_llm_response", new_callable=AsyncMock
+ ) as mock_llm:
+ mock_llm.return_value = mock_response
+
+ result = await evaluator._disambiguate_sentences(
+ verifiable_sentences, full_output
+ )
+
+ assert len(result) == 1
+ assert result[0]["disambiguated"] == "France is located in Western Europe."
+
+ @pytest.mark.asyncio
+ async def test_decompose_to_claims(self, evaluator_with_mocked_llm) -> None:
+ """Test Stage 3: Decomposition into standalone claims."""
+ evaluator = evaluator_with_mocked_llm
+
+ disambiguated = [
+ {
+ "original": "Paris and Lyon have populations over 1 million.",
+ "disambiguated": "Paris and Lyon have populations over 1 million.",
+ }
+ ]
+ full_output = (
+ "France has major cities. Paris and Lyon have populations over 1 million."
+ )
+
+ mock_response = {
+ "claims": [
+ {
+ "claim": "Paris has a population over 1 million",
+ "original_sentence": "1",
+ },
+ {
+ "claim": "Lyon has a population over 1 million",
+ "original_sentence": "1",
+ },
+ ]
+ }
+
+ with patch.object(
+ evaluator, "_get_structured_llm_response", new_callable=AsyncMock
+ ) as mock_llm:
+ mock_llm.return_value = mock_response
+
+ claims = await evaluator._decompose_to_claims(disambiguated, full_output)
+
+ assert len(claims) == 2
+ assert any("Paris" in c["text"] for c in claims)
+ assert any("Lyon" in c["text"] for c in claims)
+
+ @pytest.mark.asyncio
+ async def test_evaluate_claim_stance_supports(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test claim stance evaluation when source supports claim."""
+ evaluator = evaluator_with_mocked_llm
+
+ claim = "Paris is the capital of France"
+ context = "The capital of France is Paris, a major European city."
+
+ mock_response = {"stance": "SUPPORTS"}
+
+ with patch.object(
+ evaluator, "_get_structured_llm_response", new_callable=AsyncMock
+ ) as mock_llm:
+ mock_llm.return_value = mock_response
+
+ stance = await evaluator._evaluate_claim_stance(claim, context)
+
+ assert stance == "SUPPORTS"
+
+ @pytest.mark.asyncio
+ async def test_evaluate_claim_stance_contradicts(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test claim stance evaluation when source contradicts claim."""
+ evaluator = evaluator_with_mocked_llm
+
+ claim = "Paris is in Germany"
+ context = "Paris is a city in France, not Germany."
+
+ mock_response = {"stance": "CONTRADICTS"}
+
+ with patch.object(
+ evaluator, "_get_structured_llm_response", new_callable=AsyncMock
+ ) as mock_llm:
+ mock_llm.return_value = mock_response
+
+ stance = await evaluator._evaluate_claim_stance(claim, context)
+
+ assert stance == "CONTRADICTS"
+
+ @pytest.mark.asyncio
+ async def test_evaluate_claim_stance_irrelevant(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test claim stance evaluation when source is irrelevant."""
+ evaluator = evaluator_with_mocked_llm
+
+ claim = "Paris is the capital of France"
+ context = "The Eiffel Tower is made of iron."
+
+ mock_response = {"stance": "IRRELEVANT"}
+
+ with patch.object(
+ evaluator, "_get_structured_llm_response", new_callable=AsyncMock
+ ) as mock_llm:
+ mock_llm.return_value = mock_response
+
+ stance = await evaluator._evaluate_claim_stance(claim, context)
+
+ assert stance == "IRRELEVANT"
+
+ @pytest.mark.asyncio
+ async def test_evaluate_claims_against_context(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test evaluation of claims against multiple context sources."""
+ evaluator = evaluator_with_mocked_llm
+
+ claims = [
+ {"text": "Paris is in France", "original_sentence": "1"},
+ {"text": "Tokyo is in Japan", "original_sentence": "2"},
+ ]
+ context_sources = [
+ {"content": "Paris is the capital of France", "source": "Source 1"},
+ {"content": "Tokyo is the capital of Japan", "source": "Source 2"},
+ ]
+
+ # Mock stance evaluations
+ with patch.object(
+ evaluator, "_evaluate_claim_stance", new_callable=AsyncMock
+ ) as mock_stance:
+ # Return SUPPORTS for both claims
+ mock_stance.side_effect = ["SUPPORTS", "SUPPORTS", "SUPPORTS", "SUPPORTS"]
+
+ evaluations = await evaluator._evaluate_claims_against_context(
+ claims, context_sources
+ )
+
+ assert len(evaluations) == 2
+ assert all(e["is_grounded"] for e in evaluations)
+ assert len(evaluations[0]["supporting_sources"]) == 2
+ assert len(evaluations[1]["supporting_sources"]) == 2
+
+ @pytest.mark.asyncio
+ async def test_claim_grounding_with_contradicting_source(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test that claims with contradicting sources are not grounded."""
+ evaluator = evaluator_with_mocked_llm
+
+ claims = [
+ {"text": "The Earth is flat", "original_sentence": "1"},
+ ]
+ context_sources = [
+ {"content": "The Earth is spherical", "source": "Science Source"},
+ ]
+
+ with patch.object(
+ evaluator, "_evaluate_claim_stance", new_callable=AsyncMock
+ ) as mock_stance:
+ mock_stance.return_value = "CONTRADICTS"
+
+ evaluations = await evaluator._evaluate_claims_against_context(
+ claims, context_sources
+ )
+
+ assert len(evaluations) == 1
+ assert not evaluations[0]["is_grounded"]
+ assert len(evaluations[0]["contradicting_sources"]) == 1
+
+ @pytest.mark.asyncio
+ async def test_full_evaluation_with_no_agent_output(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test evaluation when no agent output is provided."""
+ evaluator = evaluator_with_mocked_llm
+
+ agent_execution = AgentExecution(
+ agent_input={},
+ agent_trace=[],
+ agent_output="",
+ )
+
+ result = await evaluator.evaluate(
+ agent_execution,
+ evaluation_criteria=LegacyEvaluationCriteria(
+ expected_output="",
+ expected_agent_behavior="",
+ ),
+ )
+
+ assert result.score == 0.0
+ assert "no agent output" in result.details.lower()
+
+ @pytest.mark.asyncio
+ async def test_full_evaluation_with_no_context_sources(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test evaluation when no context sources are available."""
+ evaluator = evaluator_with_mocked_llm
+
+ # Create a span without output (no context source)
+ class NoOutputSpan:
+ attributes = MappingProxyType(
+ {
+ "openinference.span.kind": "TOOL_CALL",
+ # No output.value
+ }
+ )
+
+ agent_execution = AgentExecution(
+ agent_input={},
+ agent_trace=[],
+ agent_output="The sky is blue.",
+ )
+
+ with patch.object(evaluator, "_extract_context_sources", return_value=[]):
+ result = await evaluator.evaluate(
+ agent_execution,
+ evaluation_criteria=LegacyEvaluationCriteria(
+ expected_output="The sky is blue.",
+ expected_agent_behavior="",
+ ),
+ )
+
+ assert result.score == 0.0
+ assert "no context sources" in result.details.lower()
+
+ @pytest.mark.asyncio
+ async def test_full_evaluation_with_no_verifiable_claims(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test evaluation when no verifiable claims are found."""
+ evaluator = evaluator_with_mocked_llm
+
+ agent_execution = AgentExecution(
+ agent_input={},
+ agent_trace=[],
+ agent_output="Just a greeting.",
+ )
+
+ with (
+ patch.object(
+ evaluator,
+ "_extract_context_sources",
+ return_value=[{"content": "Some context", "source": "Test"}],
+ ),
+ patch.object(
+ evaluator, "_extract_claims", new_callable=AsyncMock
+ ) as mock_claims,
+ ):
+ mock_claims.return_value = []
+
+ result = await evaluator.evaluate(
+ agent_execution,
+ evaluation_criteria=LegacyEvaluationCriteria(
+ expected_output="Just a greeting.",
+ expected_agent_behavior="",
+ ),
+ )
+
+ assert result.score == 100.0
+ assert "no verifiable claims" in result.details.lower()
+
+ @pytest.mark.asyncio
+ async def test_full_evaluation_with_grounded_claims(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test full evaluation flow with grounded claims."""
+ evaluator = evaluator_with_mocked_llm
+
+ agent_execution = AgentExecution(
+ agent_input={},
+ agent_trace=[],
+ agent_output="Paris is in France.",
+ )
+
+ # Mock the extraction and evaluation steps
+ with (
+ patch.object(
+ evaluator,
+ "_extract_context_sources",
+ return_value=[
+ {
+ "content": "Paris is the capital of France",
+ "source": "Context Grounding",
+ }
+ ],
+ ),
+ patch.object(
+ evaluator, "_extract_claims", new_callable=AsyncMock
+ ) as mock_claims,
+ patch.object(
+ evaluator, "_evaluate_claims_against_context", new_callable=AsyncMock
+ ) as mock_eval,
+ ):
+ mock_claims.return_value = [
+ {"text": "Paris is in France", "original_sentence": "1"}
+ ]
+ mock_eval.return_value = [
+ {
+ "claim": "Paris is in France",
+ "original_sentence": "1",
+ "is_grounded": True,
+ "supporting_sources": ["Context Grounding"],
+ "contradicting_sources": [],
+ }
+ ]
+
+ result = await evaluator.evaluate(
+ agent_execution,
+ evaluation_criteria=LegacyEvaluationCriteria(
+ expected_output="Paris is in France.",
+ expected_agent_behavior="",
+ ),
+ )
+
+ assert result.score == 100.0
+ assert "GROUNDED CLAIMS" in result.details
+ assert "Paris is in France" in result.details
+
+ @pytest.mark.asyncio
+ async def test_full_evaluation_with_mixed_claims(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test full evaluation with both grounded and ungrounded claims."""
+ evaluator = evaluator_with_mocked_llm
+
+ agent_execution = AgentExecution(
+ agent_input={},
+ agent_trace=[],
+ agent_output="Paris is in France. The sky is green.",
+ )
+
+ # Mock the extraction and evaluation steps
+ with (
+ patch.object(
+ evaluator,
+ "_extract_context_sources",
+ return_value=[
+ {
+ "content": "Paris is the capital of France",
+ "source": "Context Grounding",
+ }
+ ],
+ ),
+ patch.object(
+ evaluator, "_extract_claims", new_callable=AsyncMock
+ ) as mock_claims,
+ patch.object(
+ evaluator, "_evaluate_claims_against_context", new_callable=AsyncMock
+ ) as mock_eval,
+ ):
+ mock_claims.return_value = [
+ {"text": "Paris is in France", "original_sentence": "1"},
+ {"text": "The sky is green", "original_sentence": "2"},
+ ]
+ mock_eval.return_value = [
+ {
+ "claim": "Paris is in France",
+ "original_sentence": "1",
+ "is_grounded": True,
+ "supporting_sources": ["Context Grounding"],
+ "contradicting_sources": [],
+ },
+ {
+ "claim": "The sky is green",
+ "original_sentence": "2",
+ "is_grounded": False,
+ "supporting_sources": [],
+ "contradicting_sources": [],
+ },
+ ]
+
+ result = await evaluator.evaluate(
+ agent_execution,
+ evaluation_criteria=LegacyEvaluationCriteria(
+ expected_output="Paris is in France. The sky is green.",
+ expected_agent_behavior="",
+ ),
+ )
+
+ assert result.score == 50.0
+ assert "1/2" in result.details
+ assert "GROUNDED CLAIMS" in result.details
+ assert "UNGROUNDED CLAIMS" in result.details
+
+ @pytest.mark.asyncio
+ async def test_serialize_content_handles_various_types(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test serialization of various content types."""
+ evaluator = evaluator_with_mocked_llm
+
+ # Test string
+ result = evaluator._serialize_content("simple string")
+ assert result == "simple string"
+
+ # Test dict
+ result = evaluator._serialize_content({"key": "value"})
+ assert "key" in result and "value" in result
+
+ # Test list
+ result = evaluator._serialize_content(["item1", "item2"])
+ assert "item1" in result and "item2" in result
+
+ @pytest.mark.asyncio
+ async def test_evaluate_claim_stance_with_invalid_response(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test that invalid stance responses default to IRRELEVANT."""
+ evaluator = evaluator_with_mocked_llm
+
+ claim = "Test claim"
+ context = "Test context"
+
+ mock_response = {"stance": "INVALID_STANCE"}
+
+ with patch.object(
+ evaluator, "_get_structured_llm_response", new_callable=AsyncMock
+ ) as mock_llm:
+ mock_llm.return_value = mock_response
+
+ stance = await evaluator._evaluate_claim_stance(claim, context)
+
+ assert stance == "IRRELEVANT"
+
+ @pytest.mark.asyncio
+ async def test_format_justification_structure(
+ self, evaluator_with_mocked_llm
+ ) -> None:
+ """Test the structure of formatted justification."""
+ evaluator = evaluator_with_mocked_llm
+
+ claim_evaluations = [
+ {
+ "claim": "Grounded claim 1",
+ "original_sentence": "1",
+ "is_grounded": True,
+ "supporting_sources": ["Source A", "Source B"],
+ "contradicting_sources": [],
+ },
+ {
+ "claim": "Ungrounded claim",
+ "original_sentence": "2",
+ "is_grounded": False,
+ "supporting_sources": [],
+ "contradicting_sources": ["Source C"],
+ },
+ ]
+
+ justification = evaluator._format_justification(50.0, claim_evaluations)
+
+ assert "Overall Faithfulness: 50.0/100" in justification
+ assert "1/2" in justification
+ assert "GROUNDED CLAIMS" in justification
+ assert "UNGROUNDED CLAIMS" in justification
+ assert "Grounded claim 1" in justification
+ assert "Ungrounded claim" in justification
+ assert "Source A" in justification
+ assert "Source C" in justification
diff --git a/uv.lock b/uv.lock
index 3f0c5cc09..c4fb23541 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2486,7 +2486,7 @@ wheels = [
[[package]]
name = "uipath"
-version = "2.4.6"
+version = "2.4.7"
source = { editable = "." }
dependencies = [
{ name = "applicationinsights" },