diff --git a/pyproject.toml b/pyproject.toml index da1cf62eb..9d9e3a14a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.4.6" +version = "2.4.7" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/src/uipath/_cli/_evals/_evaluator_factory.py b/src/uipath/_cli/_evals/_evaluator_factory.py index 253ce36d4..851c1aa0f 100644 --- a/src/uipath/_cli/_evals/_evaluator_factory.py +++ b/src/uipath/_cli/_evals/_evaluator_factory.py @@ -22,7 +22,9 @@ from uipath.eval.evaluators import ( BaseEvaluator, LegacyBaseEvaluator, + LegacyContextPrecisionEvaluator, LegacyExactMatchEvaluator, + LegacyFaithfulnessEvaluator, LegacyJsonSimilarityEvaluator, LegacyLlmAsAJudgeEvaluator, LegacyTrajectoryEvaluator, @@ -68,6 +70,7 @@ ToolCallOutputEvaluator, ToolCallOutputEvaluatorConfig, ) +from uipath.eval.models import LegacyEvaluatorType logger = logging.getLogger(__name__) @@ -428,11 +431,8 @@ def _create_legacy_json_similarity_evaluator( def _create_legacy_llm_as_judge_evaluator( params: LLMEvaluatorParams, agent_model: str | None = None, - ) -> LegacyLlmAsAJudgeEvaluator: - """Create an LLM-as-a-judge evaluator.""" - if not params.prompt: - raise ValueError("LLM evaluator must include 'prompt' field") - + ) -> LegacyBaseEvaluator[Any]: + """Create an LLM-as-a-judge evaluator or context precision evaluator based on type.""" if not params.model: raise ValueError("LLM evaluator must include 'model' field") @@ -449,10 +449,16 @@ def _create_legacy_llm_as_judge_evaluator( ) params = params.model_copy(update={"model": agent_model}) - logger.info( - f"Creating LLM-as-judge evaluator '{params.name}' with model: {params.model}" - ) - return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={}) + # Check evaluator type to determine which evaluator to create + if params.evaluator_type == LegacyEvaluatorType.ContextPrecision: + return LegacyContextPrecisionEvaluator(**params.model_dump(), config={}) + elif params.evaluator_type == LegacyEvaluatorType.Faithfulness: + return LegacyFaithfulnessEvaluator(**params.model_dump(), config={}) + else: + if not params.prompt: + raise ValueError("LLM evaluator must include 'prompt' field") + + return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={}) @staticmethod def _create_legacy_trajectory_evaluator( diff --git a/src/uipath/eval/evaluators/__init__.py b/src/uipath/eval/evaluators/__init__.py index bc79e071d..248b5d571 100644 --- a/src/uipath/eval/evaluators/__init__.py +++ b/src/uipath/eval/evaluators/__init__.py @@ -10,7 +10,9 @@ # Legacy evaluators from .legacy_base_evaluator import LegacyBaseEvaluator +from .legacy_context_precision_evaluator import LegacyContextPrecisionEvaluator from .legacy_exact_match_evaluator import LegacyExactMatchEvaluator +from .legacy_faithfulness_evaluator import LegacyFaithfulnessEvaluator from .legacy_json_similarity_evaluator import LegacyJsonSimilarityEvaluator from .legacy_llm_as_judge_evaluator import LegacyLlmAsAJudgeEvaluator from .legacy_trajectory_evaluator import LegacyTrajectoryEvaluator @@ -46,7 +48,9 @@ __all__ = [ # Legacy evaluators "LegacyBaseEvaluator", + "LegacyContextPrecisionEvaluator", "LegacyExactMatchEvaluator", + "LegacyFaithfulnessEvaluator", "LegacyJsonSimilarityEvaluator", "LegacyLlmAsAJudgeEvaluator", "LegacyTrajectoryEvaluator", diff --git 
a/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py b/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py new file mode 100644 index 000000000..88667408c --- /dev/null +++ b/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py @@ -0,0 +1,349 @@ +"""Legacy Context Precision evaluator for assessing the relevance of context chunks to queries.""" + +import ast +import json +from typing import Any, Optional + +from uipath.eval.models import NumericEvaluationResult + +from ...platform.chat import UiPathLlmChatService +from ..models.models import AgentExecution, EvaluationResult +from .legacy_base_evaluator import ( + LegacyBaseEvaluator, + LegacyEvaluationCriteria, + LegacyEvaluatorConfig, + track_evaluation_metrics, +) +from .legacy_evaluator_utils import clean_model_name, serialize_object + + +class LegacyContextPrecisionEvaluatorConfig(LegacyEvaluatorConfig): + """Configuration for legacy context precision evaluators.""" + + name: str = "LegacyContextPrecisionEvaluator" + model: str = "" + prompt: str = """You are an expert evaluator assessing the relevance of context chunks to a given query. + +TASK: Evaluate how relevant each provided context chunk is to answering the query. +Your scoring should be deterministic - the same chunk-query pair should always receive the same score. + +EVALUATION CRITERIA: +Score each chunk using the HIGHEST applicable range (if multiple apply, use the highest): + +- HIGHLY RELEVANT (80-100) - Directly answers or addresses the query: + * 95-100: Contains the exact, complete answer to the query + * 85-94: Directly addresses the query with comprehensive information (but not the complete answer) + * 80-84: Provides a direct but partial answer to the query + +- MODERATELY RELEVANT (50-79) - Provides useful supporting information: + * 70-79: Contains substantial supporting information that helps understand the topic + * 60-69: Provides relevant context or background information + * 50-59: Has some connection to the query but limited usefulness + +- SLIGHTLY RELEVANT (20-49) - Contains tangentially related information: + * 35-49: Mentions related concepts, terms, or entities from the query + * 20-34: Very indirect connection to the query topic + +- NOT RELEVANT (0-19) - Has no meaningful connection to the query: + * 10-19: Contains some keywords from the query but no meaningful connection + * 0-9: Completely unrelated to the query or empty/malformed content + +IMPORTANT INSTRUCTIONS: +1. Evaluate EACH chunk independently - do not let one chunk influence another's score +2. Base relevance ONLY on how well the chunk helps answer the specific query +3. Consider semantic meaning, not just keyword matches +4. If a chunk is empty or malformed, assign a score of 0 +5. Scores must be integers between 0 and 100 inclusive +6. Be consistent: similar content should receive similar scores +7. Use the specific sub-ranges above to guide precise scoring +8. HIERARCHY RULE: If a chunk meets criteria for multiple ranges, always assign the HIGHEST applicable score + +OUTPUT FORMAT: +You MUST respond using the provided tool with a JSON object containing: +- A "relevancies" field that is an array +- Each array element must be an object with "relevancy_score" (integer 0-100) +- The array must have the same number of elements as context chunks provided +- Order matters: the first score corresponds to the first chunk, etc. 
+ +EXAMPLE STRUCTURE (do not copy values, this is just format): +{ + "relevancies": [ + {"relevancy_score": 85}, + {"relevancy_score": 45}, + {"relevancy_score": 0} + ] +} + + +{{Query}} + + + +{{Chunks}} + + +Evaluate each chunk's relevance to the query and respond with the structured output.""" + + +class LegacyContextPrecisionEvaluator( + LegacyBaseEvaluator[LegacyContextPrecisionEvaluatorConfig] +): + """Legacy evaluator that assesses context precision using an LLM. + + This evaluator extracts context grounding spans from agent execution traces + and uses an LLM to score the relevance of each chunk to its corresponding query. + The final score is the mean of all chunk relevancy scores (0-100 scale). + """ + + model: str + query_placeholder: str = "{{Query}}" + chunks_placeholder: str = "{{Chunks}}" + llm: Optional[UiPathLlmChatService] = None + + def model_post_init(self, __context: Any): + """Initialize the LLM service after model creation.""" + super().model_post_init(__context) + self._initialize_llm() + + def _initialize_llm(self): + """Initialize the LLM used for evaluation.""" + from uipath.platform import UiPath + + uipath = UiPath() + self.llm = uipath.llm + + @track_evaluation_metrics + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: LegacyEvaluationCriteria, + ) -> EvaluationResult: + """Evaluate context precision from agent execution traces. + + Args: + agent_execution: The execution details containing agent_trace with spans + evaluation_criteria: Legacy evaluation criteria (unused for context precision) + + Returns: + NumericEvaluationResult with normalized score (0-100) and detailed justification + """ + # Extract context grounding spans from the trace + context_groundings = self._extract_context_groundings( + agent_execution.agent_trace + ) + + if not context_groundings: + return NumericEvaluationResult( + score=0.0, + details="No context grounding tool calls found in the agent execution trace.", + ) + + # Evaluate each context grounding call + all_scores = [] + evaluation_details = [] + + for idx, grounding in enumerate(context_groundings, 1): + query = grounding.get("query", "") + chunks = grounding.get("chunks", []) + + if not query or not chunks: + evaluation_details.append( + f"{idx}. Query: (empty) - SKIPPED (no query or chunks)" + ) + continue + + scores = await self._evaluate_context_grounding(query, chunks) + + if scores: + mean_score = sum(scores) / len(scores) + all_scores.append(mean_score) + + # Format score summaries for this grounding + score_summaries = [f"Relevancy: {s:d}/100" for s in scores] + evaluation_details.append( + f'{idx}. Query: "{query}"\n' + f"\tAvg. Score: {mean_score:.1f}/100 ({len(scores)} chunks). " + f"Chunk Relevancies: [{', '.join(score_summaries)}]."
+ ) + + if not all_scores: + return NumericEvaluationResult( + score=0.0, + details="No valid context chunks were found for evaluation.", + ) + + # Calculate overall mean score (0-100 range) + overall_mean = sum(all_scores) / len(all_scores) + overall_mean = max(0, min(100, overall_mean)) + + # Build justification + justification = f"Overall Context Precision: {overall_mean:.1f}/100 ({len(context_groundings)} Context Tool Call(s) evaluated).\n" + if evaluation_details: + justification += "---\nPer-Context Tool Call Details:\n\n" + justification += "\n\n".join(evaluation_details) + + return NumericEvaluationResult( + score=overall_mean, + details=justification, + ) + + def _parse_span_value(self, value_str: str) -> Any: + """Parse span value that could be JSON or Python literal syntax. + + Args: + value_str: String that could be JSON or Python literal (dict/list) + + Returns: + Parsed Python object (dict, list, etc.) + + Raises: + ValueError: If string cannot be parsed as JSON or literal + """ + try: + # Try JSON first (most common) + return json.loads(value_str) + except json.JSONDecodeError: + try: + # Fall back to Python literal_eval for Python syntax + return ast.literal_eval(value_str) + except (ValueError, SyntaxError) as e: + raise ValueError(f"Cannot parse value: {value_str}") from e + + def _extract_context_groundings( + self, agent_trace: list[Any] + ) -> list[dict[str, Any]]: + """Extract context groundings from agent execution trace. + + Looks for spans with input.value and output.value attributes that represent + context grounding tool calls. + """ + context_groundings = [] + + for span in agent_trace: + if not hasattr(span, "attributes") or span.attributes is None: + continue + + attrs = span.attributes + + if attrs.get("openinference.span.kind", None) != "RETRIEVER": + # NOTE: all tool calls can be extracted using this approach + continue + + # Look for spans with input.value and output.value (context grounding calls) + query = attrs.get("input.value") + try: + chunks = self._normalize_chunks( + json.loads(attrs.get("output.value")).get("documents") + ) + + if chunks: + context_groundings.append( + { + "query": str(query), + "chunks": chunks, + } + ) + except (ValueError, KeyError, TypeError): + # Skip spans that don't have the expected structure + continue + + return context_groundings + + def _normalize_chunks(self, results: Any) -> list[str]: + """Normalize various chunk representations to a list of strings.""" + if isinstance(results, list): + return [self._serialize_chunk(chunk) for chunk in results] + elif isinstance(results, dict): + # Handle dict representations of chunks + return [self._serialize_chunk(results)] + elif isinstance(results, str): + return [results] + else: + return [str(results)] + + def _serialize_chunk(self, chunk: Any) -> str: + """Serialize a single chunk to string format.""" + return serialize_object(chunk, sort_keys=True) + + async def _evaluate_context_grounding( + self, query: str, chunks: list[str] + ) -> list[int]: + """Evaluate the relevance of chunks to a query using the LLM. 
+ + Args: + query: The query string + chunks: List of context chunks to evaluate + + Returns: + List of relevancy scores (0-100) for each chunk + """ + # Create evaluation prompt + chunks_text = "\n".join(chunks) + prompt = self.evaluator_config.prompt.replace( + self.query_placeholder, query + ).replace(self.chunks_placeholder, chunks_text) + + # Get LLM response + response_obj = await self._get_structured_llm_response(prompt) + + # Extract relevancy scores from response + relevancies = response_obj.get("relevancies", []) + if not relevancies: + raise ValueError("No relevancies found in LLM response") + + scores = [] + for rel in relevancies: + if isinstance(rel, dict) and "relevancy_score" in rel: + score = rel["relevancy_score"] + # Clamp score to 0-100 + score = max(0, min(100, int(score))) + scores.append(score) + + return scores + + async def _get_structured_llm_response( + self, evaluation_prompt: str + ) -> dict[str, Any]: + """Get structured LLM response using the context precision schema.""" + # Remove community-agents suffix from llm model name + model = clean_model_name(self.model) + + # Prepare the request + request_data = { + "model": model, + "messages": [ + {"role": "system", "content": "Context Precision Evaluation"}, + {"role": "user", "content": evaluation_prompt}, + ], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "context_precision_evaluation", + "schema": { + "type": "object", + "properties": { + "relevancies": { + "type": "array", + "items": { + "type": "object", + "properties": { + "relevancy_score": { + "type": "number", + "description": "Relevancy score for the chunk (0-100).", + } + }, + "required": ["relevancy_score"], + }, + "description": "List of relevancy scores for each context chunk", + } + }, + "required": ["relevancies"], + }, + }, + }, + } + + assert self.llm, "LLM should be initialized before calling this method." + response = await self.llm.chat_completions(**request_data) + content = response.choices[-1].message.content or "{}" + return json.loads(content) diff --git a/src/uipath/eval/evaluators/legacy_evaluator_utils.py b/src/uipath/eval/evaluators/legacy_evaluator_utils.py new file mode 100644 index 000000000..b8c20f372 --- /dev/null +++ b/src/uipath/eval/evaluators/legacy_evaluator_utils.py @@ -0,0 +1,75 @@ +"""Utility functions for legacy evaluators.""" + +import json +from typing import Any, Optional + +from ..._utils.constants import COMMUNITY_agents_SUFFIX + + +def clean_model_name(model: str) -> str: + """Remove community-agents suffix from model name. + + Args: + model: Model name that may have the community suffix + + Returns: + Model name without the community suffix + """ + if model.endswith(COMMUNITY_agents_SUFFIX): + return model.replace(COMMUNITY_agents_SUFFIX, "") + return model + + +def serialize_object( + content: Any, + sort_keys: bool = False, +) -> str: + """Serialize content to string format. + + Args: + content: Content to serialize (str, dict, list, etc.) + sort_keys: Whether to sort dict keys (default: False) + + Returns: + Serialized string representation + """ + if isinstance(content, str): + return content + elif isinstance(content, dict): + if sort_keys: + content = dict(sorted(content.items())) + return json.dumps(content, default=str, separators=(",", ":")) + else: + return json.dumps(content, default=str, separators=(",", ":")) + + +def safe_get_span_attributes(span: Any) -> Optional[dict[str, Any]]: + """Safely extract attributes from a span. 
+ + Args: + span: The span object + + Returns: + Span attributes dict, or None if not available + """ + if not hasattr(span, "attributes") or span.attributes is None: + return None + return span.attributes + + +def parse_json_value(value: str) -> Any: + """Safely parse a JSON string value. + + Args: + value: JSON string to parse + + Returns: + Parsed JSON object + + Raises: + ValueError: If string cannot be parsed as JSON + """ + try: + return json.loads(value) + except json.JSONDecodeError as e: + raise ValueError(f"Cannot parse JSON value: {value}") from e diff --git a/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py b/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py new file mode 100644 index 000000000..ac0373384 --- /dev/null +++ b/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py @@ -0,0 +1,513 @@ +"""Legacy Faithfulness evaluator for assessing whether agent output claims are grounded in context.""" + +import json +from typing import Any, Optional + +from uipath.eval.models import NumericEvaluationResult +from uipath.platform.chat import UiPathLlmChatService + +from ..models.models import AgentExecution, EvaluationResult +from .legacy_base_evaluator import ( + LegacyBaseEvaluator, + LegacyEvaluationCriteria, + LegacyEvaluatorConfig, + track_evaluation_metrics, +) +from .legacy_evaluator_utils import ( + clean_model_name, + serialize_object, +) + + +class LegacyFaithfulnessEvaluatorConfig(LegacyEvaluatorConfig): + """Configuration for legacy faithfulness evaluators.""" + + name: str = "LegacyFaithfulnessEvaluator" + model: str = "" + + +class LegacyFaithfulnessEvaluator( + LegacyBaseEvaluator[LegacyFaithfulnessEvaluatorConfig] +): + """Legacy evaluator that assesses faithfulness using an LLM. + + This evaluator extracts claims from agent output using a 3-stage pipeline + (selection, disambiguation, decomposition) and evaluates whether each claim + is grounded in the available context sources extracted from agent traces. + The final score is the percentage of claims that are grounded. + """ + + model: str + llm: Optional[UiPathLlmChatService] = None + + def model_post_init(self, __context: Any): + """Initialize the LLM service after model creation.""" + super().model_post_init(__context) + self._initialize_llm() + + def _initialize_llm(self): + """Initialize the LLM used for evaluation.""" + from uipath.platform import UiPath + + uipath = UiPath() + self.llm = uipath.llm + + @track_evaluation_metrics + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: LegacyEvaluationCriteria, + ) -> EvaluationResult: + """Evaluate faithfulness of agent output against available context. 
+ + Args: + agent_execution: The execution details containing agent_trace with spans + evaluation_criteria: Legacy evaluation criteria containing expected_output + + Returns: + NumericEvaluationResult with normalized score (0-100) and detailed justification + """ + # Extract agent output + agent_output = str(evaluation_criteria.expected_output or "") + if not agent_output or not agent_output.strip(): + return NumericEvaluationResult( + score=0.0, + details="No agent output provided for faithfulness evaluation.", + ) + + # Extract context sources from traces + context_sources = self._extract_context_sources(agent_execution.agent_trace) + + if not context_sources: + return NumericEvaluationResult( + score=0.0, + details="No context sources found in the agent execution trace.", + ) + + # Stage 1: Extract verifiable claims from agent output + claims = await self._extract_claims(agent_output) + + if not claims: + return NumericEvaluationResult( + score=100.0, + details="No verifiable claims found in agent output.", + ) + + # Stage 2: Evaluate each claim against context sources + claim_evaluations = await self._evaluate_claims_against_context( + claims, context_sources + ) + + # Calculate score + grounded_claims = [c for c in claim_evaluations if c["is_grounded"]] + score = ( + (len(grounded_claims) / len(claim_evaluations)) * 100 + if claim_evaluations + else 0.0 + ) + score = max(0, min(100, score)) + + # Build justification + justification = self._format_justification(score, claim_evaluations) + + return NumericEvaluationResult( + score=score, + details=justification, + ) + + def _extract_context_sources(self, agent_trace: list[Any]) -> list[dict[str, str]]: + """Extract context sources from agent execution trace. + + Looks for tool call outputs and context grounding spans that provide context. + + Returns: + List of context source dicts with 'content' and 'source' keys + """ + context_sources = [] + + for span in agent_trace: + if not hasattr(span, "attributes") or span.attributes is None: + continue + + attrs = span.attributes + + tool_name = attrs.get("openinference.span.kind") + if not tool_name or tool_name == "UNKNOWN": + continue + + output_value = attrs.get("output.value") + if not output_value: + continue + + try: + output_data = ( + json.loads(output_value) + if isinstance(output_value, str) + else output_value + ) + + # For RETRIEVER spans, extract individual documents + if tool_name == "RETRIEVER": + documents = output_data.get("documents", []) + if documents: + for doc in documents: + content = self._serialize_content(doc) + context_sources.append( + {"content": content, "source": "Context Grounding"} + ) + else: + # For other tool calls, extract the full output + content = self._serialize_content(output_data) + context_sources.append({"content": content, "source": tool_name}) + except (ValueError, TypeError): + continue + + return context_sources + + def _serialize_content(self, content: Any) -> str: + """Serialize content to string format.""" + return serialize_object(content, sort_keys=False) + + async def _extract_claims(self, agent_output: str) -> list[dict[str, str]]: + """Extract verifiable claims from agent output using 3-stage pipeline. + + Stages: + 1. Selection: Filter to verifiable sentences + 2. Disambiguation: Resolve internal ambiguities + 3. 
Decomposition: Extract standalone claims + + Returns: + List of claim dicts with 'text' and 'original_sentence' keys + """ + # Stage 1: Selection + verifiable_sentences = await self._select_verifiable_sentences(agent_output) + if not verifiable_sentences: + return [] + + # Stage 2: Disambiguation + disambiguated_sentences = await self._disambiguate_sentences( + verifiable_sentences, agent_output + ) + if not disambiguated_sentences: + return [] + + # Stage 3: Decomposition + claims = await self._decompose_to_claims(disambiguated_sentences, agent_output) + return claims + + async def _select_verifiable_sentences(self, agent_output: str) -> list[str]: + """Stage 1: Filter agent output to verifiable sentences.""" + prompt = f"""You are an expert evaluator identifying verifiable claims. + +TASK: Identify sentences in the agent output that contain verifiable, factual claims. +Filter out subjective opinions, instructions, questions, and meta-commentary. + +OUTPUT FORMAT: Return a JSON object with a "sentences" field containing an array of strings. +Each string should be a complete sentence from the original output. + + +{agent_output} + + +Identify and return only the verifiable sentences.""" + + response_obj = await self._get_structured_llm_response( + prompt, + schema_name="claim_selection", + schema={ + "type": "object", + "properties": { + "sentences": { + "type": "array", + "items": {"type": "string"}, + "description": "List of verifiable sentences from agent output", + } + }, + "required": ["sentences"], + }, + ) + + return response_obj.get("sentences", []) + + async def _disambiguate_sentences( + self, sentences: list[str], full_output: str + ) -> list[dict[str, str]]: + """Stage 2: Resolve ambiguities in sentences.""" + if not sentences: + return [] + + sentences_text = "\n".join(f"{i + 1}. {s}" for i, s in enumerate(sentences)) + + prompt = f"""You are an expert at disambiguating claims. + +TASK: For each sentence, resolve any internal ambiguities using the full agent output as context. +Replace pronouns, references, and implicit information with explicit, standalone versions. + + +{full_output} + + + +{sentences_text} + + +OUTPUT FORMAT: Return a JSON object with a "disambiguated" field containing an array of objects. +Each object must have: +- "original": the original sentence +- "disambiguated": the disambiguated version""" + + response_obj = await self._get_structured_llm_response( + prompt, + schema_name="claim_disambiguation", + schema={ + "type": "object", + "properties": { + "disambiguated": { + "type": "array", + "items": { + "type": "object", + "properties": { + "original": {"type": "string"}, + "disambiguated": {"type": "string"}, + }, + "required": ["original", "disambiguated"], + }, + "description": "List of disambiguated sentences", + } + }, + "required": ["disambiguated"], + }, + ) + + return response_obj.get("disambiguated", []) + + async def _decompose_to_claims( + self, disambiguated: list[dict[str, str]], full_output: str + ) -> list[dict[str, str]]: + """Stage 3: Decompose sentences into standalone verifiable claims.""" + if not disambiguated: + return [] + + sentences_text = "\n".join( + f"{i + 1}. {item.get('disambiguated', '')}" + for i, item in enumerate(disambiguated) + ) + + prompt = f"""You are an expert at claim decomposition. + +TASK: Break down each sentence into standalone, atomic claims that can be independently verified. +Each claim should be self-contained and not depend on other claims for context. 
+ + +{sentences_text} + + + +{full_output} + + +OUTPUT FORMAT: Return a JSON object with a "claims" field containing an array of objects. +Each object must have: +- "claim": the standalone claim +- "original_sentence": which sentence it came from (number)""" + + response_obj = await self._get_structured_llm_response( + prompt, + schema_name="claim_decomposition", + schema={ + "type": "object", + "properties": { + "claims": { + "type": "array", + "items": { + "type": "object", + "properties": { + "claim": {"type": "string"}, + "original_sentence": {"type": "string"}, + }, + "required": ["claim", "original_sentence"], + }, + "description": "List of decomposed claims", + } + }, + "required": ["claims"], + }, + ) + + claims_data = response_obj.get("claims", []) + return [ + { + "text": c.get("claim", ""), + "original_sentence": c.get("original_sentence", ""), + } + for c in claims_data + if c.get("claim", "").strip() + ] + + async def _evaluate_claims_against_context( + self, + claims: list[dict[str, str]], + context_sources: list[dict[str, str]], + ) -> list[dict[str, Any]]: + """Evaluate each claim against context sources. + + Returns: + List of claim evaluations with grounding status and source attribution + """ + claim_evaluations = [] + + for claim in claims: + claim_text = claim.get("text", "") + if not claim_text.strip(): + continue + + supporting_sources = [] + contradicting_sources = [] + + # Evaluate claim against each context source + for source in context_sources: + source_content = source.get("content", "") + source_name = source.get("source", "Unknown") + + stance = await self._evaluate_claim_stance(claim_text, source_content) + + if stance == "SUPPORTS": + supporting_sources.append(source_name) + elif stance == "CONTRADICTS": + contradicting_sources.append(source_name) + + # A claim is grounded if it has supporting sources and no contradicting ones + is_grounded = ( + len(supporting_sources) > 0 and len(contradicting_sources) == 0 + ) + + claim_evaluations.append( + { + "claim": claim_text, + "original_sentence": claim.get("original_sentence", ""), + "is_grounded": is_grounded, + "supporting_sources": supporting_sources, + "contradicting_sources": contradicting_sources, + } + ) + + return claim_evaluations + + async def _evaluate_claim_stance(self, claim: str, context: str) -> str: + """Evaluate whether a context source supports, contradicts, or is irrelevant to a claim. + + Returns: + One of: "SUPPORTS", "CONTRADICTS", "IRRELEVANT" + """ + prompt = f"""You are an expert evaluator assessing the relationship between claims and sources. + +TASK: Determine if the source supports, contradicts, or is irrelevant to the claim. + +DEFINITION: +- SUPPORTS: The source provides evidence that makes the claim more likely to be true +- CONTRADICTS: The source provides evidence that makes the claim false or less likely +- IRRELEVANT: The source does not address the claim at all + + +{claim} + + + +{context} + + +OUTPUT FORMAT: Return a JSON object with a "stance" field. 
+The stance must be exactly one of: "SUPPORTS", "CONTRADICTS", or "IRRELEVANT".""" + + response_obj = await self._get_structured_llm_response( + prompt, + schema_name="claim_stance_evaluation", + schema={ + "type": "object", + "properties": { + "stance": { + "type": "string", + "enum": ["SUPPORTS", "CONTRADICTS", "IRRELEVANT"], + "description": "Stance of the source relative to the claim", + } + }, + "required": ["stance"], + }, + ) + + stance = response_obj.get("stance", "IRRELEVANT").upper() + if stance not in ["SUPPORTS", "CONTRADICTS", "IRRELEVANT"]: + stance = "IRRELEVANT" + + return stance + + def _format_justification( + self, score: float, claim_evaluations: list[dict[str, Any]] + ) -> str: + """Format detailed justification with claim breakdown.""" + grounded_claims = [c for c in claim_evaluations if c["is_grounded"]] + ungrounded_claims = [c for c in claim_evaluations if not c["is_grounded"]] + + justification = ( + f"Overall Faithfulness: {score:.1f}/100 " + f"({len(grounded_claims)}/{len(claim_evaluations)} claims grounded).\n" + ) + + if claim_evaluations: + justification += "---\n" + + if grounded_claims: + justification += "\n✓ GROUNDED CLAIMS:\n\n" + for i, eval_item in enumerate(grounded_claims, 1): + justification += f'{i}. "{eval_item["claim"]}"\n' + if eval_item["supporting_sources"]: + sources_str = ", ".join(eval_item["supporting_sources"]) + justification += f" Supporting Sources: {sources_str}\n" + justification += "\n" + + if ungrounded_claims: + justification += "\n✗ UNGROUNDED CLAIMS:\n\n" + for i, eval_item in enumerate(ungrounded_claims, 1): + justification += f'{i}. "{eval_item["claim"]}"\n' + if eval_item["contradicting_sources"]: + sources_str = ", ".join(eval_item["contradicting_sources"]) + justification += f" Contradicting Sources: {sources_str}\n" + if ( + not eval_item["supporting_sources"] + and not eval_item["contradicting_sources"] + ): + justification += " No supporting sources found in context.\n" + justification += "\n" + + return justification.rstrip() + + async def _get_structured_llm_response( + self, + evaluation_prompt: str, + schema_name: str, + schema: dict[str, Any], + ) -> dict[str, Any]: + """Get structured LLM response using JSON schema.""" + # Remove community-agents suffix from llm model name + model = clean_model_name(self.model) + + # Prepare the request + request_data = { + "model": model, + "messages": [ + {"role": "system", "content": "Faithfulness Evaluation"}, + {"role": "user", "content": evaluation_prompt}, + ], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": schema_name, + "schema": schema, + }, + }, + } + + assert self.llm, "LLM should be initialized before calling this method." + response = await self.llm.chat_completions(**request_data) + content = response.choices[-1].message.content or "{}" + return json.loads(content) diff --git a/tests/cli/evaluators/test_legacy_context_precision_evaluator.py b/tests/cli/evaluators/test_legacy_context_precision_evaluator.py new file mode 100644 index 000000000..761facad3 --- /dev/null +++ b/tests/cli/evaluators/test_legacy_context_precision_evaluator.py @@ -0,0 +1,313 @@ +"""Tests for LegacyContextPrecisionEvaluator. + +Tests span extraction, chunk normalization, and LLM evaluation. 
+""" + +import json +from types import MappingProxyType +from unittest.mock import AsyncMock, patch + +import pytest + +from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams +from uipath.eval.evaluators import LegacyContextPrecisionEvaluator +from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria +from uipath.eval.models.models import ( + AgentExecution, + LegacyEvaluatorCategory, + LegacyEvaluatorType, +) + + +def _make_base_params() -> EvaluatorBaseParams: + """Create base parameters for context precision evaluator.""" + return EvaluatorBaseParams( + id="context-precision", + category=LegacyEvaluatorCategory.LlmAsAJudge, + evaluator_type=LegacyEvaluatorType.ContextPrecision, + name="Context Precision", + description="Evaluates context chunk relevance", + created_at="2025-01-01T00:00:00Z", + updated_at="2025-01-01T00:00:00Z", + target_output_key="*", + ) + + +@pytest.fixture +def evaluator_with_mocked_llm(): + """Fixture to create evaluator with mocked LLM service.""" + with patch("uipath.platform.UiPath"): + evaluator = LegacyContextPrecisionEvaluator( + **_make_base_params().model_dump(), + config={}, + model="gpt-4.1-2025-04-14", + ) + return evaluator + + +def _make_mock_span(input_query: str, output_chunks: list[str]): + """Create a mock span with context grounding data.""" + + class MockSpan: + def __init__(self): + self.attributes = MappingProxyType( + { + "openinference.span.kind": "RETRIEVER", + "input.mime_type": "text/plain", + "input.value": input_query, + "output.value": json.dumps( + { + "documents": [ + {"id": str(i), "text": chunk} + for i, chunk in enumerate(output_chunks) + ] + } + ), + "output.mime_type": "application/json", + } + ) + + return MockSpan() + + +class TestLegacyContextPrecisionEvaluator: + """Test suite for LegacyContextPrecisionEvaluator.""" + + @pytest.mark.asyncio + async def test_span_extraction_with_valid_data( + self, evaluator_with_mocked_llm + ) -> None: + """Test extraction of context groundings from spans.""" + evaluator = evaluator_with_mocked_llm + + # Create mock span with context grounding data + span = _make_mock_span( + input_query="construction industry", + output_chunks=["Building materials", "Safety codes", "Project management"], + ) + + # Extract context groundings + groundings = evaluator._extract_context_groundings([span]) + + assert len(groundings) == 1 + assert groundings[0]["query"] == "construction industry" + assert len(groundings[0]["chunks"]) == 3 + # Chunks are JSON-serialized because they come from the output + assert any("Building materials" in chunk for chunk in groundings[0]["chunks"]) + + @pytest.mark.asyncio + async def test_span_extraction_skips_invalid_spans( + self, evaluator_with_mocked_llm + ) -> None: + """Test that spans without proper structure are skipped.""" + evaluator = evaluator_with_mocked_llm + + # Create spans: one valid, one invalid + valid_span = _make_mock_span( + input_query="test query", + output_chunks=["chunk1"], + ) + + class InvalidSpan: + attributes = MappingProxyType( + { + "openinference.span.kind": "RETRIEVER", + # Missing input.value and output.value + } + ) + + groundings = evaluator._extract_context_groundings([valid_span, InvalidSpan()]) + + assert len(groundings) == 1 + assert groundings[0]["query"] == "test query" + + @pytest.mark.asyncio + async def test_chunk_normalization(self, evaluator_with_mocked_llm) -> None: + """Test normalization of various chunk formats.""" + evaluator = evaluator_with_mocked_llm + + # Test list of 
strings + chunks = evaluator._normalize_chunks(["chunk1", "chunk2"]) + assert len(chunks) == 2 + assert all(isinstance(c, str) for c in chunks) + + # Test list of dicts + chunks = evaluator._normalize_chunks( + [ + {"id": "1", "text": "content1"}, + {"id": "2", "text": "content2"}, + ] + ) + assert len(chunks) == 2 + assert all(isinstance(c, str) for c in chunks) + + # Test single string + chunks = evaluator._normalize_chunks("single chunk") + assert len(chunks) == 1 + assert chunks[0] == "single chunk" + + @pytest.mark.asyncio + async def test_evaluation_with_mocked_llm(self, evaluator_with_mocked_llm) -> None: + """Test evaluation logic with mocked LLM.""" + evaluator = evaluator_with_mocked_llm + + # Create mock spans + span = _make_mock_span( + input_query="python programming", + output_chunks=[ + "Python syntax guide", + "Python libraries overview", + "JavaScript fundamentals", + ], + ) + + # Extract context groundings from the span + groundings = evaluator._extract_context_groundings([span]) + assert len(groundings) == 1 + assert groundings[0]["query"] == "python programming" + assert len(groundings[0]["chunks"]) == 3 + + # Test the grounding evaluation with mocked LLM response + mock_llm_response = { + "relevancies": [ + {"relevancy_score": 95}, + {"relevancy_score": 75}, + {"relevancy_score": 45}, + ] + } + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_llm_response + + # Evaluate the context grounding + scores = await evaluator._evaluate_context_grounding( + groundings[0]["query"], groundings[0]["chunks"] + ) + + assert scores == [95, 75, 45] + assert abs(sum(scores) / len(scores) - 71.66666667) < 0.01 + + @pytest.mark.asyncio + async def test_evaluation_with_no_context_groundings( + self, evaluator_with_mocked_llm + ) -> None: + """Test evaluation when no context groundings are found.""" + evaluator = evaluator_with_mocked_llm + + # Create empty agent execution (no spans) + agent_execution = AgentExecution( + agent_input={}, + agent_trace=[], + agent_output="", + ) + + result = await evaluator.evaluate( + agent_execution, + evaluation_criteria=LegacyEvaluationCriteria( + expected_output="", + expected_agent_behavior="", + ), + ) + + assert result.score == 0.0 + assert "no context grounding" in result.details.lower() + + @pytest.mark.asyncio + async def test_evaluation_multiple_context_calls( + self, evaluator_with_mocked_llm + ) -> None: + """Test evaluation logic with multiple context grounding calls.""" + evaluator = evaluator_with_mocked_llm + + # Create two mock spans + span1 = _make_mock_span( + input_query="query 1", + output_chunks=["chunk1a", "chunk1b"], + ) + span2 = _make_mock_span( + input_query="query 2", + output_chunks=["chunk2a", "chunk2b", "chunk2c"], + ) + + # Extract context groundings from the spans + groundings = evaluator._extract_context_groundings([span1, span2]) + assert len(groundings) == 2 + assert groundings[0]["query"] == "query 1" + assert groundings[1]["query"] == "query 2" + + # Mock the LLM responses + mock_llm_response_1 = { + "relevancies": [{"relevancy_score": 90}, {"relevancy_score": 80}] + } + mock_llm_response_2 = { + "relevancies": [ + {"relevancy_score": 85}, + {"relevancy_score": 75}, + {"relevancy_score": 65}, + ] + } + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + # Return different responses for each call + mock_llm.side_effect = [mock_llm_response_1, mock_llm_response_2] + + # Evaluate 
both context groundings + scores1 = await evaluator._evaluate_context_grounding( + groundings[0]["query"], groundings[0]["chunks"] + ) + scores2 = await evaluator._evaluate_context_grounding( + groundings[1]["query"], groundings[1]["chunks"] + ) + + # Verify individual scores + assert scores1 == [90, 80] + assert scores2 == [85, 75, 65] + + # Verify means + mean1 = sum(scores1) / len(scores1) # 85 + mean2 = sum(scores2) / len(scores2) # 75 + overall_mean = (mean1 + mean2) / 2 # 80 + assert overall_mean == 80.0 + + @pytest.mark.asyncio + async def test_span_extraction_handles_json_parse_errors( + self, evaluator_with_mocked_llm + ) -> None: + """Test that spans with invalid JSON are skipped.""" + evaluator = evaluator_with_mocked_llm + + class BadJsonSpan: + attributes = MappingProxyType( + { + "openinference.span.kind": "RETRIEVER", + "input.value": "test query", + "output.value": "not valid json", + } + ) + + # Should not raise, should skip the span + groundings = evaluator._extract_context_groundings([BadJsonSpan()]) + assert len(groundings) == 0 + + @pytest.mark.asyncio + async def test_serialization_of_dict_chunks( + self, evaluator_with_mocked_llm + ) -> None: + """Test that dict chunks are properly serialized.""" + evaluator = evaluator_with_mocked_llm + + chunks = evaluator._normalize_chunks( + [ + {"title": "Document 1", "content": "Some content"}, + {"title": "Document 2", "content": "More content"}, + ] + ) + + assert len(chunks) == 2 + assert all(isinstance(c, str) for c in chunks) + # Should be JSON serialized + assert '"title"' in chunks[0] or '"content"' in chunks[0] diff --git a/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py b/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py new file mode 100644 index 000000000..c72ad4f6b --- /dev/null +++ b/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py @@ -0,0 +1,634 @@ +"""Tests for LegacyFaithfulnessEvaluator. + +Tests span extraction, claim extraction (3-stage pipeline), and claim evaluation. 
+""" + +import json +from types import MappingProxyType +from typing import Any +from unittest.mock import AsyncMock, patch + +import pytest + +from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams +from uipath.eval.evaluators import LegacyFaithfulnessEvaluator +from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria +from uipath.eval.models.models import ( + AgentExecution, + LegacyEvaluatorCategory, + LegacyEvaluatorType, +) + + +def _make_base_params() -> EvaluatorBaseParams: + """Create base parameters for faithfulness evaluator.""" + return EvaluatorBaseParams( + id="faithfulness", + category=LegacyEvaluatorCategory.LlmAsAJudge, + evaluator_type=LegacyEvaluatorType.Faithfulness, + name="Faithfulness", + description="Evaluates faithfulness of claims against context", + created_at="2025-01-01T00:00:00Z", + updated_at="2025-01-01T00:00:00Z", + target_output_key="*", + ) + + +@pytest.fixture +def evaluator_with_mocked_llm(): + """Fixture to create evaluator with mocked LLM service.""" + with patch("uipath.platform.UiPath"): + evaluator = LegacyFaithfulnessEvaluator( + **_make_base_params().model_dump(), + config={}, + model="gpt-4.1-2025-04-14", + ) + return evaluator + + +def _make_mock_span(tool_name: str, output_data: dict[str, Any]): + """Create a mock span with tool call data.""" + + class MockSpan: + def __init__(self): + self.attributes = MappingProxyType( + { + "openinference.span.kind": tool_name, + "output.value": json.dumps(output_data), + } + ) + + return MockSpan() + + +def _make_retriever_span(query: str, documents: list[str]): + """Create a mock RETRIEVER span with context grounding data.""" + return _make_mock_span( + "RETRIEVER", + {"documents": [{"id": str(i), "text": doc} for i, doc in enumerate(documents)]}, + ) + + +class TestLegacyFaithfulnessEvaluator: + """Test suite for LegacyFaithfulnessEvaluator.""" + + @pytest.mark.asyncio + async def test_context_source_extraction_from_tool_calls( + self, evaluator_with_mocked_llm + ) -> None: + """Test extraction of context sources from tool call spans.""" + evaluator = evaluator_with_mocked_llm + + # Create mock spans with tool outputs + span1 = _make_mock_span("TOOL_CALL", {"result": "Tool output 1"}) + span2 = _make_mock_span("TOOL_CALL", {"result": "Tool output 2"}) + + # Extract context sources + sources = evaluator._extract_context_sources([span1, span2]) + + assert len(sources) == 2 + assert all("content" in s and "source" in s for s in sources) + + @pytest.mark.asyncio + async def test_context_source_extraction_from_retriever( + self, evaluator_with_mocked_llm + ) -> None: + """Test extraction of context sources from RETRIEVER spans.""" + evaluator = evaluator_with_mocked_llm + + # Create mock RETRIEVER span with documents + span = _make_retriever_span( + "construction", ["Building materials info", "Safety codes"] + ) + + # Extract context sources (should extract each document individually) + sources = evaluator._extract_context_sources([span]) + + assert len(sources) == 2 + assert all(s["source"] == "Context Grounding" for s in sources) + # Check that we have both documents + contents = [s["content"] for s in sources] + assert any("Building materials" in c for c in contents) + assert any("Safety codes" in c for c in contents) + + @pytest.mark.asyncio + async def test_context_source_extraction_skips_invalid_spans( + self, evaluator_with_mocked_llm + ) -> None: + """Test that spans without proper structure are skipped.""" + evaluator = evaluator_with_mocked_llm + + 
class InvalidSpan: + attributes = MappingProxyType( + { + "openinference.span.kind": "TOOL_CALL", + # Missing output.value + } + ) + + # Should skip invalid span + sources = evaluator._extract_context_sources([InvalidSpan()]) + assert len(sources) == 0 + + @pytest.mark.asyncio + async def test_select_verifiable_sentences(self, evaluator_with_mocked_llm) -> None: + """Test Stage 1: Selection of verifiable sentences.""" + evaluator = evaluator_with_mocked_llm + + agent_output = ( + "The capital of France is Paris. Do you agree? This is important." + ) + + mock_response = {"sentences": ["The capital of France is Paris."]} + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + sentences = await evaluator._select_verifiable_sentences(agent_output) + + assert len(sentences) == 1 + assert "capital of France" in sentences[0] + + @pytest.mark.asyncio + async def test_disambiguate_sentences(self, evaluator_with_mocked_llm) -> None: + """Test Stage 2: Disambiguation of sentences.""" + evaluator = evaluator_with_mocked_llm + + verifiable_sentences = ["It is located in Western Europe."] + full_output = "France is a country. It is located in Western Europe." + + mock_response = { + "disambiguated": [ + { + "original": "It is located in Western Europe.", + "disambiguated": "France is located in Western Europe.", + } + ] + } + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + result = await evaluator._disambiguate_sentences( + verifiable_sentences, full_output + ) + + assert len(result) == 1 + assert result[0]["disambiguated"] == "France is located in Western Europe." + + @pytest.mark.asyncio + async def test_decompose_to_claims(self, evaluator_with_mocked_llm) -> None: + """Test Stage 3: Decomposition into standalone claims.""" + evaluator = evaluator_with_mocked_llm + + disambiguated = [ + { + "original": "Paris and Lyon have populations over 1 million.", + "disambiguated": "Paris and Lyon have populations over 1 million.", + } + ] + full_output = ( + "France has major cities. Paris and Lyon have populations over 1 million." + ) + + mock_response = { + "claims": [ + { + "claim": "Paris has a population over 1 million", + "original_sentence": "1", + }, + { + "claim": "Lyon has a population over 1 million", + "original_sentence": "1", + }, + ] + } + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + claims = await evaluator._decompose_to_claims(disambiguated, full_output) + + assert len(claims) == 2 + assert any("Paris" in c["text"] for c in claims) + assert any("Lyon" in c["text"] for c in claims) + + @pytest.mark.asyncio + async def test_evaluate_claim_stance_supports( + self, evaluator_with_mocked_llm + ) -> None: + """Test claim stance evaluation when source supports claim.""" + evaluator = evaluator_with_mocked_llm + + claim = "Paris is the capital of France" + context = "The capital of France is Paris, a major European city." 
+ + mock_response = {"stance": "SUPPORTS"} + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + stance = await evaluator._evaluate_claim_stance(claim, context) + + assert stance == "SUPPORTS" + + @pytest.mark.asyncio + async def test_evaluate_claim_stance_contradicts( + self, evaluator_with_mocked_llm + ) -> None: + """Test claim stance evaluation when source contradicts claim.""" + evaluator = evaluator_with_mocked_llm + + claim = "Paris is in Germany" + context = "Paris is a city in France, not Germany." + + mock_response = {"stance": "CONTRADICTS"} + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + stance = await evaluator._evaluate_claim_stance(claim, context) + + assert stance == "CONTRADICTS" + + @pytest.mark.asyncio + async def test_evaluate_claim_stance_irrelevant( + self, evaluator_with_mocked_llm + ) -> None: + """Test claim stance evaluation when source is irrelevant.""" + evaluator = evaluator_with_mocked_llm + + claim = "Paris is the capital of France" + context = "The Eiffel Tower is made of iron." + + mock_response = {"stance": "IRRELEVANT"} + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + stance = await evaluator._evaluate_claim_stance(claim, context) + + assert stance == "IRRELEVANT" + + @pytest.mark.asyncio + async def test_evaluate_claims_against_context( + self, evaluator_with_mocked_llm + ) -> None: + """Test evaluation of claims against multiple context sources.""" + evaluator = evaluator_with_mocked_llm + + claims = [ + {"text": "Paris is in France", "original_sentence": "1"}, + {"text": "Tokyo is in Japan", "original_sentence": "2"}, + ] + context_sources = [ + {"content": "Paris is the capital of France", "source": "Source 1"}, + {"content": "Tokyo is the capital of Japan", "source": "Source 2"}, + ] + + # Mock stance evaluations + with patch.object( + evaluator, "_evaluate_claim_stance", new_callable=AsyncMock + ) as mock_stance: + # Return SUPPORTS for both claims + mock_stance.side_effect = ["SUPPORTS", "SUPPORTS", "SUPPORTS", "SUPPORTS"] + + evaluations = await evaluator._evaluate_claims_against_context( + claims, context_sources + ) + + assert len(evaluations) == 2 + assert all(e["is_grounded"] for e in evaluations) + assert len(evaluations[0]["supporting_sources"]) == 2 + assert len(evaluations[1]["supporting_sources"]) == 2 + + @pytest.mark.asyncio + async def test_claim_grounding_with_contradicting_source( + self, evaluator_with_mocked_llm + ) -> None: + """Test that claims with contradicting sources are not grounded.""" + evaluator = evaluator_with_mocked_llm + + claims = [ + {"text": "The Earth is flat", "original_sentence": "1"}, + ] + context_sources = [ + {"content": "The Earth is spherical", "source": "Science Source"}, + ] + + with patch.object( + evaluator, "_evaluate_claim_stance", new_callable=AsyncMock + ) as mock_stance: + mock_stance.return_value = "CONTRADICTS" + + evaluations = await evaluator._evaluate_claims_against_context( + claims, context_sources + ) + + assert len(evaluations) == 1 + assert not evaluations[0]["is_grounded"] + assert len(evaluations[0]["contradicting_sources"]) == 1 + + @pytest.mark.asyncio + async def test_full_evaluation_with_no_agent_output( + self, evaluator_with_mocked_llm + ) -> None: + """Test evaluation when no 
agent output is provided.""" + evaluator = evaluator_with_mocked_llm + + agent_execution = AgentExecution( + agent_input={}, + agent_trace=[], + agent_output="", + ) + + result = await evaluator.evaluate( + agent_execution, + evaluation_criteria=LegacyEvaluationCriteria( + expected_output="", + expected_agent_behavior="", + ), + ) + + assert result.score == 0.0 + assert "no agent output" in result.details.lower() + + @pytest.mark.asyncio + async def test_full_evaluation_with_no_context_sources( + self, evaluator_with_mocked_llm + ) -> None: + """Test evaluation when no context sources are available.""" + evaluator = evaluator_with_mocked_llm + + # Create a span without output (no context source) + class NoOutputSpan: + attributes = MappingProxyType( + { + "openinference.span.kind": "TOOL_CALL", + # No output.value + } + ) + + agent_execution = AgentExecution( + agent_input={}, + agent_trace=[], + agent_output="The sky is blue.", + ) + + with patch.object(evaluator, "_extract_context_sources", return_value=[]): + result = await evaluator.evaluate( + agent_execution, + evaluation_criteria=LegacyEvaluationCriteria( + expected_output="The sky is blue.", + expected_agent_behavior="", + ), + ) + + assert result.score == 0.0 + assert "no context sources" in result.details.lower() + + @pytest.mark.asyncio + async def test_full_evaluation_with_no_verifiable_claims( + self, evaluator_with_mocked_llm + ) -> None: + """Test evaluation when no verifiable claims are found.""" + evaluator = evaluator_with_mocked_llm + + agent_execution = AgentExecution( + agent_input={}, + agent_trace=[], + agent_output="Just a greeting.", + ) + + with ( + patch.object( + evaluator, + "_extract_context_sources", + return_value=[{"content": "Some context", "source": "Test"}], + ), + patch.object( + evaluator, "_extract_claims", new_callable=AsyncMock + ) as mock_claims, + ): + mock_claims.return_value = [] + + result = await evaluator.evaluate( + agent_execution, + evaluation_criteria=LegacyEvaluationCriteria( + expected_output="Just a greeting.", + expected_agent_behavior="", + ), + ) + + assert result.score == 100.0 + assert "no verifiable claims" in result.details.lower() + + @pytest.mark.asyncio + async def test_full_evaluation_with_grounded_claims( + self, evaluator_with_mocked_llm + ) -> None: + """Test full evaluation flow with grounded claims.""" + evaluator = evaluator_with_mocked_llm + + agent_execution = AgentExecution( + agent_input={}, + agent_trace=[], + agent_output="Paris is in France.", + ) + + # Mock the extraction and evaluation steps + with ( + patch.object( + evaluator, + "_extract_context_sources", + return_value=[ + { + "content": "Paris is the capital of France", + "source": "Context Grounding", + } + ], + ), + patch.object( + evaluator, "_extract_claims", new_callable=AsyncMock + ) as mock_claims, + patch.object( + evaluator, "_evaluate_claims_against_context", new_callable=AsyncMock + ) as mock_eval, + ): + mock_claims.return_value = [ + {"text": "Paris is in France", "original_sentence": "1"} + ] + mock_eval.return_value = [ + { + "claim": "Paris is in France", + "original_sentence": "1", + "is_grounded": True, + "supporting_sources": ["Context Grounding"], + "contradicting_sources": [], + } + ] + + result = await evaluator.evaluate( + agent_execution, + evaluation_criteria=LegacyEvaluationCriteria( + expected_output="Paris is in France.", + expected_agent_behavior="", + ), + ) + + assert result.score == 100.0 + assert "GROUNDED CLAIMS" in result.details + assert "Paris is in France" in 
result.details + + @pytest.mark.asyncio + async def test_full_evaluation_with_mixed_claims( + self, evaluator_with_mocked_llm + ) -> None: + """Test full evaluation with both grounded and ungrounded claims.""" + evaluator = evaluator_with_mocked_llm + + agent_execution = AgentExecution( + agent_input={}, + agent_trace=[], + agent_output="Paris is in France. The sky is green.", + ) + + # Mock the extraction and evaluation steps + with ( + patch.object( + evaluator, + "_extract_context_sources", + return_value=[ + { + "content": "Paris is the capital of France", + "source": "Context Grounding", + } + ], + ), + patch.object( + evaluator, "_extract_claims", new_callable=AsyncMock + ) as mock_claims, + patch.object( + evaluator, "_evaluate_claims_against_context", new_callable=AsyncMock + ) as mock_eval, + ): + mock_claims.return_value = [ + {"text": "Paris is in France", "original_sentence": "1"}, + {"text": "The sky is green", "original_sentence": "2"}, + ] + mock_eval.return_value = [ + { + "claim": "Paris is in France", + "original_sentence": "1", + "is_grounded": True, + "supporting_sources": ["Context Grounding"], + "contradicting_sources": [], + }, + { + "claim": "The sky is green", + "original_sentence": "2", + "is_grounded": False, + "supporting_sources": [], + "contradicting_sources": [], + }, + ] + + result = await evaluator.evaluate( + agent_execution, + evaluation_criteria=LegacyEvaluationCriteria( + expected_output="Paris is in France. The sky is green.", + expected_agent_behavior="", + ), + ) + + assert result.score == 50.0 + assert "1/2" in result.details + assert "GROUNDED CLAIMS" in result.details + assert "UNGROUNDED CLAIMS" in result.details + + @pytest.mark.asyncio + async def test_serialize_content_handles_various_types( + self, evaluator_with_mocked_llm + ) -> None: + """Test serialization of various content types.""" + evaluator = evaluator_with_mocked_llm + + # Test string + result = evaluator._serialize_content("simple string") + assert result == "simple string" + + # Test dict + result = evaluator._serialize_content({"key": "value"}) + assert "key" in result and "value" in result + + # Test list + result = evaluator._serialize_content(["item1", "item2"]) + assert "item1" in result and "item2" in result + + @pytest.mark.asyncio + async def test_evaluate_claim_stance_with_invalid_response( + self, evaluator_with_mocked_llm + ) -> None: + """Test that invalid stance responses default to IRRELEVANT.""" + evaluator = evaluator_with_mocked_llm + + claim = "Test claim" + context = "Test context" + + mock_response = {"stance": "INVALID_STANCE"} + + with patch.object( + evaluator, "_get_structured_llm_response", new_callable=AsyncMock + ) as mock_llm: + mock_llm.return_value = mock_response + + stance = await evaluator._evaluate_claim_stance(claim, context) + + assert stance == "IRRELEVANT" + + @pytest.mark.asyncio + async def test_format_justification_structure( + self, evaluator_with_mocked_llm + ) -> None: + """Test the structure of formatted justification.""" + evaluator = evaluator_with_mocked_llm + + claim_evaluations = [ + { + "claim": "Grounded claim 1", + "original_sentence": "1", + "is_grounded": True, + "supporting_sources": ["Source A", "Source B"], + "contradicting_sources": [], + }, + { + "claim": "Ungrounded claim", + "original_sentence": "2", + "is_grounded": False, + "supporting_sources": [], + "contradicting_sources": ["Source C"], + }, + ] + + justification = evaluator._format_justification(50.0, claim_evaluations) + + assert "Overall Faithfulness: 50.0/100" 
in justification + assert "1/2" in justification + assert "GROUNDED CLAIMS" in justification + assert "UNGROUNDED CLAIMS" in justification + assert "Grounded claim 1" in justification + assert "Ungrounded claim" in justification + assert "Source A" in justification + assert "Source C" in justification diff --git a/uv.lock b/uv.lock index 3f0c5cc09..c4fb23541 100644 --- a/uv.lock +++ b/uv.lock @@ -2486,7 +2486,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.4.6" +version = "2.4.7" source = { editable = "." } dependencies = [ { name = "applicationinsights" },
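The tests above construct the new evaluators directly; the short sketch below shows that same pattern end to end. It is an illustrative example, not part of the patch: the id, name, timestamps, and model value mirror the test fixtures, and it assumes UiPath platform credentials are configured in the environment, since the evaluator instantiates the UiPath LLM client when it is created.

import asyncio

from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams
from uipath.eval.evaluators import LegacyContextPrecisionEvaluator
from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria
from uipath.eval.models.models import (
    AgentExecution,
    LegacyEvaluatorCategory,
    LegacyEvaluatorType,
)

# Illustrative parameters; in practice the evaluator factory builds these from
# the evaluation configuration.
params = EvaluatorBaseParams(
    id="context-precision",
    category=LegacyEvaluatorCategory.LlmAsAJudge,
    evaluator_type=LegacyEvaluatorType.ContextPrecision,
    name="Context Precision",
    description="Evaluates context chunk relevance",
    created_at="2025-01-01T00:00:00Z",
    updated_at="2025-01-01T00:00:00Z",
    target_output_key="*",
)

# Requires UiPath credentials: the evaluator creates the platform LLM client on init.
evaluator = LegacyContextPrecisionEvaluator(
    **params.model_dump(), config={}, model="gpt-4.1-2025-04-14"
)

async def main() -> None:
    # An empty trace yields score 0.0 with a "no context grounding" explanation;
    # real runs pass the RETRIEVER spans captured during agent execution.
    execution = AgentExecution(agent_input={}, agent_trace=[], agent_output="")
    result = await evaluator.evaluate(
        execution,
        evaluation_criteria=LegacyEvaluationCriteria(
            expected_output="", expected_agent_behavior=""
        ),
    )
    print(result.score, result.details)

asyncio.run(main())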