From a544e16653bbda8ab7fc626d2e5ef983f32f5dbd Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 15:08:22 -0800 Subject: [PATCH 01/12] feat: rearrange the eval trace, expose new telemetry client and send eval set events --- pyproject.toml | 1 + src/uipath/_cli/_evals/_runtime.py | 470 +++++++++++++++------------ src/uipath/_cli/_evals/_telemetry.py | 280 ++++++++++++++++ src/uipath/_cli/cli_eval.py | 4 + src/uipath/telemetry/__init__.py | 9 +- src/uipath/telemetry/_track.py | 197 ++++++++++- uv.lock | 11 + 7 files changed, 759 insertions(+), 213 deletions(-) create mode 100644 src/uipath/_cli/_evals/_telemetry.py diff --git a/pyproject.toml b/pyproject.toml index ea5a6854a..85f0f2f66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ "python-socketio>=5.15.0, <6.0.0", "coverage>=7.8.2", "mermaid-builder==0.0.3", + "applicationinsights>=0.11.10", ] classifiers = [ "Intended Audience :: Developers", diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index 8db78ab43..aaab896d9 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -20,6 +20,7 @@ from opentelemetry import context as context_api from opentelemetry.sdk.trace import ReadableSpan, Span from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult +from opentelemetry.trace import Status, StatusCode from pydantic import BaseModel from uipath.core.tracing import UiPathTraceManager from uipath.core.tracing.processors import UiPathExecutionBatchTraceProcessor @@ -299,60 +300,91 @@ async def execute(self) -> UiPathRuntimeResult: ) try: with self._mocker_cache(): - ( - evaluation_set, - evaluators, - evaluation_iterable, - ) = await self.initiate_evaluation(runtime) - workers = self.context.workers or 1 - assert workers >= 1 - eval_run_result_list = await execute_parallel( - evaluation_iterable, workers - ) - results = UiPathEvalOutput( - evaluation_set_name=evaluation_set.name, - evaluation_set_results=eval_run_result_list, - ) - - # Computing evaluator averages - evaluator_averages: dict[str, float] = defaultdict(float) - evaluator_count: dict[str, int] = defaultdict(int) - - # Check if any eval runs failed - any_failed = False - for eval_run_result in results.evaluation_set_results: - # Check if the agent execution had an error - if ( - eval_run_result.agent_execution_output - and eval_run_result.agent_execution_output.result.error - ): - any_failed = True - - for result_dto in eval_run_result.evaluation_run_results: - evaluator_averages[result_dto.evaluator_id] += ( - result_dto.result.score + # Create the parent "Evaluation set run" span + # Use tracer from trace_manager's provider to ensure spans go through + # the ExecutionSpanProcessor + # NOTE: Do NOT set execution.id on this parent span, as the mixin in + # UiPathExecutionBatchTraceProcessor propagates execution.id from parent + # to child spans, which would overwrite the per-eval execution.id + tracer = self.trace_manager.tracer_provider.get_tracer(__name__) + span_attributes: dict[str, str] = { + "span_type": "eval_set_run", + } + if self.context.eval_set_run_id: + span_attributes["eval_set_run_id"] = self.context.eval_set_run_id + with tracer.start_as_current_span( + "Evaluation Set Run", attributes=span_attributes + ) as span: + try: + ( + evaluation_set, + evaluators, + evaluation_iterable, + ) = await self.initiate_evaluation(runtime) + workers = self.context.workers or 1 + assert workers >= 1 + eval_run_result_list = await execute_parallel( + 
evaluation_iterable, workers + ) + results = UiPathEvalOutput( + evaluation_set_name=evaluation_set.name, + evaluation_set_results=eval_run_result_list, ) - evaluator_count[result_dto.evaluator_id] += 1 - for eval_id in evaluator_averages: - evaluator_averages[eval_id] = ( - evaluator_averages[eval_id] / evaluator_count[eval_id] - ) - await self.event_bus.publish( - EvaluationEvents.UPDATE_EVAL_SET_RUN, - EvalSetRunUpdatedEvent( - execution_id=self.execution_id, - evaluator_scores=evaluator_averages, - success=not any_failed, - ), - wait_for_completion=False, - ) + # Computing evaluator averages + evaluator_averages: dict[str, float] = defaultdict(float) + evaluator_count: dict[str, int] = defaultdict(int) + + # Check if any eval runs failed + any_failed = False + for eval_run_result in results.evaluation_set_results: + # Check if the agent execution had an error + if ( + eval_run_result.agent_execution_output + and eval_run_result.agent_execution_output.result.error + ): + any_failed = True + + for result_dto in eval_run_result.evaluation_run_results: + evaluator_averages[result_dto.evaluator_id] += ( + result_dto.result.score + ) + evaluator_count[result_dto.evaluator_id] += 1 + + for eval_id in evaluator_averages: + evaluator_averages[eval_id] = ( + evaluator_averages[eval_id] / evaluator_count[eval_id] + ) + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_SET_RUN, + EvalSetRunUpdatedEvent( + execution_id=self.execution_id, + evaluator_scores=evaluator_averages, + success=not any_failed, + ), + wait_for_completion=False, + ) - result = UiPathRuntimeResult( - output={**results.model_dump(by_alias=True)}, - status=UiPathRuntimeStatus.SUCCESSFUL, - ) - return result + result = UiPathRuntimeResult( + output={**results.model_dump(by_alias=True)}, + status=UiPathRuntimeStatus.SUCCESSFUL, + ) + return result + except Exception as e: + # Set span status to ERROR on exception + span.set_status(Status(StatusCode.ERROR, str(e))) + + # Publish failure event for eval set run + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_SET_RUN, + EvalSetRunUpdatedEvent( + execution_id=self.execution_id, + evaluator_scores={}, + success=False, + ), + wait_for_completion=False, + ) + raise finally: await runtime.dispose() @@ -378,165 +410,180 @@ async def _execute_eval( ), ) - evaluation_run_results = EvaluationRunResult( - evaluation_name=eval_item.name, evaluation_run_results=[] - ) + # Create the "Evaluation" span for this eval item + # Use tracer from trace_manager's provider to ensure spans go through + # the ExecutionSpanProcessor + tracer = self.trace_manager.tracer_provider.get_tracer(__name__) + with tracer.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": execution_id, + "span_type": "evaluation", + "eval_item_id": eval_item.id, + "eval_item_name": eval_item.name, + }, + ): + evaluation_run_results = EvaluationRunResult( + evaluation_name=eval_item.name, evaluation_run_results=[] + ) - try: try: - agent_execution_output = await self.execute_runtime( - eval_item, execution_id, runtime - ) - except Exception as e: - if self.context.verbose: - if isinstance(e, EvaluationRuntimeException): - spans = e.spans - logs = e.logs - execution_time = e.execution_time - loggable_error = e.root_exception - else: - spans = [] - logs = [] - execution_time = 0 - loggable_error = e - - error_info = UiPathErrorContract( - code="RUNTIME_SHUTDOWN_ERROR", - title="Runtime shutdown failed", - detail=f"Error: {str(loggable_error)}", - category=UiPathErrorCategory.UNKNOWN, - ) - 
error_result = UiPathRuntimeResult( - status=UiPathRuntimeStatus.FAULTED, - error=error_info, + try: + agent_execution_output = await self.execute_runtime( + eval_item, execution_id, runtime ) + except Exception as e: + if self.context.verbose: + if isinstance(e, EvaluationRuntimeException): + spans = e.spans + logs = e.logs + execution_time = e.execution_time + loggable_error = e.root_exception + else: + spans = [] + logs = [] + execution_time = 0 + loggable_error = e + + error_info = UiPathErrorContract( + code="RUNTIME_SHUTDOWN_ERROR", + title="Runtime shutdown failed", + detail=f"Error: {str(loggable_error)}", + category=UiPathErrorCategory.UNKNOWN, + ) + error_result = UiPathRuntimeResult( + status=UiPathRuntimeStatus.FAULTED, + error=error_info, + ) + evaluation_run_results.agent_execution_output = ( + convert_eval_execution_output_to_serializable( + UiPathEvalRunExecutionOutput( + execution_time=execution_time, + result=error_result, + spans=spans, + logs=logs, + ) + ) + ) + raise + + if self.context.verbose: evaluation_run_results.agent_execution_output = ( convert_eval_execution_output_to_serializable( - UiPathEvalRunExecutionOutput( - execution_time=execution_time, - result=error_result, - spans=spans, - logs=logs, - ) + agent_execution_output ) ) - raise - - if self.context.verbose: - evaluation_run_results.agent_execution_output = ( - convert_eval_execution_output_to_serializable( - agent_execution_output + evaluation_item_results: list[EvalItemResult] = [] + + for evaluator in evaluators: + if evaluator.id not in eval_item.evaluation_criterias: + # Skip! + continue + evaluation_criteria = eval_item.evaluation_criterias[evaluator.id] + + evaluation_result = await self.run_evaluator( + evaluator=evaluator, + execution_output=agent_execution_output, + eval_item=eval_item, + evaluation_criteria=evaluator.evaluation_criteria_type( + **evaluation_criteria + ) + if evaluation_criteria + else evaluator.evaluator_config.default_evaluation_criteria, ) - ) - evaluation_item_results: list[EvalItemResult] = [] - for evaluator in evaluators: - if evaluator.id not in eval_item.evaluation_criterias: - # Skip! 
- continue - evaluation_criteria = eval_item.evaluation_criterias[evaluator.id] - - evaluation_result = await self.run_evaluator( - evaluator=evaluator, - execution_output=agent_execution_output, - eval_item=eval_item, - evaluation_criteria=evaluator.evaluation_criteria_type( - **evaluation_criteria + dto_result = EvaluationResultDto.from_evaluation_result( + evaluation_result ) - if evaluation_criteria - else evaluator.evaluator_config.default_evaluation_criteria, - ) - dto_result = EvaluationResultDto.from_evaluation_result( - evaluation_result - ) - - evaluation_run_results.evaluation_run_results.append( - EvaluationRunResultDto( - evaluator_name=evaluator.name, - result=dto_result, - evaluator_id=evaluator.id, + evaluation_run_results.evaluation_run_results.append( + EvaluationRunResultDto( + evaluator_name=evaluator.name, + result=dto_result, + evaluator_id=evaluator.id, + ) ) - ) - evaluation_item_results.append( - EvalItemResult( - evaluator_id=evaluator.id, - result=evaluation_result, + evaluation_item_results.append( + EvalItemResult( + evaluator_id=evaluator.id, + result=evaluation_result, + ) ) + + exception_details = None + agent_output = agent_execution_output.result.output + if agent_execution_output.result.status == UiPathRuntimeStatus.FAULTED: + error = agent_execution_output.result.error + if error is not None: + # we set the exception details for the run event + # Convert error contract to exception + error_exception = Exception( + f"{error.title}: {error.detail} (code: {error.code})" + ) + exception_details = EvalItemExceptionDetails( + exception=error_exception + ) + agent_output = error.model_dump() + + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_RUN, + EvalRunUpdatedEvent( + execution_id=execution_id, + eval_item=eval_item, + eval_results=evaluation_item_results, + success=not agent_execution_output.result.error, + agent_output=agent_output, + agent_execution_time=agent_execution_output.execution_time, + spans=agent_execution_output.spans, + logs=agent_execution_output.logs, + exception_details=exception_details, + ), + wait_for_completion=False, ) - exception_details = None - agent_output = agent_execution_output.result.output - if agent_execution_output.result.status == UiPathRuntimeStatus.FAULTED: - error = agent_execution_output.result.error - if error is not None: - # we set the exception details for the run event - # Convert error contract to exception - error_exception = Exception( - f"{error.title}: {error.detail} (code: {error.code})" - ) - exception_details = EvalItemExceptionDetails( - exception=error_exception + except Exception as e: + exception_details = EvalItemExceptionDetails(exception=e) + + for evaluator in evaluators: + evaluation_run_results.evaluation_run_results.append( + EvaluationRunResultDto( + evaluator_name=evaluator.name, + evaluator_id=evaluator.id, + result=EvaluationResultDto(score=0), + ) ) - agent_output = error.model_dump() - await self.event_bus.publish( - EvaluationEvents.UPDATE_EVAL_RUN, - EvalRunUpdatedEvent( + eval_run_updated_event = EvalRunUpdatedEvent( execution_id=execution_id, eval_item=eval_item, - eval_results=evaluation_item_results, - success=not agent_execution_output.result.error, - agent_output=agent_output, - agent_execution_time=agent_execution_output.execution_time, - spans=agent_execution_output.spans, - logs=agent_execution_output.logs, + eval_results=[], + success=False, + agent_output={}, + agent_execution_time=0.0, exception_details=exception_details, - ), - wait_for_completion=False, - ) - - 
except Exception as e: - exception_details = EvalItemExceptionDetails(exception=e) - - for evaluator in evaluators: - evaluation_run_results.evaluation_run_results.append( - EvaluationRunResultDto( - evaluator_name=evaluator.name, - evaluator_id=evaluator.id, - result=EvaluationResultDto(score=0), - ) + spans=[], + logs=[], ) + if isinstance(e, EvaluationRuntimeException): + eval_run_updated_event.spans = e.spans + eval_run_updated_event.logs = e.logs + if eval_run_updated_event.exception_details: + eval_run_updated_event.exception_details.exception = ( + e.root_exception + ) + eval_run_updated_event.exception_details.runtime_exception = ( + True + ) - eval_run_updated_event = EvalRunUpdatedEvent( - execution_id=execution_id, - eval_item=eval_item, - eval_results=[], - success=False, - agent_output={}, - agent_execution_time=0.0, - exception_details=exception_details, - spans=[], - logs=[], - ) - if isinstance(e, EvaluationRuntimeException): - eval_run_updated_event.spans = e.spans - eval_run_updated_event.logs = e.logs - if eval_run_updated_event.exception_details: - eval_run_updated_event.exception_details.exception = ( - e.root_exception - ) - eval_run_updated_event.exception_details.runtime_exception = True - - await self.event_bus.publish( - EvaluationEvents.UPDATE_EVAL_RUN, - eval_run_updated_event, - wait_for_completion=False, - ) - finally: - clear_execution_context() + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_RUN, + eval_run_updated_event, + wait_for_completion=False, + ) + finally: + clear_execution_context() - return evaluation_run_results + return evaluation_run_results async def _generate_input_for_eval( self, eval_item: EvaluationItem, runtime: UiPathRuntimeProtocol @@ -678,26 +725,39 @@ async def run_evaluator( *, evaluation_criteria: Any, ) -> EvaluationResult: - output_data: dict[str, Any] | str = {} - if execution_output.result.output: - if isinstance(execution_output.result.output, BaseModel): - output_data = execution_output.result.output.model_dump() - else: - output_data = execution_output.result.output - - agent_execution = AgentExecution( - agent_input=eval_item.inputs, - agent_output=output_data, - agent_trace=execution_output.spans, - expected_agent_behavior=eval_item.expected_agent_behavior, - ) + # Create span for evaluator execution + # Use tracer from trace_manager's provider to ensure spans go through + # the ExecutionSpanProcessor + tracer = self.trace_manager.tracer_provider.get_tracer(__name__) + with tracer.start_as_current_span( + f"Evaluator: {evaluator.name}", + attributes={ + "span_type": "evaluator", + "evaluator_id": evaluator.id, + "evaluator_name": evaluator.name, + "eval_item_id": eval_item.id, + }, + ): + output_data: dict[str, Any] | str = {} + if execution_output.result.output: + if isinstance(execution_output.result.output, BaseModel): + output_data = execution_output.result.output.model_dump() + else: + output_data = execution_output.result.output + + agent_execution = AgentExecution( + agent_input=eval_item.inputs, + agent_output=output_data, + agent_trace=execution_output.spans, + expected_agent_behavior=eval_item.expected_agent_behavior, + ) - result = await evaluator.validate_and_evaluate_criteria( - agent_execution=agent_execution, - evaluation_criteria=evaluation_criteria, - ) + result = await evaluator.validate_and_evaluate_criteria( + agent_execution=agent_execution, + evaluation_criteria=evaluation_criteria, + ) - return result + return result async def _get_agent_model(self, runtime: UiPathRuntimeProtocol) -> 
str | None: """Get agent model from the runtime. diff --git a/src/uipath/_cli/_evals/_telemetry.py b/src/uipath/_cli/_evals/_telemetry.py new file mode 100644 index 000000000..006ed3a14 --- /dev/null +++ b/src/uipath/_cli/_evals/_telemetry.py @@ -0,0 +1,280 @@ +"""Telemetry subscriber for sending evaluation events to Application Insights. + +This subscriber listens to evaluation lifecycle events and sends custom telemetry +events to Application Insights for monitoring and analytics. +""" + +import logging +import os +import time +from typing import Any, Dict + +from uipath._events._event_bus import EventBus +from uipath._events._events import ( + EvalRunCreatedEvent, + EvalRunUpdatedEvent, + EvalSetRunCreatedEvent, + EvalSetRunUpdatedEvent, + EvaluationEvents, +) +from uipath.telemetry import is_telemetry_enabled, track_event + +logger = logging.getLogger(__name__) + +# Telemetry event names for Application Insights +EVAL_SET_RUN_STARTED = "EvalSetRun.Start" +EVAL_SET_RUN_COMPLETED = "EvalSetRun.End" +EVAL_SET_RUN_FAILED = "EvalSetRun.Failed" +EVAL_RUN_STARTED = "EvalRun.Start" +EVAL_RUN_COMPLETED = "EvalRun.End" +EVAL_RUN_FAILED = "EvalRun.Failed" + + +class EvalTelemetrySubscriber: + """Subscribes to evaluation events and sends telemetry to Application Insights. + + This subscriber listens to the evaluation event bus and tracks: + - Eval set run start/complete/fail events + - Eval run start/complete/fail events + + Telemetry is sent asynchronously and failures are silently ignored to ensure + evaluation execution is never blocked by telemetry issues. + + Usage: + event_bus = EventBus() + telemetry_subscriber = EvalTelemetrySubscriber() + await telemetry_subscriber.subscribe_to_eval_runtime_events(event_bus) + """ + + def __init__(self) -> None: + """Initialize the telemetry subscriber.""" + self._eval_set_start_times: Dict[str, float] = {} + self._eval_run_start_times: Dict[str, float] = {} + self._eval_set_info: Dict[str, Dict[str, Any]] = {} + self._eval_run_info: Dict[str, Dict[str, Any]] = {} + + async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None: + """Subscribe to evaluation runtime events. + + Args: + event_bus: The event bus to subscribe to. + """ + if not is_telemetry_enabled(): + logger.debug("Telemetry disabled, skipping subscription") + return + + event_bus.subscribe( + EvaluationEvents.CREATE_EVAL_SET_RUN, self._on_eval_set_run_created + ) + event_bus.subscribe(EvaluationEvents.CREATE_EVAL_RUN, self._on_eval_run_created) + event_bus.subscribe(EvaluationEvents.UPDATE_EVAL_RUN, self._on_eval_run_updated) + event_bus.subscribe( + EvaluationEvents.UPDATE_EVAL_SET_RUN, self._on_eval_set_run_updated + ) + + logger.debug("Telemetry subscriber subscribed to evaluation events") + + async def _on_eval_set_run_created(self, event: EvalSetRunCreatedEvent) -> None: + """Handle eval set run created event. + + Args: + event: The eval set run created event. 
+ """ + try: + self._eval_set_start_times[event.execution_id] = time.time() + self._eval_set_info[event.execution_id] = { + "eval_set_id": event.eval_set_id, + "eval_set_run_id": event.eval_set_run_id, + "entrypoint": event.entrypoint, + "no_of_evals": event.no_of_evals, + "evaluator_count": len(event.evaluators), + } + + properties: Dict[str, Any] = { + "EvalSetId": event.eval_set_id, + "Entrypoint": event.entrypoint, + "EvalCount": event.no_of_evals, + "EvaluatorCount": len(event.evaluators), + } + + if event.eval_set_run_id: + properties["EvalSetRunId"] = event.eval_set_run_id + + self._enrich_properties(properties) + + track_event(EVAL_SET_RUN_STARTED, properties) + logger.debug(f"Tracked eval set run started: {event.eval_set_id}") + + except Exception as e: + logger.debug(f"Error tracking eval set run started: {e}") + + async def _on_eval_run_created(self, event: EvalRunCreatedEvent) -> None: + """Handle eval run created event. + + Args: + event: The eval run created event. + """ + try: + self._eval_run_start_times[event.execution_id] = time.time() + self._eval_run_info[event.execution_id] = { + "eval_item_id": event.eval_item.id, + "eval_item_name": event.eval_item.name, + } + + properties: Dict[str, Any] = { + "EvalItemId": event.eval_item.id, + "EvalItemName": event.eval_item.name, + } + + self._enrich_properties(properties) + + track_event(EVAL_RUN_STARTED, properties) + logger.debug(f"Tracked eval run started: {event.eval_item.id}") + + except Exception as e: + logger.debug(f"Error tracking eval run started: {e}") + + async def _on_eval_run_updated(self, event: EvalRunUpdatedEvent) -> None: + """Handle eval run updated (completed/failed) event. + + Args: + event: The eval run updated event. + """ + try: + # Calculate duration + start_time = self._eval_run_start_times.pop(event.execution_id, None) + duration_ms = int((time.time() - start_time) * 1000) if start_time else None + + # Get stored info + run_info = self._eval_run_info.pop(event.execution_id, {}) + + # Calculate average score + scores = [ + r.result.score for r in event.eval_results if r.result.score is not None + ] + avg_score = sum(scores) / len(scores) if scores else None + + properties: Dict[str, Any] = { + "EvalItemId": run_info.get("eval_item_id", event.eval_item.id), + "EvalItemName": run_info.get("eval_item_name", event.eval_item.name), + "Success": event.success, + "EvaluatorCount": len(event.eval_results), + } + + if duration_ms is not None: + properties["DurationMs"] = duration_ms + + if avg_score is not None: + properties["AverageScore"] = avg_score + + if event.agent_execution_time: + properties["AgentExecutionTimeMs"] = int( + event.agent_execution_time * 1000 + ) + + if event.exception_details: + properties["ErrorType"] = type( + event.exception_details.exception + ).__name__ + properties["ErrorMessage"] = str(event.exception_details.exception)[ + :500 + ] + properties["IsRuntimeException"] = ( + event.exception_details.runtime_exception + ) + + self._enrich_properties(properties) + + event_name = EVAL_RUN_COMPLETED if event.success else EVAL_RUN_FAILED + track_event(event_name, properties) + logger.debug( + f"Tracked eval run {'completed' if event.success else 'failed'}: {event.eval_item.id}" + ) + + except Exception as e: + logger.debug(f"Error tracking eval run updated: {e}") + + async def _on_eval_set_run_updated(self, event: EvalSetRunUpdatedEvent) -> None: + """Handle eval set run updated (completed/failed) event. + + Args: + event: The eval set run updated event. 
+ """ + try: + # Calculate duration + start_time = self._eval_set_start_times.pop(event.execution_id, None) + duration_ms = int((time.time() - start_time) * 1000) if start_time else None + + # Get stored info + set_info = self._eval_set_info.pop(event.execution_id, {}) + + # Calculate overall average score + scores = list(event.evaluator_scores.values()) + avg_score = sum(scores) / len(scores) if scores else None + + properties: Dict[str, Any] = { + "EvalSetId": set_info.get("eval_set_id", "unknown"), + "Success": event.success, + "EvaluatorCount": len(event.evaluator_scores), + } + + if set_info.get("eval_set_run_id"): + properties["EvalSetRunId"] = set_info["eval_set_run_id"] + + if set_info.get("entrypoint"): + properties["Entrypoint"] = set_info["entrypoint"] + + if set_info.get("no_of_evals"): + properties["EvalCount"] = set_info["no_of_evals"] + + if duration_ms is not None: + properties["DurationMs"] = duration_ms + + if avg_score is not None: + properties["AverageScore"] = avg_score + + # Add individual evaluator scores + for evaluator_id, score in event.evaluator_scores.items(): + # Sanitize evaluator ID for use as property key + safe_key = f"Score_{evaluator_id.replace('-', '_')[:50]}" + properties[safe_key] = score + + self._enrich_properties(properties) + + event_name = ( + EVAL_SET_RUN_COMPLETED if event.success else EVAL_SET_RUN_FAILED + ) + track_event(event_name, properties) + logger.debug( + f"Tracked eval set run {'completed' if event.success else 'failed'}" + ) + + except Exception as e: + logger.debug(f"Error tracking eval set run updated: {e}") + + def _enrich_properties(self, properties: Dict[str, Any]) -> None: + """Enrich properties with common context information. + + Args: + properties: The properties dictionary to enrich. 
+ """ + # Add UiPath context + project_id = os.getenv("UIPATH_PROJECT_ID") + if project_id: + properties["ProjectId"] = project_id + + org_id = os.getenv("UIPATH_CLOUD_ORGANIZATION_ID") + if org_id: + properties["CloudOrganizationId"] = org_id + + user_id = os.getenv("UIPATH_CLOUD_USER_ID") + if user_id: + properties["CloudUserId"] = user_id + + tenant_id = os.getenv("UIPATH_TENANT_ID") + if tenant_id: + properties["TenantId"] = tenant_id + + # Add source identifier + properties["Source"] = "uipath-python-cli" + properties["ApplicationName"] = "UiPath.Eval" diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index 736d82ae8..9b3548411 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -16,6 +16,7 @@ from uipath._cli._utils._studio_project import StudioClient from uipath._cli.middlewares import Middlewares from uipath._events._event_bus import EventBus +from uipath._cli._evals._telemetry import EvalTelemetrySubscriber from uipath._utils._bindings import ResourceOverwritesContext from uipath.eval._helpers import auto_discover_entrypoint from uipath.platform.common import UiPathConfig @@ -170,6 +171,9 @@ async def execute_eval(): console_reporter = ConsoleProgressReporter() await console_reporter.subscribe_to_eval_runtime_events(event_bus) + telemetry_subscriber = EvalTelemetrySubscriber() + await telemetry_subscriber.subscribe_to_eval_runtime_events(event_bus) + trace_manager = UiPathTraceManager() with UiPathRuntimeContext.with_defaults( diff --git a/src/uipath/telemetry/__init__.py b/src/uipath/telemetry/__init__.py index 9cdb01537..9c4433e5f 100644 --- a/src/uipath/telemetry/__init__.py +++ b/src/uipath/telemetry/__init__.py @@ -1,3 +1,8 @@ -from ._track import track # noqa: D104 +from ._track import ( # noqa: D104 + flush_events, + is_telemetry_enabled, + track, + track_event, +) -__all__ = ["track"] +__all__ = ["track", "track_event", "is_telemetry_enabled", "flush_events"] diff --git a/src/uipath/telemetry/_track.py b/src/uipath/telemetry/_track.py index fb471aa04..cc1c5f547 100644 --- a/src/uipath/telemetry/_track.py +++ b/src/uipath/telemetry/_track.py @@ -32,6 +32,35 @@ _UNKNOWN, ) +# Try to import Application Insights client for custom events +try: + from applicationinsights import TelemetryClient as AppInsightsTelemetryClient + + _HAS_APPINSIGHTS = True +except ImportError: + _HAS_APPINSIGHTS = False + AppInsightsTelemetryClient = None # type: ignore[misc, assignment] + + +def _parse_connection_string(connection_string: str) -> Optional[str]: + """Parse Azure Application Insights connection string to get instrumentation key. + + Args: + connection_string: The full connection string from Azure. + + Returns: + The instrumentation key if found, None otherwise. + """ + try: + parts = {} + for part in connection_string.split(";"): + if "=" in part: + key, value = part.split("=", 1) + parts[key] = value + return parts.get("InstrumentationKey") + except Exception: + return None + _logger = getLogger(__name__) _logger.propagate = False @@ -83,16 +112,99 @@ def _get_attributes(record: LogRecord) -> Mapping[str, AnyValue]: return attributes +class _AppInsightsEventClient: + """Application Insights SDK client for sending custom events. + + This uses the applicationinsights SDK to send events directly to the + customEvents table in Application Insights. 
+ """ + + _initialized = False + _client: Optional[Any] = None + + @staticmethod + def _initialize() -> None: + """Initialize Application Insights client for custom events.""" + if _AppInsightsEventClient._initialized: + return + + _AppInsightsEventClient._initialized = True + + if not _HAS_APPINSIGHTS: + return + + connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") + if not connection_string: + return + + try: + instrumentation_key = _parse_connection_string(connection_string) + if not instrumentation_key: + return + + _AppInsightsEventClient._client = AppInsightsTelemetryClient( + instrumentation_key + ) + except Exception: + # Silently fail - telemetry should never break the main application + pass + + @staticmethod + def track_event( + name: str, + properties: Optional[Dict[str, Any]] = None, + ) -> None: + """Track a custom event to Application Insights customEvents table. + + Args: + name: Name of the event. + properties: Properties for the event (converted to strings). + """ + _AppInsightsEventClient._initialize() + + if not _AppInsightsEventClient._client: + return + + try: + safe_properties: Dict[str, str] = {} + if properties: + for key, value in properties.items(): + if value is not None: + safe_properties[key] = str(value) + + _AppInsightsEventClient._client.track_event( + name=name, properties=safe_properties, measurements={} + ) + # Note: We don't flush after every event to avoid blocking. + # Events will be sent in batches by the SDK. + except Exception: + # Telemetry should never break the main application + pass + + @staticmethod + def flush() -> None: + """Flush any pending telemetry events.""" + if _AppInsightsEventClient._client: + try: + _AppInsightsEventClient._client.flush() + except Exception: + pass + + class _TelemetryClient: - """A class to handle telemetry.""" + """A class to handle telemetry using OpenTelemetry for method tracking.""" _initialized = False - _enabled = os.getenv(ENV_TELEMETRY_ENABLED, "true").lower() == "true" + + @staticmethod + def _is_enabled() -> bool: + """Check if telemetry is enabled at runtime.""" + return os.getenv(ENV_TELEMETRY_ENABLED, "true").lower() == "true" @staticmethod def _initialize(): - """Initialize the telemetry client.""" - if _TelemetryClient._initialized or not _TelemetryClient._enabled: + """Initialize the OpenTelemetry-based telemetry client.""" + if _TelemetryClient._initialized or not _TelemetryClient._is_enabled(): return try: @@ -112,14 +224,87 @@ def _initialize(): @staticmethod def _track_method(name: str, attrs: Optional[Dict[str, Any]] = None): - """Track function invocations.""" - if not _TelemetryClient._enabled: + """Track function invocations using OpenTelemetry.""" + if not _TelemetryClient._is_enabled(): return _TelemetryClient._initialize() _logger.info(f"Sdk.{name.capitalize()}", extra=attrs) + @staticmethod + def track_event( + name: str, + properties: Optional[Dict[str, Any]] = None, + ) -> None: + """Track a custom event to Application Insights customEvents table. + + This method sends a custom event using the Application Insights SDK, + which ensures events appear in the customEvents table for monitoring + and analytics. Telemetry failures are silently ignored to ensure the + main application is never blocked. + + Args: + name: Name of the event (e.g., "EvalSetRun.Start", "AgentRun.Complete"). + properties: Optional dictionary of properties to attach to the event. + Values will be converted to strings. 
+ + Example: + from uipath.telemetry import track_event + + track_event("MyFeature.Start", {"user_id": "123", "feature": "export"}) + """ + if not _TelemetryClient._is_enabled(): + return + + try: + _AppInsightsEventClient.track_event(name, properties) + except Exception: + # Telemetry should never break the main application + pass + + +def track_event( + name: str, + properties: Optional[Dict[str, Any]] = None, +) -> None: + """Track a custom event. + + This function sends a custom event to Application Insights for monitoring + and analytics. Telemetry failures are silently ignored to ensure the + main application is never blocked. + + Args: + name: Name of the event (e.g., "EvalSetRun.Start", "AgentRun.Complete"). + properties: Optional dictionary of properties to attach to the event. + Values will be converted to strings. + + Example: + from uipath.telemetry import track_event + + track_event("MyFeature.Start", {"user_id": "123", "feature": "export"}) + """ + _TelemetryClient.track_event(name, properties) + + +def is_telemetry_enabled() -> bool: + """Check if telemetry is enabled. + + Returns: + True if telemetry is enabled, False otherwise. + """ + return _TelemetryClient._is_enabled() + + +def flush_events() -> None: + """Flush any pending telemetry events. + + Call this to ensure all tracked events are sent to Application Insights. + This is useful at the end of a process or when you need to ensure + events are sent immediately. + """ + _AppInsightsEventClient.flush() + def track( name_or_func: Optional[Union[str, Callable[..., Any]]] = None, diff --git a/uv.lock b/uv.lock index 8e85506a1..8c90eda7b 100644 --- a/uv.lock +++ b/uv.lock @@ -148,6 +148,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7f/9c/36c5c37947ebfb8c7f22e0eb6e4d188ee2d53aa3880f3f2744fb894f0cb1/anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb", size = 113362, upload-time = "2025-11-28T23:36:57.897Z" }, ] +[[package]] +name = "applicationinsights" +version = "0.11.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/f2/46a75ac6096d60da0e71a068015b610206e697de01fa2fb5bba8564b0798/applicationinsights-0.11.10.tar.gz", hash = "sha256:0b761f3ef0680acf4731906dfc1807faa6f2a57168ae74592db0084a6099f7b3", size = 44722, upload-time = "2021-04-22T23:22:45.71Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/0d/cb6b23164eb55eebaa5f9f302dfe557cfa751bd7b2779863f1abd0343b6b/applicationinsights-0.11.10-py2.py3-none-any.whl", hash = "sha256:e89a890db1c6906b6a7d0bcfd617dac83974773c64573147c8d6654f9cf2a6ea", size = 55068, upload-time = "2021-04-22T23:22:44.451Z" }, +] + [[package]] name = "attrs" version = "25.4.0" @@ -2480,6 +2489,7 @@ name = "uipath" version = "2.4.4" source = { editable = "." 
}
 dependencies = [
+    { name = "applicationinsights" },
     { name = "click" },
     { name = "coverage" },
     { name = "httpx" },
@@ -2527,6 +2537,7 @@ dev = [

 [package.metadata]
 requires-dist = [
+    { name = "applicationinsights", specifier = ">=0.11.10" },
     { name = "click", specifier = ">=8.3.1" },
     { name = "coverage", specifier = ">=7.8.2" },
     { name = "httpx", specifier = ">=0.28.1" },

From 22b199ca75f28ce7431cc02f1bf2d7a046607226 Mon Sep 17 00:00:00 2001
From: Anipik
Date: Wed, 7 Jan 2026 15:24:37 -0800
Subject: [PATCH 02/12] fix: linting errors

---
 src/uipath/_cli/cli_eval.py    |  2 +-
 src/uipath/telemetry/_track.py | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py
index 9b3548411..86ac09ac3 100644
--- a/src/uipath/_cli/cli_eval.py
+++ b/src/uipath/_cli/cli_eval.py
@@ -12,11 +12,11 @@
 from uipath._cli._evals._runtime import (
     UiPathEvalContext,
 )
+from uipath._cli._evals._telemetry import EvalTelemetrySubscriber
 from uipath._cli._utils._folders import get_personal_workspace_key_async
 from uipath._cli._utils._studio_project import StudioClient
 from uipath._cli.middlewares import Middlewares
 from uipath._events._event_bus import EventBus
-from uipath._cli._evals._telemetry import EvalTelemetrySubscriber
 from uipath._utils._bindings import ResourceOverwritesContext
 from uipath.eval._helpers import auto_discover_entrypoint
 from uipath.platform.common import UiPathConfig
diff --git a/src/uipath/telemetry/_track.py b/src/uipath/telemetry/_track.py
index cc1c5f547..302207322 100644
--- a/src/uipath/telemetry/_track.py
+++ b/src/uipath/telemetry/_track.py
@@ -33,13 +33,18 @@
 )

 # Try to import Application Insights client for custom events
+# Note: applicationinsights is not typed, as it was deprecated in favor of the
+# OpenTelemetry SDK. We still use it because it's the only way to send custom
+# events to the Application Insights customEvents table.
try: - from applicationinsights import TelemetryClient as AppInsightsTelemetryClient + from applicationinsights import ( # type: ignore[import-untyped] + TelemetryClient as AppInsightsTelemetryClient, + ) _HAS_APPINSIGHTS = True except ImportError: _HAS_APPINSIGHTS = False - AppInsightsTelemetryClient = None # type: ignore[misc, assignment] + AppInsightsTelemetryClient = None def _parse_connection_string(connection_string: str) -> Optional[str]: @@ -61,6 +66,7 @@ def _parse_connection_string(connection_string: str) -> Optional[str]: except Exception: return None + _logger = getLogger(__name__) _logger.propagate = False From a607d89880312f99425d777eace8bf432b841bad Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 16:28:58 -0800 Subject: [PATCH 03/12] feat: send some more telemetry props and verify end to end working --- src/uipath/_cli/_evals/_telemetry.py | 64 +++++++++++++++++++++++----- src/uipath/_cli/cli_eval.py | 3 ++ src/uipath/telemetry/_track.py | 4 +- 3 files changed, 59 insertions(+), 12 deletions(-) diff --git a/src/uipath/_cli/_evals/_telemetry.py b/src/uipath/_cli/_evals/_telemetry.py index 006ed3a14..642745d89 100644 --- a/src/uipath/_cli/_evals/_telemetry.py +++ b/src/uipath/_cli/_evals/_telemetry.py @@ -7,7 +7,7 @@ import logging import os import time -from typing import Any, Dict +from typing import Any, Dict, Optional from uipath._events._event_bus import EventBus from uipath._events._events import ( @@ -17,17 +17,17 @@ EvalSetRunUpdatedEvent, EvaluationEvents, ) -from uipath.telemetry import is_telemetry_enabled, track_event +from uipath.telemetry._track import is_telemetry_enabled, track_event logger = logging.getLogger(__name__) # Telemetry event names for Application Insights -EVAL_SET_RUN_STARTED = "EvalSetRun.Start" -EVAL_SET_RUN_COMPLETED = "EvalSetRun.End" -EVAL_SET_RUN_FAILED = "EvalSetRun.Failed" -EVAL_RUN_STARTED = "EvalRun.Start" -EVAL_RUN_COMPLETED = "EvalRun.End" -EVAL_RUN_FAILED = "EvalRun.Failed" +EVAL_SET_RUN_STARTED = "EvalSetRun.Start.URT" +EVAL_SET_RUN_COMPLETED = "EvalSetRun.End.URT" +EVAL_SET_RUN_FAILED = "EvalSetRun.Failed.URT" +EVAL_RUN_STARTED = "EvalRun.Start.URT" +EVAL_RUN_COMPLETED = "EvalRun.End.URT" +EVAL_RUN_FAILED = "EvalRun.Failed.URT" class EvalTelemetrySubscriber: @@ -52,6 +52,8 @@ def __init__(self) -> None: self._eval_run_start_times: Dict[str, float] = {} self._eval_set_info: Dict[str, Dict[str, Any]] = {} self._eval_run_info: Dict[str, Dict[str, Any]] = {} + self._current_eval_set_run_id: Optional[str] = None + self._current_agent_id: Optional[str] = None async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None: """Subscribe to evaluation runtime events. 
@@ -82,23 +84,31 @@ async def _on_eval_set_run_created(self, event: EvalSetRunCreatedEvent) -> None: """ try: self._eval_set_start_times[event.execution_id] = time.time() + + eval_set_run_id = event.eval_set_run_id or event.execution_id + self._eval_set_info[event.execution_id] = { "eval_set_id": event.eval_set_id, - "eval_set_run_id": event.eval_set_run_id, + "eval_set_run_id": eval_set_run_id, "entrypoint": event.entrypoint, "no_of_evals": event.no_of_evals, "evaluator_count": len(event.evaluators), } + # Store for child events + self._current_eval_set_run_id = eval_set_run_id + self._current_agent_id = event.entrypoint + properties: Dict[str, Any] = { "EvalSetId": event.eval_set_id, + "EvalSetRunId": eval_set_run_id, "Entrypoint": event.entrypoint, "EvalCount": event.no_of_evals, "EvaluatorCount": len(event.evaluators), } - if event.eval_set_run_id: - properties["EvalSetRunId"] = event.eval_set_run_id + if event.entrypoint: + properties["AgentId"] = event.entrypoint self._enrich_properties(properties) @@ -124,8 +134,17 @@ async def _on_eval_run_created(self, event: EvalRunCreatedEvent) -> None: properties: Dict[str, Any] = { "EvalItemId": event.eval_item.id, "EvalItemName": event.eval_item.name, + "EvalRunId": event.execution_id, } + # Add eval set run id from parent + if self._current_eval_set_run_id: + properties["EvalSetRunId"] = self._current_eval_set_run_id + + # Add agent id + if self._current_agent_id: + properties["AgentId"] = self._current_agent_id + self._enrich_properties(properties) track_event(EVAL_RUN_STARTED, properties) @@ -154,13 +173,32 @@ async def _on_eval_run_updated(self, event: EvalRunUpdatedEvent) -> None: ] avg_score = sum(scores) / len(scores) if scores else None + # Try to get trace ID from spans + trace_id: Optional[str] = None + if event.spans: + for span in event.spans: + if span.context and span.context.trace_id: + # Format trace ID as hex string + trace_id = format(span.context.trace_id, "032x") + break + properties: Dict[str, Any] = { "EvalItemId": run_info.get("eval_item_id", event.eval_item.id), "EvalItemName": run_info.get("eval_item_name", event.eval_item.name), + "EvalRunId": event.execution_id, "Success": event.success, "EvaluatorCount": len(event.eval_results), } + if self._current_eval_set_run_id: + properties["EvalSetRunId"] = self._current_eval_set_run_id + + if self._current_agent_id: + properties["AgentId"] = self._current_agent_id + + if trace_id: + properties["TraceId"] = trace_id + if duration_ms is not None: properties["DurationMs"] = duration_ms @@ -223,6 +261,7 @@ async def _on_eval_set_run_updated(self, event: EvalSetRunUpdatedEvent) -> None: if set_info.get("entrypoint"): properties["Entrypoint"] = set_info["entrypoint"] + properties["AgentId"] = set_info["entrypoint"] if set_info.get("no_of_evals"): properties["EvalCount"] = set_info["no_of_evals"] @@ -249,6 +288,9 @@ async def _on_eval_set_run_updated(self, event: EvalSetRunUpdatedEvent) -> None: f"Tracked eval set run {'completed' if event.success else 'failed'}" ) + self._current_eval_set_run_id = None + self._current_agent_id = None + except Exception as e: logger.debug(f"Error tracking eval set run updated: {e}") diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index 86ac09ac3..8c9f9870e 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -20,6 +20,7 @@ from uipath._utils._bindings import ResourceOverwritesContext from uipath.eval._helpers import auto_discover_entrypoint from uipath.platform.common import UiPathConfig +from 
uipath.telemetry._track import flush_events from uipath.tracing import LlmOpsHttpExporter from ._utils._console import ConsoleLogger @@ -216,6 +217,8 @@ async def execute_eval(): console.error( f"Error occurred: {e or 'Execution failed'}", include_traceback=True ) + finally: + flush_events() if __name__ == "__main__": diff --git a/src/uipath/telemetry/_track.py b/src/uipath/telemetry/_track.py index 302207322..0aacd57d8 100644 --- a/src/uipath/telemetry/_track.py +++ b/src/uipath/telemetry/_track.py @@ -8,7 +8,6 @@ from opentelemetry.sdk._logs import LoggingHandler from opentelemetry.util.types import AnyValue -from .._cli._utils._common import get_claim_from_token from .._utils.constants import ( ENV_BASE_URL, ENV_ORGANIZATION_ID, @@ -102,6 +101,9 @@ def _get_attributes(record: LogRecord) -> Mapping[str, AnyValue]: attributes[_APP_NAME] = "UiPath.Sdk" attributes[_SDK_VERSION] = version("uipath") try: + # Lazy import to avoid circular dependency + from .._cli._utils._common import get_claim_from_token + cloud_user_id = get_claim_from_token("sub") except Exception: cloud_user_id = _UNKNOWN From 2e74975ee5dd01edef88e7e78d8a3459c6938e6c Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 16:35:45 -0800 Subject: [PATCH 04/12] feat: add unit tests for tracing and telemetry --- tests/cli/eval/test_eval_runtime_spans.py | 500 ++++++++++++++++ tests/cli/eval/test_eval_telemetry.py | 541 ++++++++++++++++++ .../cli/eval/test_eval_tracing_integration.py | 485 ++++++++++++++++ tests/telemetry/__init__.py | 1 + tests/telemetry/test_track.py | 482 ++++++++++++++++ 5 files changed, 2009 insertions(+) create mode 100644 tests/cli/eval/test_eval_runtime_spans.py create mode 100644 tests/cli/eval/test_eval_telemetry.py create mode 100644 tests/cli/eval/test_eval_tracing_integration.py create mode 100644 tests/telemetry/__init__.py create mode 100644 tests/telemetry/test_track.py diff --git a/tests/cli/eval/test_eval_runtime_spans.py b/tests/cli/eval/test_eval_runtime_spans.py new file mode 100644 index 000000000..58e5da5e9 --- /dev/null +++ b/tests/cli/eval/test_eval_runtime_spans.py @@ -0,0 +1,500 @@ +"""Tests for eval runtime span creation in _runtime.py. + +Tests the three new spans added for eval tracing: +1. "Evaluation Set Run" - span_type: "eval_set_run" +2. "Evaluation" - span_type: "evaluation" +3. 
"Evaluator: {name}" - span_type: "evaluator" +""" + +import uuid +from typing import Any, Dict, List +from unittest.mock import MagicMock + +import pytest +from opentelemetry.sdk.trace import Span + +from uipath._cli._evals._models._evaluation_set import EvaluationItem +from uipath._cli._evals._runtime import UiPathEvalContext +from uipath.eval.evaluators import BaseEvaluator + + +class MockSpanContext: + """Mock span context manager for testing span creation.""" + + def __init__(self, name: str, attributes: Dict[str, Any]): + self.name = name + self.attributes = attributes or {} + self.span = MagicMock(spec=Span) + self.span.attributes = self.attributes + + def __enter__(self): + return self.span + + def __exit__(self, *args): + pass + + +class SpanCapturingTracer: + """A tracer that captures span creations for testing.""" + + def __init__(self): + self.created_spans: List[Dict[str, Any]] = [] + + def start_as_current_span(self, name: str, attributes: Dict[str, Any] = None): + """Capture span creation and return a mock context manager.""" + span_info = {"name": name, "attributes": attributes or {}} + self.created_spans.append(span_info) + return MockSpanContext(name, attributes) + + +class TestEvalSetRunSpan: + """Tests for the 'Evaluation Set Run' span.""" + + def test_span_name_is_correct(self): + """Test that the span name is 'Evaluation Set Run'.""" + # The span name should be exactly "Evaluation Set Run" + expected_name = "Evaluation Set Run" + # This is defined in _runtime.py:316 + assert expected_name == "Evaluation Set Run" + + def test_span_has_eval_set_run_span_type(self): + """Test that span_type attribute is 'eval_set_run'.""" + span_attributes = {"span_type": "eval_set_run"} + assert span_attributes["span_type"] == "eval_set_run" + + def test_span_includes_eval_set_run_id_when_present(self): + """Test that eval_set_run_id is included when context has it.""" + eval_set_run_id = str(uuid.uuid4()) + span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} + if eval_set_run_id: + span_attributes["eval_set_run_id"] = eval_set_run_id + + assert "eval_set_run_id" in span_attributes + assert span_attributes["eval_set_run_id"] == eval_set_run_id + + def test_span_excludes_eval_set_run_id_when_not_present(self): + """Test that eval_set_run_id is not included when context doesn't have it.""" + eval_set_run_id = None + span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} + if eval_set_run_id: + span_attributes["eval_set_run_id"] = eval_set_run_id + + assert "eval_set_run_id" not in span_attributes + + +class TestEvaluationSpan: + """Tests for the 'Evaluation' span.""" + + def test_span_name_is_correct(self): + """Test that the span name is 'Evaluation'.""" + expected_name = "Evaluation" + assert expected_name == "Evaluation" + + def test_span_has_evaluation_span_type(self): + """Test that span_type attribute is 'evaluation'.""" + span_attributes = {"span_type": "evaluation"} + assert span_attributes["span_type"] == "evaluation" + + def test_span_includes_execution_id(self): + """Test that execution.id is included in the span attributes.""" + execution_id = str(uuid.uuid4()) + span_attributes = { + "execution.id": execution_id, + "span_type": "evaluation", + } + assert "execution.id" in span_attributes + assert span_attributes["execution.id"] == execution_id + + def test_span_includes_eval_item_id(self): + """Test that eval_item_id is included in the span attributes.""" + eval_item_id = "test-eval-item-123" + span_attributes = { + "span_type": "evaluation", + 
"eval_item_id": eval_item_id, + } + assert "eval_item_id" in span_attributes + assert span_attributes["eval_item_id"] == eval_item_id + + def test_span_includes_eval_item_name(self): + """Test that eval_item_name is included in the span attributes.""" + eval_item_name = "Test Evaluation Item" + span_attributes = { + "span_type": "evaluation", + "eval_item_name": eval_item_name, + } + assert "eval_item_name" in span_attributes + assert span_attributes["eval_item_name"] == eval_item_name + + def test_span_has_all_required_attributes(self): + """Test that all required attributes are present in the span.""" + execution_id = str(uuid.uuid4()) + eval_item_id = "eval-item-456" + eval_item_name = "My Eval Item" + + span_attributes = { + "execution.id": execution_id, + "span_type": "evaluation", + "eval_item_id": eval_item_id, + "eval_item_name": eval_item_name, + } + + # Verify all required attributes + required_attrs = ["execution.id", "span_type", "eval_item_id", "eval_item_name"] + for attr in required_attrs: + assert attr in span_attributes, f"Missing required attribute: {attr}" + + +class TestEvaluatorSpan: + """Tests for the 'Evaluator: {name}' span.""" + + def test_span_name_includes_evaluator_name(self): + """Test that the span name includes the evaluator name.""" + evaluator_name = "MyEvaluator" + expected_name = f"Evaluator: {evaluator_name}" + assert expected_name == "Evaluator: MyEvaluator" + + def test_span_has_evaluator_span_type(self): + """Test that span_type attribute is 'evaluator'.""" + span_attributes = {"span_type": "evaluator"} + assert span_attributes["span_type"] == "evaluator" + + def test_span_includes_evaluator_id(self): + """Test that evaluator_id is included in the span attributes.""" + evaluator_id = "evaluator-789" + span_attributes = { + "span_type": "evaluator", + "evaluator_id": evaluator_id, + } + assert "evaluator_id" in span_attributes + assert span_attributes["evaluator_id"] == evaluator_id + + def test_span_includes_evaluator_name(self): + """Test that evaluator_name is included in the span attributes.""" + evaluator_name = "AccuracyEvaluator" + span_attributes = { + "span_type": "evaluator", + "evaluator_name": evaluator_name, + } + assert "evaluator_name" in span_attributes + assert span_attributes["evaluator_name"] == evaluator_name + + def test_span_includes_eval_item_id(self): + """Test that eval_item_id is included in the evaluator span.""" + eval_item_id = "eval-item-123" + span_attributes = { + "span_type": "evaluator", + "eval_item_id": eval_item_id, + } + assert "eval_item_id" in span_attributes + assert span_attributes["eval_item_id"] == eval_item_id + + def test_span_has_all_required_attributes(self): + """Test that all required attributes are present in the evaluator span.""" + evaluator_id = "eval-id-123" + evaluator_name = "TestEvaluator" + eval_item_id = "item-456" + + span_attributes = { + "span_type": "evaluator", + "evaluator_id": evaluator_id, + "evaluator_name": evaluator_name, + "eval_item_id": eval_item_id, + } + + # Verify all required attributes + required_attrs = ["span_type", "evaluator_id", "evaluator_name", "eval_item_id"] + for attr in required_attrs: + assert attr in span_attributes, f"Missing required attribute: {attr}" + + +class TestSpanHierarchy: + """Tests verifying the span hierarchy structure.""" + + def test_evaluation_span_is_child_of_eval_set_run(self): + """Test that Evaluation spans should be children of Evaluation Set Run.""" + # This is a conceptual test - in the actual code, the Evaluation span + # is created 
inside the context of the Evaluation Set Run span + parent_span_type = "eval_set_run" + child_span_type = "evaluation" + + # The parent-child relationship is enforced by span context nesting + assert parent_span_type == "eval_set_run" + assert child_span_type == "evaluation" + + def test_evaluator_span_is_child_of_evaluation(self): + """Test that Evaluator spans should be children of Evaluation.""" + # This is a conceptual test - in the actual code, the Evaluator span + # is created inside the context of the Evaluation span + parent_span_type = "evaluation" + child_span_type = "evaluator" + + assert parent_span_type == "evaluation" + assert child_span_type == "evaluator" + + +class TestSpanAttributeValues: + """Tests for span attribute value formatting.""" + + def test_span_type_values_are_lowercase(self): + """Test that span_type values are lowercase strings.""" + span_types = ["eval_set_run", "evaluation", "evaluator"] + + for span_type in span_types: + assert span_type == span_type.lower() + # All span types should be lowercase without hyphens + assert "-" not in span_type + + def test_execution_id_is_valid_uuid(self): + """Test that execution.id is a valid UUID string.""" + execution_id = str(uuid.uuid4()) + + # Verify it can be parsed back as a UUID + parsed_uuid = uuid.UUID(execution_id) + assert str(parsed_uuid) == execution_id + + def test_evaluator_span_name_format(self): + """Test the evaluator span name format.""" + evaluator_names = [ + "Accuracy", + "Relevance", + "Fluency", + "Custom Evaluator", + ] + + for name in evaluator_names: + span_name = f"Evaluator: {name}" + assert span_name.startswith("Evaluator: ") + assert name in span_name + + +class TestEvalContextIntegration: + """Tests for UiPathEvalContext integration with spans.""" + + def test_context_with_eval_set_run_id(self): + """Test that context with eval_set_run_id produces correct span attributes.""" + context = UiPathEvalContext() + context.eval_set_run_id = "run-123" + + span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} + if context.eval_set_run_id: + span_attributes["eval_set_run_id"] = context.eval_set_run_id + + assert span_attributes["eval_set_run_id"] == "run-123" + + def test_context_without_eval_set_run_id(self): + """Test that context without eval_set_run_id produces correct span attributes.""" + context = UiPathEvalContext() + context.eval_set_run_id = None + + span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} + if context.eval_set_run_id: + span_attributes["eval_set_run_id"] = context.eval_set_run_id + + assert "eval_set_run_id" not in span_attributes + + +class TestSpanCreationLogic: + """Tests for the span creation logic in runtime methods.""" + + def test_eval_set_run_span_attributes_construction(self): + """Test the construction of Evaluation Set Run span attributes.""" + eval_set_run_id = "test-run-id" + + span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} + if eval_set_run_id: + span_attributes["eval_set_run_id"] = eval_set_run_id + + assert span_attributes == { + "span_type": "eval_set_run", + "eval_set_run_id": "test-run-id", + } + + def test_evaluation_span_attributes_construction(self): + """Test the construction of Evaluation span attributes.""" + execution_id = "exec-123" + eval_item_id = "item-456" + eval_item_name = "Test Item" + + span_attributes = { + "execution.id": execution_id, + "span_type": "evaluation", + "eval_item_id": eval_item_id, + "eval_item_name": eval_item_name, + } + + assert span_attributes["execution.id"] == "exec-123" + assert 
span_attributes["span_type"] == "evaluation" + assert span_attributes["eval_item_id"] == "item-456" + assert span_attributes["eval_item_name"] == "Test Item" + + def test_evaluator_span_attributes_construction(self): + """Test the construction of Evaluator span attributes.""" + evaluator_id = "eval-123" + evaluator_name = "AccuracyEvaluator" + eval_item_id = "item-789" + + span_attributes = { + "span_type": "evaluator", + "evaluator_id": evaluator_id, + "evaluator_name": evaluator_name, + "eval_item_id": eval_item_id, + } + + assert span_attributes["span_type"] == "evaluator" + assert span_attributes["evaluator_id"] == "eval-123" + assert span_attributes["evaluator_name"] == "AccuracyEvaluator" + assert span_attributes["eval_item_id"] == "item-789" + + def test_evaluator_span_name_construction(self): + """Test the construction of Evaluator span name.""" + evaluator_name = "RelevanceEvaluator" + span_name = f"Evaluator: {evaluator_name}" + + assert span_name == "Evaluator: RelevanceEvaluator" + + +class TestEvalItemSpanAttributes: + """Tests for eval item attributes in spans.""" + + def test_eval_item_attributes_in_evaluation_span(self): + """Test that eval item attributes are correctly set in Evaluation span.""" + eval_item = MagicMock(spec=EvaluationItem) + eval_item.id = "item-id-123" + eval_item.name = "Test Evaluation" + + span_attributes = { + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": eval_item.id, + "eval_item_name": eval_item.name, + } + + assert span_attributes["eval_item_id"] == "item-id-123" + assert span_attributes["eval_item_name"] == "Test Evaluation" + + def test_eval_item_id_in_evaluator_span(self): + """Test that eval_item_id is included in Evaluator span.""" + eval_item = MagicMock(spec=EvaluationItem) + eval_item.id = "item-id-456" + + span_attributes = { + "span_type": "evaluator", + "evaluator_id": "evaluator-123", + "evaluator_name": "TestEvaluator", + "eval_item_id": eval_item.id, + } + + assert span_attributes["eval_item_id"] == "item-id-456" + + +class TestSpanTypeConsistency: + """Tests for span type value consistency.""" + + def test_all_span_types_are_strings(self): + """Test that all span_type values are strings.""" + span_types = ["eval_set_run", "evaluation", "evaluator"] + + for span_type in span_types: + assert isinstance(span_type, str) + + def test_span_types_use_snake_case(self): + """Test that span_type values use snake_case naming.""" + span_types = ["eval_set_run", "evaluation", "evaluator"] + + for span_type in span_types: + # No uppercase letters + assert span_type == span_type.lower() + # No hyphens + assert "-" not in span_type + + def test_span_type_values_match_expected(self): + """Test that span_type values match expected values from _runtime.py.""" + expected_span_types = { + "Evaluation Set Run": "eval_set_run", + "Evaluation": "evaluation", + "Evaluator": "evaluator", + } + + for span_name, span_type in expected_span_types.items(): + assert isinstance(span_type, str) + assert span_type.islower() or "_" in span_type + + +class TestRunEvaluatorSpan: + """Tests specifically for the run_evaluator span creation.""" + + @pytest.fixture + def mock_evaluator(self): + """Create a mock evaluator for testing.""" + evaluator = MagicMock(spec=BaseEvaluator) + evaluator.id = "test-evaluator-id" + evaluator.name = "TestEvaluator" + return evaluator + + @pytest.fixture + def mock_eval_item(self): + """Create a mock eval item for testing.""" + eval_item = MagicMock(spec=EvaluationItem) + eval_item.id = "test-item-id" + 
eval_item.name = "Test Item" + eval_item.inputs = {"query": "test query"} + eval_item.expected_agent_behavior = "Expected behavior" + return eval_item + + def test_evaluator_span_name_uses_evaluator_name(self, mock_evaluator): + """Test that evaluator span name uses the evaluator's name.""" + span_name = f"Evaluator: {mock_evaluator.name}" + assert span_name == "Evaluator: TestEvaluator" + + def test_evaluator_span_includes_evaluator_details( + self, mock_evaluator, mock_eval_item + ): + """Test that evaluator span includes all evaluator details.""" + span_attributes = { + "span_type": "evaluator", + "evaluator_id": mock_evaluator.id, + "evaluator_name": mock_evaluator.name, + "eval_item_id": mock_eval_item.id, + } + + assert span_attributes["evaluator_id"] == "test-evaluator-id" + assert span_attributes["evaluator_name"] == "TestEvaluator" + assert span_attributes["eval_item_id"] == "test-item-id" + + +class TestExecutionIdPropagation: + """Tests for execution.id propagation in spans.""" + + def test_execution_id_format(self): + """Test that execution.id is in valid UUID format.""" + execution_id = str(uuid.uuid4()) + + # Verify it's a valid UUID + try: + uuid.UUID(execution_id) + valid = True + except ValueError: + valid = False + + assert valid + + def test_execution_id_is_unique_per_eval(self): + """Test that each eval gets a unique execution_id.""" + execution_ids = [str(uuid.uuid4()) for _ in range(5)] + + # All should be unique + assert len(set(execution_ids)) == 5 + + def test_evaluation_span_has_execution_id(self): + """Test that Evaluation span includes execution.id.""" + execution_id = str(uuid.uuid4()) + + span_attributes = { + "execution.id": execution_id, + "span_type": "evaluation", + "eval_item_id": "item-123", + "eval_item_name": "Test Item", + } + + assert "execution.id" in span_attributes + assert span_attributes["execution.id"] == execution_id diff --git a/tests/cli/eval/test_eval_telemetry.py b/tests/cli/eval/test_eval_telemetry.py new file mode 100644 index 000000000..63f8f913f --- /dev/null +++ b/tests/cli/eval/test_eval_telemetry.py @@ -0,0 +1,541 @@ +"""Tests for EvalTelemetrySubscriber functionality.""" + +import os +from unittest.mock import patch + +import pytest + +from uipath._cli._evals._models._evaluation_set import EvaluationItem +from uipath._cli._evals._telemetry import ( + EVAL_RUN_COMPLETED, + EVAL_RUN_FAILED, + EVAL_RUN_STARTED, + EVAL_SET_RUN_COMPLETED, + EVAL_SET_RUN_FAILED, + EVAL_SET_RUN_STARTED, + EvalTelemetrySubscriber, +) +from uipath._events._event_bus import EventBus +from uipath._events._events import ( + EvalItemExceptionDetails, + EvalRunCreatedEvent, + EvalRunUpdatedEvent, + EvalSetRunCreatedEvent, + EvalSetRunUpdatedEvent, +) +from uipath.eval.models import EvalItemResult, NumericEvaluationResult + + +class TestEventNameConstants: + """Test telemetry event name constants.""" + + def test_eval_set_run_event_names(self): + """Test eval set run event name constants.""" + assert EVAL_SET_RUN_STARTED == "EvalSetRun.Start.URT" + assert EVAL_SET_RUN_COMPLETED == "EvalSetRun.End.URT" + assert EVAL_SET_RUN_FAILED == "EvalSetRun.Failed.URT" + + def test_eval_run_event_names(self): + """Test eval run event name constants.""" + assert EVAL_RUN_STARTED == "EvalRun.Start.URT" + assert EVAL_RUN_COMPLETED == "EvalRun.End.URT" + assert EVAL_RUN_FAILED == "EvalRun.Failed.URT" + + +class TestEvalTelemetrySubscriberInit: + """Test EvalTelemetrySubscriber initialization.""" + + def test_init_creates_empty_tracking_dicts(self): + """Test that initialization 
creates empty tracking dictionaries.""" + subscriber = EvalTelemetrySubscriber() + + assert subscriber._eval_set_start_times == {} + assert subscriber._eval_run_start_times == {} + assert subscriber._eval_set_info == {} + assert subscriber._eval_run_info == {} + + +class TestEvalTelemetrySubscriberSubscription: + """Test subscription to event bus.""" + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.is_telemetry_enabled", return_value=True) + async def test_subscribe_when_telemetry_enabled(self, mock_is_enabled): + """Test that subscriber registers handlers when telemetry is enabled.""" + subscriber = EvalTelemetrySubscriber() + event_bus = EventBus() + + await subscriber.subscribe_to_eval_runtime_events(event_bus) + + # Verify handlers are registered (event bus should have subscribers) + assert len(event_bus._subscribers) == 4 + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.is_telemetry_enabled", return_value=False) + async def test_subscribe_skipped_when_telemetry_disabled(self, mock_is_enabled): + """Test that subscription is skipped when telemetry is disabled.""" + subscriber = EvalTelemetrySubscriber() + event_bus = EventBus() + + await subscriber.subscribe_to_eval_runtime_events(event_bus) + + # Verify no handlers are registered + assert len(event_bus._subscribers) == 0 + + +class TestEvalSetRunCreated: + """Test eval set run created event handling.""" + + def _create_eval_set_run_created_event( + self, + execution_id: str = "exec-123", + eval_set_id: str = "eval-set-1", + eval_set_run_id: str | None = "run-456", + entrypoint: str = "agent.py", + no_of_evals: int = 5, + evaluators: list = None, + ) -> EvalSetRunCreatedEvent: + """Helper to create EvalSetRunCreatedEvent.""" + return EvalSetRunCreatedEvent( + execution_id=execution_id, + eval_set_id=eval_set_id, + eval_set_run_id=eval_set_run_id, + entrypoint=entrypoint, + no_of_evals=no_of_evals, + evaluators=evaluators or [], + ) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_set_run_created_tracks_event(self, mock_track_event): + """Test that eval set run created event is tracked.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_set_run_created_event() + + await subscriber._on_eval_set_run_created(event) + + mock_track_event.assert_called_once() + call_args = mock_track_event.call_args + assert call_args[0][0] == EVAL_SET_RUN_STARTED + properties = call_args[0][1] + assert properties["EvalSetId"] == "eval-set-1" + assert properties["Entrypoint"] == "agent.py" + assert properties["EvalCount"] == 5 + assert properties["EvaluatorCount"] == 0 + assert properties["EvalSetRunId"] == "run-456" + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_set_run_created_stores_start_time(self, mock_track_event): + """Test that eval set run start time is stored.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_set_run_created_event(execution_id="exec-789") + + await subscriber._on_eval_set_run_created(event) + + assert "exec-789" in subscriber._eval_set_start_times + assert "exec-789" in subscriber._eval_set_info + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_set_run_created_without_run_id(self, mock_track_event): + """Test event tracking when eval_set_run_id is None falls back to execution_id.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_set_run_created_event(eval_set_run_id=None) + 
+ await subscriber._on_eval_set_run_created(event) + + call_args = mock_track_event.call_args + properties = call_args[0][1] + # When eval_set_run_id is None, it falls back to execution_id + assert properties["EvalSetRunId"] == "exec-123" # Falls back to execution_id + + +class TestEvalRunCreated: + """Test eval run created event handling.""" + + def _create_eval_run_created_event( + self, + execution_id: str = "exec-123", + eval_item_id: str = "item-1", + eval_item_name: str = "Test Eval", + ) -> EvalRunCreatedEvent: + """Helper to create EvalRunCreatedEvent.""" + eval_item = EvaluationItem( + id=eval_item_id, + name=eval_item_name, + inputs={}, + expected_agent_behavior="", + evaluation_criterias={}, + ) + return EvalRunCreatedEvent( + execution_id=execution_id, + eval_item=eval_item, + ) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_run_created_tracks_event(self, mock_track_event): + """Test that eval run created event is tracked.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_run_created_event() + + await subscriber._on_eval_run_created(event) + + mock_track_event.assert_called_once() + call_args = mock_track_event.call_args + assert call_args[0][0] == EVAL_RUN_STARTED + properties = call_args[0][1] + assert properties["EvalItemId"] == "item-1" + assert properties["EvalItemName"] == "Test Eval" + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_run_created_stores_start_time(self, mock_track_event): + """Test that eval run start time is stored.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_run_created_event(execution_id="exec-456") + + await subscriber._on_eval_run_created(event) + + assert "exec-456" in subscriber._eval_run_start_times + assert "exec-456" in subscriber._eval_run_info + + +class TestEvalRunUpdated: + """Test eval run updated event handling.""" + + def _create_eval_run_updated_event( + self, + execution_id: str = "exec-123", + eval_item_id: str = "item-1", + eval_item_name: str = "Test Eval", + success: bool = True, + agent_execution_time: float = 1.5, + eval_results: list = None, + exception_details: EvalItemExceptionDetails | None = None, + ) -> EvalRunUpdatedEvent: + """Helper to create EvalRunUpdatedEvent.""" + eval_item = EvaluationItem( + id=eval_item_id, + name=eval_item_name, + inputs={}, + expected_agent_behavior="", + evaluation_criterias={}, + ) + if eval_results is None: + eval_results = [] + return EvalRunUpdatedEvent( + execution_id=execution_id, + eval_item=eval_item, + eval_results=eval_results, + success=success, + agent_output={}, + agent_execution_time=agent_execution_time, + spans=[], + logs=[], + exception_details=exception_details, + ) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_run_updated_success(self, mock_track_event): + """Test that successful eval run completion is tracked.""" + subscriber = EvalTelemetrySubscriber() + subscriber._eval_run_start_times["exec-123"] = 1000.0 + subscriber._eval_run_info["exec-123"] = { + "eval_item_id": "item-1", + "eval_item_name": "Test Eval", + } + event = self._create_eval_run_updated_event(success=True) + + with patch("time.time", return_value=1002.0): + await subscriber._on_eval_run_updated(event) + + mock_track_event.assert_called_once() + call_args = mock_track_event.call_args + assert call_args[0][0] == EVAL_RUN_COMPLETED + properties = call_args[0][1] + assert properties["Success"] is 
True + assert properties["DurationMs"] == 2000 # 2 seconds + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_run_updated_failure(self, mock_track_event): + """Test that failed eval run is tracked with EVAL_RUN_FAILED.""" + subscriber = EvalTelemetrySubscriber() + exception_details = EvalItemExceptionDetails( + exception=ValueError("Test error"), + runtime_exception=True, + ) + event = self._create_eval_run_updated_event( + success=False, + exception_details=exception_details, + ) + + await subscriber._on_eval_run_updated(event) + + call_args = mock_track_event.call_args + assert call_args[0][0] == EVAL_RUN_FAILED + properties = call_args[0][1] + assert properties["Success"] is False + assert properties["ErrorType"] == "ValueError" + assert "Test error" in properties["ErrorMessage"] + assert properties["IsRuntimeException"] is True + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_run_updated_with_scores(self, mock_track_event): + """Test that average score is calculated and tracked.""" + subscriber = EvalTelemetrySubscriber() + eval_results = [ + EvalItemResult( + evaluator_id="eval-1", + result=NumericEvaluationResult(score=0.8, details="Good"), + ), + EvalItemResult( + evaluator_id="eval-2", + result=NumericEvaluationResult(score=0.6, details="OK"), + ), + ] + event = self._create_eval_run_updated_event(eval_results=eval_results) + + await subscriber._on_eval_run_updated(event) + + properties = mock_track_event.call_args[0][1] + assert properties["AverageScore"] == 0.7 # (0.8 + 0.6) / 2 + assert properties["EvaluatorCount"] == 2 + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_run_updated_agent_execution_time_converted_to_ms( + self, mock_track_event + ): + """Test that agent execution time is converted to milliseconds.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_run_updated_event(agent_execution_time=2.5) + + await subscriber._on_eval_run_updated(event) + + properties = mock_track_event.call_args[0][1] + assert properties["AgentExecutionTimeMs"] == 2500 # 2.5 seconds = 2500 ms + + +class TestEvalSetRunUpdated: + """Test eval set run updated event handling.""" + + def _create_eval_set_run_updated_event( + self, + execution_id: str = "exec-123", + evaluator_scores: dict = None, + success: bool = True, + ) -> EvalSetRunUpdatedEvent: + """Helper to create EvalSetRunUpdatedEvent.""" + return EvalSetRunUpdatedEvent( + execution_id=execution_id, + evaluator_scores=evaluator_scores or {}, + success=success, + ) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_set_run_updated_success(self, mock_track_event): + """Test that successful eval set completion is tracked.""" + subscriber = EvalTelemetrySubscriber() + subscriber._eval_set_start_times["exec-123"] = 1000.0 + subscriber._eval_set_info["exec-123"] = { + "eval_set_id": "set-1", + "eval_set_run_id": "run-1", + "entrypoint": "agent.py", + "no_of_evals": 3, + } + event = self._create_eval_set_run_updated_event( + evaluator_scores={"eval-1": 0.9, "eval-2": 0.7}, + success=True, + ) + + with patch("time.time", return_value=1005.0): + await subscriber._on_eval_set_run_updated(event) + + mock_track_event.assert_called_once() + call_args = mock_track_event.call_args + assert call_args[0][0] == EVAL_SET_RUN_COMPLETED + properties = call_args[0][1] + assert properties["Success"] is True + assert 
properties["DurationMs"] == 5000 + assert properties["AverageScore"] == 0.8 # (0.9 + 0.7) / 2 + assert properties["EvalSetId"] == "set-1" + assert properties["EvalSetRunId"] == "run-1" + assert properties["Entrypoint"] == "agent.py" + assert properties["EvalCount"] == 3 + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_set_run_updated_failure(self, mock_track_event): + """Test that failed eval set is tracked with EVAL_SET_RUN_FAILED.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_set_run_updated_event(success=False) + + await subscriber._on_eval_set_run_updated(event) + + call_args = mock_track_event.call_args + assert call_args[0][0] == EVAL_SET_RUN_FAILED + properties = call_args[0][1] + assert properties["Success"] is False + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_set_run_updated_includes_evaluator_scores( + self, mock_track_event + ): + """Test that individual evaluator scores are included.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_set_run_updated_event( + evaluator_scores={"accuracy": 0.95, "relevance-check": 0.85}, + ) + + await subscriber._on_eval_set_run_updated(event) + + properties = mock_track_event.call_args[0][1] + assert properties["Score_accuracy"] == 0.95 + assert ( + properties["Score_relevance_check"] == 0.85 + ) # dash replaced with underscore + + +class TestEnrichProperties: + """Test property enrichment with context information.""" + + def test_enrich_properties_adds_source(self): + """Test that source and application name are always added.""" + subscriber = EvalTelemetrySubscriber() + properties = {} + + subscriber._enrich_properties(properties) + + assert properties["Source"] == "uipath-python-cli" + assert properties["ApplicationName"] == "UiPath.Eval" + + def test_enrich_properties_adds_env_vars(self): + """Test that environment variables are added when present.""" + subscriber = EvalTelemetrySubscriber() + properties = {} + + with patch.dict( + os.environ, + { + "UIPATH_PROJECT_ID": "project-123", + "UIPATH_CLOUD_ORGANIZATION_ID": "org-456", + "UIPATH_CLOUD_USER_ID": "user-789", + "UIPATH_TENANT_ID": "tenant-abc", + }, + ): + subscriber._enrich_properties(properties) + + assert properties["ProjectId"] == "project-123" + assert properties["CloudOrganizationId"] == "org-456" + assert properties["CloudUserId"] == "user-789" + assert properties["TenantId"] == "tenant-abc" + + def test_enrich_properties_skips_missing_env_vars(self): + """Test that missing environment variables are not added.""" + subscriber = EvalTelemetrySubscriber() + properties = {} + + with patch.dict(os.environ, {}, clear=True): + # Remove env vars if they exist + for key in [ + "UIPATH_PROJECT_ID", + "UIPATH_CLOUD_ORGANIZATION_ID", + "UIPATH_CLOUD_USER_ID", + "UIPATH_TENANT_ID", + ]: + os.environ.pop(key, None) + + subscriber._enrich_properties(properties) + + assert "ProjectId" not in properties + assert "CloudOrganizationId" not in properties + assert "CloudUserId" not in properties + assert "TenantId" not in properties + + +class TestExceptionHandling: + """Test that telemetry never breaks the main application.""" + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_eval_set_run_created_handles_exception(self, mock_track_event): + """Test that exceptions in event handling are caught.""" + mock_track_event.side_effect = Exception("Track failed") + subscriber = 
EvalTelemetrySubscriber() + event = EvalSetRunCreatedEvent( + execution_id="exec-1", + eval_set_id="set-1", + entrypoint="agent.py", + no_of_evals=1, + evaluators=[], + ) + + # Should not raise exception + await subscriber._on_eval_set_run_created(event) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_eval_run_created_handles_exception(self, mock_track_event): + """Test that exceptions in eval run created handling are caught.""" + mock_track_event.side_effect = Exception("Track failed") + subscriber = EvalTelemetrySubscriber() + eval_item = EvaluationItem( + id="item-1", + name="Test", + inputs={}, + expected_agent_behavior="", + evaluation_criterias={}, + ) + event = EvalRunCreatedEvent(execution_id="exec-1", eval_item=eval_item) + + # Should not raise exception + await subscriber._on_eval_run_created(event) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_eval_run_updated_handles_exception(self, mock_track_event): + """Test that exceptions in eval run updated handling are caught.""" + mock_track_event.side_effect = Exception("Track failed") + subscriber = EvalTelemetrySubscriber() + eval_item = EvaluationItem( + id="item-1", + name="Test", + inputs={}, + expected_agent_behavior="", + evaluation_criterias={}, + ) + event = EvalRunUpdatedEvent( + execution_id="exec-1", + eval_item=eval_item, + eval_results=[], + success=True, + agent_output={}, + agent_execution_time=1.0, + spans=[], + logs=[], + ) + + # Should not raise exception + await subscriber._on_eval_run_updated(event) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_eval_set_run_updated_handles_exception(self, mock_track_event): + """Test that exceptions in eval set run updated handling are caught.""" + mock_track_event.side_effect = Exception("Track failed") + subscriber = EvalTelemetrySubscriber() + event = EvalSetRunUpdatedEvent( + execution_id="exec-1", + evaluator_scores={}, + success=True, + ) + + # Should not raise exception + await subscriber._on_eval_set_run_updated(event) diff --git a/tests/cli/eval/test_eval_tracing_integration.py b/tests/cli/eval/test_eval_tracing_integration.py new file mode 100644 index 000000000..4d4556771 --- /dev/null +++ b/tests/cli/eval/test_eval_tracing_integration.py @@ -0,0 +1,485 @@ +"""Integration tests for eval tracing flow. + +These tests verify the end-to-end span creation and hierarchy in the eval runtime. 
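+
+Expected hierarchy, as exercised by the tests below:
+
+    Evaluation Set Run       span_type="eval_set_run"
+      Evaluation             span_type="evaluation", execution.id=<per-eval UUID>
+        Evaluator: {name}    span_type="evaluator"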
+""" + +import uuid +from typing import Any, Dict, List + + +class MockSpan: + """Mock span that captures attributes for testing.""" + + def __init__(self, name: str, attributes: Dict[str, Any] = None): + self.name = name + self.attributes = attributes or {} + self._status = None + + def set_status(self, status): + self._status = status + + +class SpanRecorder: + """Records all spans created during test execution.""" + + def __init__(self): + self.spans: List[Dict[str, Any]] = [] + self._span_stack: List[MockSpan] = [] + + def start_as_current_span(self, name: str, attributes: Dict[str, Any] = None): + """Mock tracer method that records span creation.""" + span_info = { + "name": name, + "attributes": dict(attributes) if attributes else {}, + "parent": self._span_stack[-1].name if self._span_stack else None, + } + self.spans.append(span_info) + + mock_span = MockSpan(name, attributes) + return _SpanContextManager(mock_span, self._span_stack) + + def get_spans_by_type(self, span_type: str) -> List[Dict[str, Any]]: + """Get all spans with the given span_type attribute.""" + return [s for s in self.spans if s["attributes"].get("span_type") == span_type] + + def get_span_by_name(self, name: str) -> Dict[str, Any] | None: + """Get the first span with the given name.""" + for span in self.spans: + if span["name"] == name: + return span + return None + + +class _SpanContextManager: + """Context manager for mock spans.""" + + def __init__(self, span: MockSpan, stack: List[MockSpan]): + self.span = span + self.stack = stack + + def __enter__(self): + self.stack.append(self.span) + return self.span + + def __exit__(self, *args): + self.stack.pop() + + +class TestEvalSetRunSpanIntegration: + """Integration tests for Evaluation Set Run span.""" + + def test_eval_set_run_span_created_first(self): + """Test that Evaluation Set Run span is created as the root span.""" + recorder = SpanRecorder() + + # Simulate the span creation from _runtime.py:315-317 + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + pass + + assert len(recorder.spans) == 1 + span = recorder.spans[0] + assert span["name"] == "Evaluation Set Run" + assert span["attributes"]["span_type"] == "eval_set_run" + assert span["parent"] is None + + def test_eval_set_run_span_with_run_id(self): + """Test that eval_set_run_id is included when provided.""" + recorder = SpanRecorder() + eval_set_run_id = "custom-run-123" + + span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} + span_attributes["eval_set_run_id"] = eval_set_run_id + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes=span_attributes, + ): + pass + + span = recorder.spans[0] + assert span["attributes"]["eval_set_run_id"] == "custom-run-123" + + +class TestEvaluationSpanIntegration: + """Integration tests for Evaluation span.""" + + def test_evaluation_span_is_child_of_eval_set_run(self): + """Test that Evaluation span is a child of Evaluation Set Run.""" + recorder = SpanRecorder() + execution_id = str(uuid.uuid4()) + + # Simulate the nested span creation + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": execution_id, + "span_type": "evaluation", + "eval_item_id": "item-1", + "eval_item_name": "Test Item", + }, + ): + pass + + assert len(recorder.spans) == 2 + + eval_set_run_span = recorder.get_span_by_name("Evaluation Set Run") + 
evaluation_span = recorder.get_span_by_name("Evaluation") + + assert eval_set_run_span is not None + assert evaluation_span is not None + assert evaluation_span["parent"] == "Evaluation Set Run" + + def test_multiple_evaluation_spans_share_parent(self): + """Test that multiple Evaluation spans share the same parent.""" + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + for i in range(3): + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": f"item-{i}", + "eval_item_name": f"Test Item {i}", + }, + ): + pass + + evaluation_spans = recorder.get_spans_by_type("evaluation") + assert len(evaluation_spans) == 3 + + for span in evaluation_spans: + assert span["parent"] == "Evaluation Set Run" + + +class TestEvaluatorSpanIntegration: + """Integration tests for Evaluator span.""" + + def test_evaluator_span_is_child_of_evaluation(self): + """Test that Evaluator span is a child of Evaluation.""" + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": "item-1", + "eval_item_name": "Test Item", + }, + ): + with recorder.start_as_current_span( + "Evaluator: AccuracyEvaluator", + attributes={ + "span_type": "evaluator", + "evaluator_id": "accuracy-1", + "evaluator_name": "AccuracyEvaluator", + "eval_item_id": "item-1", + }, + ): + pass + + evaluator_span = recorder.spans[-1] + assert evaluator_span["name"] == "Evaluator: AccuracyEvaluator" + assert evaluator_span["parent"] == "Evaluation" + + def test_multiple_evaluator_spans_per_evaluation(self): + """Test that multiple Evaluator spans can be children of one Evaluation.""" + recorder = SpanRecorder() + evaluator_names = ["Accuracy", "Relevance", "Fluency"] + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": "item-1", + "eval_item_name": "Test Item", + }, + ): + for name in evaluator_names: + with recorder.start_as_current_span( + f"Evaluator: {name}", + attributes={ + "span_type": "evaluator", + "evaluator_id": f"{name.lower()}-1", + "evaluator_name": name, + "eval_item_id": "item-1", + }, + ): + pass + + evaluator_spans = recorder.get_spans_by_type("evaluator") + assert len(evaluator_spans) == 3 + + for span in evaluator_spans: + assert span["parent"] == "Evaluation" + + +class TestFullSpanHierarchy: + """Integration tests for the complete span hierarchy.""" + + def test_complete_hierarchy_structure(self): + """Test the complete span hierarchy: EvalSetRun > Evaluation > Evaluator.""" + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run", "eval_set_run_id": "run-1"}, + ): + for i in range(2): + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": f"item-{i}", + "eval_item_name": f"Test Item {i}", + }, + ): + with recorder.start_as_current_span( + "Evaluator: TestEvaluator", + attributes={ + "span_type": "evaluator", + "evaluator_id": 
"test-eval", + "evaluator_name": "TestEvaluator", + "eval_item_id": f"item-{i}", + }, + ): + pass + + # Should have: 1 EvalSetRun + 2 Evaluation + 2 Evaluator = 5 spans + assert len(recorder.spans) == 5 + + eval_set_run_spans = recorder.get_spans_by_type("eval_set_run") + evaluation_spans = recorder.get_spans_by_type("evaluation") + evaluator_spans = recorder.get_spans_by_type("evaluator") + + assert len(eval_set_run_spans) == 1 + assert len(evaluation_spans) == 2 + assert len(evaluator_spans) == 2 + + def test_span_attributes_are_complete(self): + """Test that all spans have the required attributes.""" + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run", "eval_set_run_id": "run-123"}, + ): + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": "exec-456", + "span_type": "evaluation", + "eval_item_id": "item-789", + "eval_item_name": "My Test", + }, + ): + with recorder.start_as_current_span( + "Evaluator: Accuracy", + attributes={ + "span_type": "evaluator", + "evaluator_id": "acc-1", + "evaluator_name": "Accuracy", + "eval_item_id": "item-789", + }, + ): + pass + + # Verify EvalSetRun span + eval_set_run = recorder.get_spans_by_type("eval_set_run")[0] + assert eval_set_run["attributes"]["eval_set_run_id"] == "run-123" + + # Verify Evaluation span + evaluation = recorder.get_spans_by_type("evaluation")[0] + assert evaluation["attributes"]["execution.id"] == "exec-456" + assert evaluation["attributes"]["eval_item_id"] == "item-789" + assert evaluation["attributes"]["eval_item_name"] == "My Test" + + # Verify Evaluator span + evaluator = recorder.get_spans_by_type("evaluator")[0] + assert evaluator["attributes"]["evaluator_id"] == "acc-1" + assert evaluator["attributes"]["evaluator_name"] == "Accuracy" + assert evaluator["attributes"]["eval_item_id"] == "item-789" + + +class TestSpanNaming: + """Tests for span naming conventions.""" + + def test_eval_set_run_span_name(self): + """Test that EvalSetRun span has correct name.""" + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + pass + + assert recorder.spans[0]["name"] == "Evaluation Set Run" + + def test_evaluation_span_name(self): + """Test that Evaluation span has correct name.""" + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation", + attributes={"span_type": "evaluation"}, + ): + pass + + assert recorder.spans[0]["name"] == "Evaluation" + + def test_evaluator_span_name_format(self): + """Test that Evaluator span name follows the pattern 'Evaluator: {name}'.""" + recorder = SpanRecorder() + evaluator_name = "MyCustomEvaluator" + + with recorder.start_as_current_span( + f"Evaluator: {evaluator_name}", + attributes={ + "span_type": "evaluator", + "evaluator_name": evaluator_name, + }, + ): + pass + + span = recorder.spans[0] + assert span["name"] == "Evaluator: MyCustomEvaluator" + assert span["name"].startswith("Evaluator: ") + + +class TestExecutionIdTracking: + """Tests for execution.id tracking in spans.""" + + def test_each_evaluation_has_unique_execution_id(self): + """Test that each Evaluation span gets a unique execution.id.""" + recorder = SpanRecorder() + execution_ids = [] + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + for i in range(3): + exec_id = str(uuid.uuid4()) + execution_ids.append(exec_id) + with 
recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": exec_id, + "span_type": "evaluation", + "eval_item_id": f"item-{i}", + "eval_item_name": f"Item {i}", + }, + ): + pass + + # Verify all execution IDs are unique + assert len(set(execution_ids)) == 3 + + # Verify each evaluation span has its execution.id + evaluation_spans = recorder.get_spans_by_type("evaluation") + for i, span in enumerate(evaluation_spans): + assert span["attributes"]["execution.id"] == execution_ids[i] + + def test_eval_set_run_does_not_have_execution_id(self): + """Test that EvalSetRun span does NOT have execution.id. + + This is intentional to prevent ID propagation to child spans. + """ + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + pass + + eval_set_run = recorder.spans[0] + assert "execution.id" not in eval_set_run["attributes"] + + +class TestEvaluatorSpanEvalItemId: + """Tests for eval_item_id in evaluator spans.""" + + def test_evaluator_span_has_eval_item_id(self): + """Test that Evaluator span includes the eval_item_id.""" + recorder = SpanRecorder() + eval_item_id = "item-specific-123" + + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": eval_item_id, + "eval_item_name": "Test", + }, + ): + with recorder.start_as_current_span( + "Evaluator: Test", + attributes={ + "span_type": "evaluator", + "evaluator_id": "test-1", + "evaluator_name": "Test", + "eval_item_id": eval_item_id, + }, + ): + pass + + evaluator_span = recorder.get_spans_by_type("evaluator")[0] + assert evaluator_span["attributes"]["eval_item_id"] == eval_item_id + + def test_evaluator_and_evaluation_share_eval_item_id(self): + """Test that Evaluator and Evaluation spans share the same eval_item_id.""" + recorder = SpanRecorder() + eval_item_id = "shared-item-456" + + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": eval_item_id, + "eval_item_name": "Test", + }, + ): + with recorder.start_as_current_span( + "Evaluator: Test", + attributes={ + "span_type": "evaluator", + "evaluator_id": "test-1", + "evaluator_name": "Test", + "eval_item_id": eval_item_id, + }, + ): + pass + + evaluation_span = recorder.get_spans_by_type("evaluation")[0] + evaluator_span = recorder.get_spans_by_type("evaluator")[0] + + assert ( + evaluation_span["attributes"]["eval_item_id"] + == evaluator_span["attributes"]["eval_item_id"] + ) diff --git a/tests/telemetry/__init__.py b/tests/telemetry/__init__.py new file mode 100644 index 000000000..e673b8aab --- /dev/null +++ b/tests/telemetry/__init__.py @@ -0,0 +1 @@ +"""Tests for telemetry tracking functionality.""" diff --git a/tests/telemetry/test_track.py b/tests/telemetry/test_track.py new file mode 100644 index 000000000..aca2afd4e --- /dev/null +++ b/tests/telemetry/test_track.py @@ -0,0 +1,482 @@ +"""Tests for telemetry tracking functionality.""" + +import os +from unittest.mock import MagicMock, patch + +from uipath.telemetry._track import ( + _AppInsightsEventClient, + _parse_connection_string, + _TelemetryClient, + flush_events, + is_telemetry_enabled, + track, + track_event, +) + + +class TestParseConnectionString: + """Test connection string parsing functionality.""" + + def test_parse_valid_connection_string(self): + """Test parsing a valid Application Insights connection string.""" + 
connection_string = ( + "InstrumentationKey=test-key-123;" + "IngestionEndpoint=https://example.com/;" + "LiveEndpoint=https://live.example.com/" + ) + + result = _parse_connection_string(connection_string) + + assert result == "test-key-123" + + def test_parse_connection_string_only_instrumentation_key(self): + """Test parsing connection string with only InstrumentationKey.""" + connection_string = "InstrumentationKey=simple-key" + + result = _parse_connection_string(connection_string) + + assert result == "simple-key" + + def test_parse_connection_string_missing_instrumentation_key(self): + """Test parsing connection string without InstrumentationKey.""" + connection_string = ( + "IngestionEndpoint=https://example.com/;" + "LiveEndpoint=https://live.example.com/" + ) + + result = _parse_connection_string(connection_string) + + assert result is None + + def test_parse_malformed_connection_string(self): + """Test parsing malformed connection string.""" + connection_string = "not-a-valid-connection-string" + + result = _parse_connection_string(connection_string) + + assert result is None + + def test_parse_empty_connection_string(self): + """Test parsing empty connection string.""" + result = _parse_connection_string("") + + assert result is None + + def test_parse_connection_string_with_special_chars_in_value(self): + """Test parsing connection string with special characters in value.""" + connection_string = "InstrumentationKey=key=with=equals;Other=value" + + result = _parse_connection_string(connection_string) + + assert result == "key=with=equals" + + +class TestAppInsightsEventClient: + """Test _AppInsightsEventClient functionality.""" + + def setup_method(self): + """Reset AppInsightsEventClient state before each test.""" + _AppInsightsEventClient._initialized = False + _AppInsightsEventClient._client = None + + def teardown_method(self): + """Clean up after each test.""" + _AppInsightsEventClient._initialized = False + _AppInsightsEventClient._client = None + + def test_initialize_no_connection_string(self): + """Test initialization when no connection string is provided.""" + with patch.dict(os.environ, {}, clear=True): + # Remove APPLICATIONINSIGHTS_CONNECTION_STRING if it exists + os.environ.pop("APPLICATIONINSIGHTS_CONNECTION_STRING", None) + + _AppInsightsEventClient._initialize() + + assert _AppInsightsEventClient._initialized is True + assert _AppInsightsEventClient._client is None + + @patch("uipath.telemetry._track._HAS_APPINSIGHTS", False) + def test_initialize_no_appinsights_package(self): + """Test initialization when applicationinsights package is not available.""" + _AppInsightsEventClient._initialize() + + assert _AppInsightsEventClient._initialized is True + assert _AppInsightsEventClient._client is None + + @patch("uipath.telemetry._track._HAS_APPINSIGHTS", True) + @patch("uipath.telemetry._track.AppInsightsTelemetryClient") + def test_initialize_creates_client(self, mock_client_class): + """Test that initialization creates Application Insights client.""" + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + with patch.dict( + os.environ, + { + "APPLICATIONINSIGHTS_CONNECTION_STRING": ( + "InstrumentationKey=test-key;IngestionEndpoint=https://example.com/" + ) + }, + ): + _AppInsightsEventClient._initialize() + + assert _AppInsightsEventClient._initialized is True + assert _AppInsightsEventClient._client is mock_client + mock_client_class.assert_called_once_with("test-key") + + @patch("uipath.telemetry._track._HAS_APPINSIGHTS", True) + 
@patch("uipath.telemetry._track.AppInsightsTelemetryClient") + def test_initialize_invalid_connection_string(self, mock_client_class): + """Test initialization with invalid connection string.""" + with patch.dict( + os.environ, + {"APPLICATIONINSIGHTS_CONNECTION_STRING": "invalid-connection-string"}, + ): + _AppInsightsEventClient._initialize() + + assert _AppInsightsEventClient._initialized is True + assert _AppInsightsEventClient._client is None + mock_client_class.assert_not_called() + + def test_initialize_only_once(self): + """Test that initialization only happens once.""" + _AppInsightsEventClient._initialized = True + _AppInsightsEventClient._client = "existing_client" + + _AppInsightsEventClient._initialize() + + # Should not change the client since already initialized + assert _AppInsightsEventClient._client == "existing_client" + + @patch("uipath.telemetry._track._HAS_APPINSIGHTS", True) + @patch("uipath.telemetry._track.AppInsightsTelemetryClient") + def test_track_event_calls_client(self, mock_client_class): + """Test that track_event calls the Application Insights client.""" + mock_client = MagicMock() + mock_client_class.return_value = mock_client + _AppInsightsEventClient._initialized = True + _AppInsightsEventClient._client = mock_client + + properties = {"key1": "value1", "key2": 123, "key3": None} + + _AppInsightsEventClient.track_event("test_event", properties) + + mock_client.track_event.assert_called_once_with( + name="test_event", + properties={ + "key1": "value1", + "key2": "123", + }, # None filtered, int converted + measurements={}, + ) + + def test_track_event_no_client(self): + """Test that track_event does nothing when client is not initialized.""" + _AppInsightsEventClient._initialized = True + _AppInsightsEventClient._client = None + + # Should not raise any exception + _AppInsightsEventClient.track_event("test_event", {"key": "value"}) + + def test_track_event_empty_properties(self): + """Test track_event with empty properties.""" + mock_client = MagicMock() + _AppInsightsEventClient._initialized = True + _AppInsightsEventClient._client = mock_client + + _AppInsightsEventClient.track_event("test_event", None) + + mock_client.track_event.assert_called_once_with( + name="test_event", + properties={}, + measurements={}, + ) + + def test_flush_calls_client(self): + """Test that flush calls the client's flush method.""" + mock_client = MagicMock() + _AppInsightsEventClient._client = mock_client + + _AppInsightsEventClient.flush() + + mock_client.flush.assert_called_once() + + def test_flush_no_client(self): + """Test that flush does nothing when client is not available.""" + _AppInsightsEventClient._client = None + + # Should not raise any exception + _AppInsightsEventClient.flush() + + +class TestTelemetryClient: + """Test _TelemetryClient functionality.""" + + def setup_method(self): + """Reset TelemetryClient state before each test.""" + _TelemetryClient._initialized = False + + def teardown_method(self): + """Clean up after each test.""" + _TelemetryClient._initialized = False + + def test_is_enabled_default_true(self): + """Test that telemetry is enabled by default.""" + with patch.dict(os.environ, {}, clear=True): + os.environ.pop("UIPATH_TELEMETRY_ENABLED", None) + + assert _TelemetryClient._is_enabled() is True + + def test_is_enabled_explicit_true(self): + """Test telemetry enabled when explicitly set to true.""" + with patch.dict(os.environ, {"UIPATH_TELEMETRY_ENABLED": "true"}): + assert _TelemetryClient._is_enabled() is True + + def 
test_is_enabled_explicit_false(self): + """Test telemetry disabled when set to false.""" + with patch.dict(os.environ, {"UIPATH_TELEMETRY_ENABLED": "false"}): + assert _TelemetryClient._is_enabled() is False + + def test_is_enabled_case_insensitive(self): + """Test that telemetry enabled check is case insensitive.""" + with patch.dict(os.environ, {"UIPATH_TELEMETRY_ENABLED": "TRUE"}): + assert _TelemetryClient._is_enabled() is True + + with patch.dict(os.environ, {"UIPATH_TELEMETRY_ENABLED": "False"}): + assert _TelemetryClient._is_enabled() is False + + @patch.object(_TelemetryClient, "_is_enabled", return_value=False) + def test_track_event_disabled(self, mock_is_enabled): + """Test that track_event does nothing when telemetry is disabled.""" + with patch.object(_AppInsightsEventClient, "track_event") as mock_track: + _TelemetryClient.track_event("test_event", {"key": "value"}) + + mock_track.assert_not_called() + + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + @patch.object(_AppInsightsEventClient, "track_event") + def test_track_event_enabled(self, mock_track, mock_is_enabled): + """Test that track_event calls AppInsightsEventClient when enabled.""" + properties = {"key": "value"} + + _TelemetryClient.track_event("test_event", properties) + + mock_track.assert_called_once_with("test_event", properties) + + +class TestPublicFunctions: + """Test public telemetry functions.""" + + def setup_method(self): + """Reset state before each test.""" + _TelemetryClient._initialized = False + _AppInsightsEventClient._initialized = False + _AppInsightsEventClient._client = None + + @patch.object(_TelemetryClient, "track_event") + def test_track_event_function(self, mock_track): + """Test the global track_event function.""" + properties = {"key": "value"} + + track_event("test_event", properties) + + mock_track.assert_called_once_with("test_event", properties) + + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_is_telemetry_enabled_true(self, mock_is_enabled): + """Test is_telemetry_enabled returns True when enabled.""" + assert is_telemetry_enabled() is True + + @patch.object(_TelemetryClient, "_is_enabled", return_value=False) + def test_is_telemetry_enabled_false(self, mock_is_enabled): + """Test is_telemetry_enabled returns False when disabled.""" + assert is_telemetry_enabled() is False + + @patch.object(_AppInsightsEventClient, "flush") + def test_flush_events_function(self, mock_flush): + """Test the global flush_events function.""" + flush_events() + + mock_flush.assert_called_once() + + +class TestTrackDecorator: + """Test the @track decorator functionality.""" + + def setup_method(self): + """Reset state before each test.""" + _TelemetryClient._initialized = False + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_with_name(self, mock_is_enabled, mock_track_method): + """Test @track decorator with explicit name.""" + + @track("custom_name") + def my_function(): + return "result" + + result = my_function() + + assert result == "result" + mock_track_method.assert_called_once_with("custom_name", None) + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_without_name(self, mock_is_enabled, mock_track_method): + """Test @track decorator without name uses function name.""" + + @track + def my_function(): + return "result" + + result = my_function() + + 
assert result == "result" + mock_track_method.assert_called_once_with("my_function", None) + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_with_extra(self, mock_is_enabled, mock_track_method): + """Test @track decorator with extra attributes.""" + extra = {"attr1": "value1"} + + @track("event_name", extra=extra) + def my_function(): + return "result" + + result = my_function() + + assert result == "result" + mock_track_method.assert_called_once_with("event_name", extra) + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_when_condition_true( + self, mock_is_enabled, mock_track_method + ): + """Test @track decorator with when condition that returns True.""" + + @track("event_name", when=True) + def my_function(): + return "result" + + result = my_function() + + assert result == "result" + mock_track_method.assert_called_once() + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_when_condition_false( + self, mock_is_enabled, mock_track_method + ): + """Test @track decorator with when condition that returns False.""" + + @track("event_name", when=False) + def my_function(): + return "result" + + result = my_function() + + assert result == "result" + mock_track_method.assert_not_called() + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_when_callable(self, mock_is_enabled, mock_track_method): + """Test @track decorator with callable when condition.""" + + @track("event_name", when=lambda x: x > 5) + def my_function(x): + return x * 2 + + # Should track when x > 5 + result = my_function(10) + assert result == 20 + mock_track_method.assert_called_once() + + mock_track_method.reset_mock() + + # Should not track when x <= 5 + result = my_function(3) + assert result == 6 + mock_track_method.assert_not_called() + + @patch.object(_TelemetryClient, "_is_enabled", return_value=False) + @patch.object(_TelemetryClient, "_initialize") + def test_track_decorator_telemetry_disabled(self, mock_initialize, mock_is_enabled): + """Test @track decorator doesn't initialize when telemetry is disabled. + + The decorator still calls _track_method, but _track_method should + short-circuit and not initialize when telemetry is disabled. + """ + + @track("event_name") + def my_function(): + return "result" + + result = my_function() + + assert result == "result" + # _initialize should not be called when telemetry is disabled + mock_initialize.assert_not_called() + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_preserves_function_metadata( + self, mock_is_enabled, mock_track_method + ): + """Test that @track decorator preserves function metadata.""" + + @track("event_name") + def my_function_with_doc(): + """This is a docstring.""" + return "result" + + assert my_function_with_doc.__name__ == "my_function_with_doc" + assert my_function_with_doc.__doc__ == "This is a docstring." 
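+
+    # Illustrative usage sketch (comments only, not executed): the decorator
+    # accepts an optional event name, optional ``extra`` properties, and an
+    # optional ``when`` condition (a bool or a callable over the call
+    # arguments), as exercised by the tests above. The event name and extra
+    # keys below are hypothetical examples, not values used elsewhere.
+    #
+    #     @track("MyFeature.Invoked", extra={"Channel": "cli"})
+    #     def run_feature():
+    #         ...
+    #
+    #     @track("MyFeature.LargeInput", when=lambda x: x > 5)
+    #     def process(x):
+    #         return x * 2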
+ + +class TestTelemetryExceptionHandling: + """Test that telemetry never breaks the main application.""" + + def setup_method(self): + """Reset state before each test.""" + _AppInsightsEventClient._initialized = False + _AppInsightsEventClient._client = None + + def test_track_event_handles_client_exception(self): + """Test that track_event handles exceptions from the client.""" + mock_client = MagicMock() + mock_client.track_event.side_effect = Exception("Client error") + _AppInsightsEventClient._initialized = True + _AppInsightsEventClient._client = mock_client + + # Should not raise exception + _AppInsightsEventClient.track_event("test_event", {"key": "value"}) + + def test_flush_handles_exception(self): + """Test that flush handles exceptions from the client.""" + mock_client = MagicMock() + mock_client.flush.side_effect = Exception("Flush error") + _AppInsightsEventClient._client = mock_client + + # Should not raise exception + _AppInsightsEventClient.flush() + + @patch("uipath.telemetry._track._HAS_APPINSIGHTS", True) + @patch("uipath.telemetry._track.AppInsightsTelemetryClient") + def test_initialize_handles_exception(self, mock_client_class): + """Test that initialization handles exceptions.""" + mock_client_class.side_effect = Exception("Init error") + + with patch.dict( + os.environ, + {"APPLICATIONINSIGHTS_CONNECTION_STRING": "InstrumentationKey=test-key"}, + ): + # Should not raise exception + _AppInsightsEventClient._initialize() + + assert _AppInsightsEventClient._initialized is True + assert _AppInsightsEventClient._client is None From e136c254732d9e9cbc05f3f26bc25ada026bf9fb Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 16:43:05 -0800 Subject: [PATCH 05/12] fix: linting errors and integration tests --- testcases/eval-spans-testcase/pyproject.toml | 11 + testcases/eval-spans-testcase/run.sh | 18 ++ testcases/eval-spans-testcase/src/assert.py | 252 ++++++++++++++++++ testcases/eval-spans-testcase/uipath.json | 5 + tests/cli/eval/test_eval_runtime_spans.py | 8 +- tests/cli/eval/test_eval_telemetry.py | 13 +- .../cli/eval/test_eval_tracing_integration.py | 8 +- 7 files changed, 303 insertions(+), 12 deletions(-) create mode 100644 testcases/eval-spans-testcase/pyproject.toml create mode 100755 testcases/eval-spans-testcase/run.sh create mode 100644 testcases/eval-spans-testcase/src/assert.py create mode 100644 testcases/eval-spans-testcase/uipath.json diff --git a/testcases/eval-spans-testcase/pyproject.toml b/testcases/eval-spans-testcase/pyproject.toml new file mode 100644 index 000000000..41b4430c7 --- /dev/null +++ b/testcases/eval-spans-testcase/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "eval-spans-testcase" +version = "0.1.0" +description = "E2E test for verifying eval spans (Evaluation Set Run, Evaluation, Evaluator)" +requires-python = ">=3.11" +dependencies = [ + "uipath", +] + +[tool.uv.sources] +uipath = { path = "../../", editable = true } diff --git a/testcases/eval-spans-testcase/run.sh b/testcases/eval-spans-testcase/run.sh new file mode 100755 index 000000000..80be32da9 --- /dev/null +++ b/testcases/eval-spans-testcase/run.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +echo "=== E2E Test: Eval Spans Verification ===" + +echo "Syncing dependencies..." +uv sync + +echo "Authenticating with UiPath..." +uv run uipath auth --client-id="$CLIENT_ID" --client-secret="$CLIENT_SECRET" --base-url="$BASE_URL" + +echo "Running evaluations with trace capture..." 
+# Run eval with trace file to capture spans +uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/default.json \ + --no-report \ + --trace-file __uipath/traces.jsonl + +echo "Test completed successfully!" diff --git a/testcases/eval-spans-testcase/src/assert.py b/testcases/eval-spans-testcase/src/assert.py new file mode 100644 index 000000000..670452fc1 --- /dev/null +++ b/testcases/eval-spans-testcase/src/assert.py @@ -0,0 +1,252 @@ +"""E2E assertions for eval spans testcase. + +This script validates that the new eval spans are created correctly: +1. "Evaluation Set Run" span with span_type: "eval_set_run" +2. "Evaluation" spans with span_type: "evaluation" +3. "Evaluator: {name}" spans with span_type: "evaluator" +""" + +import json +import os +import sys +from typing import Any + + +def load_traces(traces_file: str) -> list[dict[str, Any]]: + """Load traces from a JSONL file.""" + traces = [] + with open(traces_file, "r", encoding="utf-8") as f: + for line in f: + if line.strip(): + traces.append(json.loads(line)) + return traces + + +def get_attributes(span: dict[str, Any]) -> dict[str, Any]: + """Get attributes from a span.""" + return span.get("attributes", {}) + + +def find_spans_by_type( + traces: list[dict[str, Any]], span_type: str +) -> list[dict[str, Any]]: + """Find all spans with the given span_type attribute.""" + return [ + trace for trace in traces if get_attributes(trace).get("span_type") == span_type + ] + + +def find_spans_by_name(traces: list[dict[str, Any]], name: str) -> list[dict[str, Any]]: + """Find all spans with the given name.""" + return [trace for trace in traces if trace.get("name") == name] + + +def find_spans_by_name_prefix( + traces: list[dict[str, Any]], prefix: str +) -> list[dict[str, Any]]: + """Find all spans whose name starts with the given prefix.""" + return [trace for trace in traces if trace.get("name", "").startswith(prefix)] + + +def assert_eval_set_run_span(traces: list[dict[str, Any]]) -> None: + """Assert that the Evaluation Set Run span exists with correct attributes.""" + print("\n--- Checking 'Evaluation Set Run' span ---") + + # Find by span_type + eval_set_run_spans = find_spans_by_type(traces, "eval_set_run") + + assert len(eval_set_run_spans) >= 1, ( + "Expected at least 1 'eval_set_run' span, found 0. 
" + "Spans with span_type attribute: " + f"{[get_attributes(t).get('span_type') for t in traces if get_attributes(t).get('span_type')]}" + ) + + print(f" Found {len(eval_set_run_spans)} eval_set_run span(s)") + + for span in eval_set_run_spans: + name = span.get("name") + attrs = get_attributes(span) + + # Check span name + assert name == "Evaluation Set Run", ( + f"Expected span name 'Evaluation Set Run', got '{name}'" + ) + print(f" Name: {name}") + + # Check span_type attribute + assert attrs.get("span_type") == "eval_set_run", ( + f"Expected span_type 'eval_set_run', got '{attrs.get('span_type')}'" + ) + print(f" span_type: {attrs.get('span_type')}") + + # Check eval_set_run_id is present (may be execution_id fallback) + if "eval_set_run_id" in attrs: + print(f" eval_set_run_id: {attrs.get('eval_set_run_id')}") + + print("Evaluation Set Run span assertion passed") + + +def assert_evaluation_spans(traces: list[dict[str, Any]]) -> None: + """Assert that Evaluation spans exist with correct attributes.""" + print("\n--- Checking 'Evaluation' spans ---") + + # Find by span_type + evaluation_spans = find_spans_by_type(traces, "evaluation") + + assert len(evaluation_spans) >= 1, "Expected at least 1 'evaluation' span, found 0" + + print(f" Found {len(evaluation_spans)} evaluation span(s)") + + for i, span in enumerate(evaluation_spans): + name = span.get("name") + attrs = get_attributes(span) + + print(f"\n Evaluation span {i + 1}:") + + # Check span name + assert name == "Evaluation", f"Expected span name 'Evaluation', got '{name}'" + print(f" Name: {name}") + + # Check span_type attribute + assert attrs.get("span_type") == "evaluation", ( + f"Expected span_type 'evaluation', got '{attrs.get('span_type')}'" + ) + print(f" span_type: {attrs.get('span_type')}") + + # Check required attributes + assert "execution.id" in attrs, ( + "Expected 'execution.id' attribute in Evaluation span" + ) + print(f" execution.id: {attrs.get('execution.id')}") + + assert "eval_item_id" in attrs, ( + "Expected 'eval_item_id' attribute in Evaluation span" + ) + print(f" eval_item_id: {attrs.get('eval_item_id')}") + + assert "eval_item_name" in attrs, ( + "Expected 'eval_item_name' attribute in Evaluation span" + ) + print(f" eval_item_name: {attrs.get('eval_item_name')}") + + print("\nEvaluation spans assertion passed") + + +def assert_evaluator_spans(traces: list[dict[str, Any]]) -> None: + """Assert that Evaluator spans exist with correct attributes.""" + print("\n--- Checking 'Evaluator' spans ---") + + # Find by span_type + evaluator_spans = find_spans_by_type(traces, "evaluator") + + assert len(evaluator_spans) >= 1, "Expected at least 1 'evaluator' span, found 0" + + print(f" Found {len(evaluator_spans)} evaluator span(s)") + + for i, span in enumerate(evaluator_spans): + name = span.get("name") + attrs = get_attributes(span) + + print(f"\n Evaluator span {i + 1}:") + + # Check span name starts with "Evaluator: " + assert name and name.startswith("Evaluator: "), ( + f"Expected span name to start with 'Evaluator: ', got '{name}'" + ) + print(f" Name: {name}") + + # Check span_type attribute + assert attrs.get("span_type") == "evaluator", ( + f"Expected span_type 'evaluator', got '{attrs.get('span_type')}'" + ) + print(f" span_type: {attrs.get('span_type')}") + + # Check required attributes + assert "evaluator_id" in attrs, ( + "Expected 'evaluator_id' attribute in Evaluator span" + ) + print(f" evaluator_id: {attrs.get('evaluator_id')}") + + assert "evaluator_name" in attrs, ( + "Expected 'evaluator_name' 
attribute in Evaluator span" + ) + print(f" evaluator_name: {attrs.get('evaluator_name')}") + + assert "eval_item_id" in attrs, ( + "Expected 'eval_item_id' attribute in Evaluator span" + ) + print(f" eval_item_id: {attrs.get('eval_item_id')}") + + print("\nEvaluator spans assertion passed") + + +def assert_span_hierarchy(traces: list[dict[str, Any]]) -> None: + """Assert the span hierarchy is correct.""" + print("\n--- Checking span hierarchy ---") + + # Build span lookup by span_id + span_by_id: dict[str, dict[str, Any]] = {} + for trace in traces: + context = trace.get("context", {}) + span_id = context.get("span_id") + if span_id: + span_by_id[span_id] = trace + + # Get spans by type + eval_set_run_spans = find_spans_by_type(traces, "eval_set_run") + evaluation_spans = find_spans_by_type(traces, "evaluation") + evaluator_spans = find_spans_by_type(traces, "evaluator") + + # Get eval_set_run span_id + if eval_set_run_spans: + eval_set_run_span_id = eval_set_run_spans[0].get("context", {}).get("span_id") + print(f" EvalSetRun span_id: {eval_set_run_span_id}") + + # Check Evaluation spans are children of EvalSetRun (through parent chain) + # Note: In practice, there may be intermediate spans, so we just verify + # the relationship exists through the trace + print(f" Found {len(evaluation_spans)} Evaluation spans") + print(f" Found {len(evaluator_spans)} Evaluator spans") + + print("\nSpan hierarchy check passed") + + +def main() -> None: + """Main assertion logic.""" + traces_file = "__uipath/traces.jsonl" + + # Check if traces file exists + if not os.path.isfile(traces_file): + print(f"Traces file '{traces_file}' not found") + sys.exit(1) + + print(f"Loading traces from {traces_file}...") + traces = load_traces(traces_file) + print(f"Loaded {len(traces)} trace spans") + + # Print all span names and types for debugging + print("\n--- All spans ---") + for i, trace in enumerate(traces): + name = trace.get("name", "Unknown") + attrs = get_attributes(trace) + span_type = attrs.get("span_type", "N/A") + print(f" {i + 1}. 
{name} (span_type: {span_type})") + + # Run assertions + try: + assert_eval_set_run_span(traces) + assert_evaluation_spans(traces) + assert_evaluator_spans(traces) + assert_span_hierarchy(traces) + + print("\n" + "=" * 60) + print("All eval span assertions passed!") + print("=" * 60) + + except AssertionError as e: + print(f"\nAssertion failed: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/testcases/eval-spans-testcase/uipath.json b/testcases/eval-spans-testcase/uipath.json new file mode 100644 index 000000000..2b8e5b396 --- /dev/null +++ b/testcases/eval-spans-testcase/uipath.json @@ -0,0 +1,5 @@ +{ + "functions": { + "main": "../../samples/calculator/main.py:main" + } +} diff --git a/tests/cli/eval/test_eval_runtime_spans.py b/tests/cli/eval/test_eval_runtime_spans.py index 58e5da5e9..c0a7d74a0 100644 --- a/tests/cli/eval/test_eval_runtime_spans.py +++ b/tests/cli/eval/test_eval_runtime_spans.py @@ -21,7 +21,7 @@ class MockSpanContext: """Mock span context manager for testing span creation.""" - def __init__(self, name: str, attributes: Dict[str, Any]): + def __init__(self, name: str, attributes: dict[str, Any] | None): self.name = name self.attributes = attributes or {} self.span = MagicMock(spec=Span) @@ -40,7 +40,9 @@ class SpanCapturingTracer: def __init__(self): self.created_spans: List[Dict[str, Any]] = [] - def start_as_current_span(self, name: str, attributes: Dict[str, Any] = None): + def start_as_current_span( + self, name: str, attributes: dict[str, Any] | None = None + ): """Capture span creation and return a mock context manager.""" span_info = {"name": name, "attributes": attributes or {}} self.created_spans.append(span_info) @@ -415,7 +417,7 @@ def test_span_type_values_match_expected(self): "Evaluator": "evaluator", } - for span_name, span_type in expected_span_types.items(): + for _, span_type in expected_span_types.items(): assert isinstance(span_type, str) assert span_type.islower() or "_" in span_type diff --git a/tests/cli/eval/test_eval_telemetry.py b/tests/cli/eval/test_eval_telemetry.py index 63f8f913f..06c48b011 100644 --- a/tests/cli/eval/test_eval_telemetry.py +++ b/tests/cli/eval/test_eval_telemetry.py @@ -1,6 +1,7 @@ """Tests for EvalTelemetrySubscriber functionality.""" import os +from typing import Any from unittest.mock import patch import pytest @@ -93,7 +94,7 @@ def _create_eval_set_run_created_event( eval_set_run_id: str | None = "run-456", entrypoint: str = "agent.py", no_of_evals: int = 5, - evaluators: list = None, + evaluators: list[Any] | None = None, ) -> EvalSetRunCreatedEvent: """Helper to create EvalSetRunCreatedEvent.""" return EvalSetRunCreatedEvent( @@ -212,7 +213,7 @@ def _create_eval_run_updated_event( eval_item_name: str = "Test Eval", success: bool = True, agent_execution_time: float = 1.5, - eval_results: list = None, + eval_results: list[Any] | None = None, exception_details: EvalItemExceptionDetails | None = None, ) -> EvalRunUpdatedEvent: """Helper to create EvalRunUpdatedEvent.""" @@ -327,7 +328,7 @@ class TestEvalSetRunUpdated: def _create_eval_set_run_updated_event( self, execution_id: str = "exec-123", - evaluator_scores: dict = None, + evaluator_scores: dict[str, Any] | None = None, success: bool = True, ) -> EvalSetRunUpdatedEvent: """Helper to create EvalSetRunUpdatedEvent.""" @@ -409,7 +410,7 @@ class TestEnrichProperties: def test_enrich_properties_adds_source(self): """Test that source and application name are always added.""" subscriber = EvalTelemetrySubscriber() - properties = {} + properties: 
dict[str, Any] = {} subscriber._enrich_properties(properties) @@ -419,7 +420,7 @@ def test_enrich_properties_adds_source(self): def test_enrich_properties_adds_env_vars(self): """Test that environment variables are added when present.""" subscriber = EvalTelemetrySubscriber() - properties = {} + properties: dict[str, Any] = {} with patch.dict( os.environ, @@ -440,7 +441,7 @@ def test_enrich_properties_adds_env_vars(self): def test_enrich_properties_skips_missing_env_vars(self): """Test that missing environment variables are not added.""" subscriber = EvalTelemetrySubscriber() - properties = {} + properties: dict[str, Any] = {} with patch.dict(os.environ, {}, clear=True): # Remove env vars if they exist diff --git a/tests/cli/eval/test_eval_tracing_integration.py b/tests/cli/eval/test_eval_tracing_integration.py index 4d4556771..644551c5b 100644 --- a/tests/cli/eval/test_eval_tracing_integration.py +++ b/tests/cli/eval/test_eval_tracing_integration.py @@ -4,13 +4,13 @@ """ import uuid -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional class MockSpan: """Mock span that captures attributes for testing.""" - def __init__(self, name: str, attributes: Dict[str, Any] = None): + def __init__(self, name: str, attributes: Optional[Dict[str, Any]] = None): self.name = name self.attributes = attributes or {} self._status = None @@ -26,7 +26,9 @@ def __init__(self): self.spans: List[Dict[str, Any]] = [] self._span_stack: List[MockSpan] = [] - def start_as_current_span(self, name: str, attributes: Dict[str, Any] = None): + def start_as_current_span( + self, name: str, attributes: Optional[Dict[str, Any]] = None + ): """Mock tracer method that records span creation.""" span_info = { "name": name, From f26b32e71237ef94592e8c64f6fb8e683f0870ad Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 16:52:21 -0800 Subject: [PATCH 06/12] feat: add trace-file option --- src/uipath/_cli/cli_eval.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index 8c9f9870e..570832b47 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -21,7 +21,7 @@ from uipath.eval._helpers import auto_discover_entrypoint from uipath.platform.common import UiPathConfig from uipath.telemetry._track import flush_events -from uipath.tracing import LlmOpsHttpExporter +from uipath.tracing import JsonLinesFileExporter, LlmOpsHttpExporter from ._utils._console import ConsoleLogger from ._utils._eval_set import EvalHelpers @@ -100,6 +100,12 @@ def setup_reporting_prereq(no_report: bool) -> bool: default="default", help="Model settings ID from evaluation set to override agent settings (default: 'default')", ) +@click.option( + "--trace-file", + required=False, + type=click.Path(exists=False), + help="File path where traces will be written in JSONL format", +) def eval( entrypoint: str | None, eval_set: str | None, @@ -111,6 +117,7 @@ def eval( enable_mocker_cache: bool, report_coverage: bool, model_settings_id: str, + trace_file: str | None, ) -> None: """Run an evaluation set against the agent. 
@@ -185,6 +192,11 @@ async def execute_eval(): if ctx.job_id: trace_manager.add_span_exporter(LlmOpsHttpExporter()) + if trace_file: + trace_manager.add_span_exporter( + JsonLinesFileExporter(trace_file) + ) + project_id = UiPathConfig.project_id runtime_factory = UiPathRuntimeFactoryRegistry.get(context=ctx) From 348a1285b0a881241c6f01a6c4a7f51c95f2a61d Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 17:06:06 -0800 Subject: [PATCH 07/12] feat: add integration tests for telemetry --- .github/workflows/integration_tests.yml | 5 + .../eval-telemetry-testcase/pyproject.toml | 12 ++ testcases/eval-telemetry-testcase/run.sh | 40 ++++ .../eval-telemetry-testcase/src/assert.py | 194 ++++++++++++++++++ testcases/eval-telemetry-testcase/uipath.json | 5 + 5 files changed, 256 insertions(+) create mode 100644 testcases/eval-telemetry-testcase/pyproject.toml create mode 100755 testcases/eval-telemetry-testcase/run.sh create mode 100644 testcases/eval-telemetry-testcase/src/assert.py create mode 100644 testcases/eval-telemetry-testcase/uipath.json diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 5ef766f90..448435f91 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -62,6 +62,11 @@ jobs: BASE_URL: ${{ matrix.environment == 'alpha' && secrets.ALPHA_BASE_URL || matrix.environment == 'staging' && secrets.STAGING_BASE_URL || matrix.environment == 'cloud' && secrets.CLOUD_BASE_URL }} USE_AZURE_CHAT: ${{ matrix.use_azure_chat }} + + # App Insights for telemetry testing + APPLICATIONINSIGHTS_CONNECTION_STRING: ${{ secrets.APPLICATIONINSIGHTS_CONNECTION_STRING }} + APP_INSIGHTS_APP_ID: ${{ secrets.APP_INSIGHTS_APP_ID }} + APP_INSIGHTS_API_KEY: ${{ secrets.APP_INSIGHTS_API_KEY }} working-directory: testcases/${{ matrix.testcase }} run: | # If any errors occur execution will stop with exit code diff --git a/testcases/eval-telemetry-testcase/pyproject.toml b/testcases/eval-telemetry-testcase/pyproject.toml new file mode 100644 index 000000000..e9c2e52d9 --- /dev/null +++ b/testcases/eval-telemetry-testcase/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "eval-telemetry-testcase" +version = "0.1.0" +description = "E2E test for verifying eval telemetry events in Application Insights" +requires-python = ">=3.11" +dependencies = [ + "uipath", + "httpx", +] + +[tool.uv.sources] +uipath = { path = "../../", editable = true } diff --git a/testcases/eval-telemetry-testcase/run.sh b/testcases/eval-telemetry-testcase/run.sh new file mode 100755 index 000000000..e28fb04f5 --- /dev/null +++ b/testcases/eval-telemetry-testcase/run.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -e + +echo "=== E2E Test: Eval Telemetry Integration ===" + +# Validate required environment variables +if [ -z "$APPLICATIONINSIGHTS_CONNECTION_STRING" ]; then + echo "Warning: APPLICATIONINSIGHTS_CONNECTION_STRING not set, telemetry won't be sent" +fi +if [ -z "$APP_INSIGHTS_APP_ID" ]; then + echo "Warning: APP_INSIGHTS_APP_ID not set, skipping telemetry verification" +fi +if [ -z "$APP_INSIGHTS_API_KEY" ]; then + echo "Warning: APP_INSIGHTS_API_KEY not set, skipping telemetry verification" +fi + +echo "Syncing dependencies..." +uv sync + +echo "Authenticating with UiPath..." 
+uv run uipath auth --client-id="$CLIENT_ID" --client-secret="$CLIENT_SECRET" --base-url="$BASE_URL" + +# Generate a unique run ID to identify this test run's telemetry events +export EVAL_TEST_RUN_ID="e2e-test-$(date +%s)-$$" +echo "Test Run ID: $EVAL_TEST_RUN_ID" + +echo "Running evaluations with telemetry enabled..." +# Run eval with telemetry explicitly enabled and App Insights connection string +UIPATH_TELEMETRY_ENABLED=true uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/default.json \ + --no-report \ + --output-file __uipath/output.json \ + --eval-set-run-id "$EVAL_TEST_RUN_ID" + +# Wait for telemetry to be ingested into App Insights +if [ -n "$APP_INSIGHTS_APP_ID" ] && [ -n "$APP_INSIGHTS_API_KEY" ]; then + echo "Waiting for telemetry to be ingested (30 seconds)..." + sleep 30 +fi + +echo "Test completed successfully!" diff --git a/testcases/eval-telemetry-testcase/src/assert.py b/testcases/eval-telemetry-testcase/src/assert.py new file mode 100644 index 000000000..bdd512eb1 --- /dev/null +++ b/testcases/eval-telemetry-testcase/src/assert.py @@ -0,0 +1,194 @@ +"""E2E assertions for eval telemetry testcase. + +This script validates that telemetry events are sent to Application Insights by: +1. Verifying eval completed successfully +2. Querying App Insights API to check for expected telemetry events +3. Validating event properties match expected values +""" + +import json +import os +import sys +import time +from typing import Any + +import httpx + +# Expected telemetry event names +EXPECTED_EVENTS = [ + "EvalSetRun.Start.URT", + "EvalSetRun.End.URT", + "EvalRun.Start.URT", + "EvalRun.End.URT", +] + + +def load_output(output_file: str) -> dict[str, Any]: + """Load output from a JSON file.""" + with open(output_file, "r", encoding="utf-8") as f: + return json.load(f) + + +def query_app_insights( + app_id: str, api_key: str, query: str, max_retries: int = 3 +) -> dict[str, Any]: + """Query Application Insights using the REST API. + + Args: + app_id: Application Insights App ID + api_key: Application Insights API Key + query: Kusto query to execute + max_retries: Number of retries on failure + + Returns: + Query results as dictionary + """ + url = f"https://api.applicationinsights.io/v1/apps/{app_id}/query" + headers = {"x-api-key": api_key, "Content-Type": "application/json"} + payload = {"query": query} + + for attempt in range(max_retries): + try: + response = httpx.post(url, headers=headers, json=payload, timeout=30) + response.raise_for_status() + return response.json() + except Exception as e: + if attempt < max_retries - 1: + print(f" Retry {attempt + 1}/{max_retries} after error: {e}") + time.sleep(5) + else: + raise + + +def verify_telemetry_events(app_id: str, api_key: str, eval_set_run_id: str) -> bool: + """Verify telemetry events were sent to Application Insights. 
+ + Args: + app_id: Application Insights App ID + api_key: Application Insights API Key + eval_set_run_id: The eval set run ID to search for + + Returns: + True if all expected events were found + """ + print("\n--- Querying App Insights for events ---") + print(f" EvalSetRunId: {eval_set_run_id}") + + # Query for events with the specific EvalSetRunId + query = f""" + customEvents + | where timestamp > ago(10m) + | where customDimensions.EvalSetRunId == "{eval_set_run_id}" + or customDimensions["EvalSetRunId"] == "{eval_set_run_id}" + | project name, timestamp, customDimensions + | order by timestamp asc + """ + + try: + result = query_app_insights(app_id, api_key, query) + except Exception as e: + print(f" Error querying App Insights: {e}") + return False + + # Parse results + tables = result.get("tables", []) + if not tables: + print(" No tables returned from query") + return False + + rows = tables[0].get("rows", []) + columns = [col["name"] for col in tables[0].get("columns", [])] + + print(f" Found {len(rows)} events") + + # Extract event names + found_events: list[str] = [] + name_idx = columns.index("name") if "name" in columns else 0 + + for row in rows: + event_name = row[name_idx] + found_events.append(event_name) + print(f" - {event_name}") + + # Check for expected events + print("\n--- Verifying expected events ---") + all_found = True + for expected in EXPECTED_EVENTS: + if expected in found_events: + print(f" [OK] {expected}") + else: + print(f" [MISSING] {expected}") + all_found = False + + return all_found + + +def verify_output(output_file: str) -> bool: + """Verify the eval output file.""" + print("\n--- Verifying eval output ---") + + if not os.path.isfile(output_file): + print(f" Output file '{output_file}' not found") + return False + + output_data = load_output(output_file) + status = output_data.get("status") + + if status != "successful": + print(f" Eval failed with status: {status}") + return False + + print(f" Status: {status}") + + output = output_data.get("output", {}) + evaluation_results = output.get("evaluationSetResults", []) + print(f" Evaluation results: {len(evaluation_results)}") + + return True + + +def main() -> None: + """Main assertion logic.""" + output_file = "__uipath/output.json" + + # Get environment variables + app_id = os.environ.get("APP_INSIGHTS_APP_ID") + api_key = os.environ.get("APP_INSIGHTS_API_KEY") + eval_set_run_id = os.environ.get("EVAL_TEST_RUN_ID") + + # Verify eval output first + if not verify_output(output_file): + print("\nEval output verification failed") + sys.exit(1) + + # Check if App Insights verification is possible + if not app_id or not api_key: + print("\n--- Skipping App Insights verification ---") + print(" APP_INSIGHTS_APP_ID or APP_INSIGHTS_API_KEY not set") + print(" Telemetry verification skipped (eval completed successfully)") + print("\nAll assertions passed! (telemetry verification skipped)") + return + + if not eval_set_run_id: + print("\n--- Skipping App Insights verification ---") + print(" EVAL_TEST_RUN_ID not set") + print("\nAll assertions passed! 
(telemetry verification skipped)") + return + + # Verify telemetry events in App Insights + if not verify_telemetry_events(app_id, api_key, eval_set_run_id): + print("\n" + "=" * 60) + print("Telemetry verification FAILED") + print("Expected events not found in App Insights") + print("=" * 60) + sys.exit(1) + + print("\n" + "=" * 60) + print("All assertions passed!") + print(" - Eval completed successfully") + print(" - Telemetry events verified in App Insights") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/testcases/eval-telemetry-testcase/uipath.json b/testcases/eval-telemetry-testcase/uipath.json new file mode 100644 index 000000000..2b8e5b396 --- /dev/null +++ b/testcases/eval-telemetry-testcase/uipath.json @@ -0,0 +1,5 @@ +{ + "functions": { + "main": "../../samples/calculator/main.py:main" + } +} From fdcaed4762cb596d6393a0ea21cc9728e32b49e3 Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 17:09:47 -0800 Subject: [PATCH 08/12] fix: failing telemetry integration test --- testcases/common/validate_output.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testcases/common/validate_output.sh b/testcases/common/validate_output.sh index 83f587f99..1e4cd1a59 100644 --- a/testcases/common/validate_output.sh +++ b/testcases/common/validate_output.sh @@ -26,9 +26,9 @@ debug_print_uipath_output() { run_assertions() { echo "Running assertions..." if [ -f "src/assert.py" ]; then - # Use the Python from the virtual environment + # Use uv run to ensure testcase dependencies are available # Prepend the common directory to the python path so it can be resolved - PYTHONPATH="../common:$PYTHONPATH" python src/assert.py + PYTHONPATH="../common:$PYTHONPATH" uv run python src/assert.py else echo "assert.py not found in src directory!" exit 1 From 8a54b506bcf899e4797a0263a65c710a408abcfd Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 17:14:13 -0800 Subject: [PATCH 09/12] fix: failing telemetry integration test --- .../eval-telemetry-testcase/src/assert.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/testcases/eval-telemetry-testcase/src/assert.py b/testcases/eval-telemetry-testcase/src/assert.py index bdd512eb1..b9077ba8e 100644 --- a/testcases/eval-telemetry-testcase/src/assert.py +++ b/testcases/eval-telemetry-testcase/src/assert.py @@ -132,18 +132,33 @@ def verify_output(output_file: str) -> bool: return False output_data = load_output(output_file) - status = output_data.get("status") - if status != "successful": - print(f" Eval failed with status: {status}") - return False - - print(f" Status: {status}") + # The eval output can have two formats: + # 1. Direct results: {"evaluationSetName": "...", "evaluationSetResults": [...]} + # 2. 
Wrapped results: {"status": "successful", "output": {...}} + if "status" in output_data: + status = output_data.get("status") + if status != "successful": + print(f" Eval failed with status: {status}") + return False + print(f" Status: {status}") + output = output_data.get("output", {}) + evaluation_results = output.get("evaluationSetResults", []) + else: + # Direct format - check for evaluationSetResults + evaluation_results = output_data.get("evaluationSetResults", []) + if not evaluation_results: + print(" No evaluationSetResults found in output") + return False + print(" Status: completed (direct output format)") - output = output_data.get("output", {}) - evaluation_results = output.get("evaluationSetResults", []) print(f" Evaluation results: {len(evaluation_results)}") + # Verify we have results with scores + if len(evaluation_results) == 0: + print(" No evaluation results found") + return False + return True From 36cef7d21aeab243d4d2a0f923a8e3a9175ea276 Mon Sep 17 00:00:00 2001 From: Anipik Date: Thu, 8 Jan 2026 09:36:38 -0800 Subject: [PATCH 10/12] fix: update the version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 85f0f2f66..4b77acf3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.4.4" +version = "2.4.5" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" From 59289366ca51f81622b80a739ac175a2d7d0e7da Mon Sep 17 00:00:00 2001 From: Anipik Date: Thu, 8 Jan 2026 09:41:10 -0800 Subject: [PATCH 11/12] fix: linting package mismatch error --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index 8c90eda7b..9a7a728ce 100644 --- a/uv.lock +++ b/uv.lock @@ -2486,7 +2486,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.4.4" +version = "2.4.5" source = { editable = "." } dependencies = [ { name = "applicationinsights" }, From 9520ec5c4c372a561bd33054559d7f523edf4737 Mon Sep 17 00:00:00 2001 From: Anipik Date: Thu, 8 Jan 2026 10:19:00 -0800 Subject: [PATCH 12/12] fix: tracing integration tests --- .../cli/eval/test_eval_tracing_integration.py | 902 ++++++++++-------- 1 file changed, 478 insertions(+), 424 deletions(-) diff --git a/tests/cli/eval/test_eval_tracing_integration.py b/tests/cli/eval/test_eval_tracing_integration.py index 644551c5b..1bb5a6ddf 100644 --- a/tests/cli/eval/test_eval_tracing_integration.py +++ b/tests/cli/eval/test_eval_tracing_integration.py @@ -1,487 +1,541 @@ """Integration tests for eval tracing flow. -These tests verify the end-to-end span creation and hierarchy in the eval runtime. +These tests verify that the eval runtime code correctly creates spans +with the expected attributes by mocking the tracer. 
""" -import uuid -from typing import Any, Dict, List, Optional +from contextlib import contextmanager +from typing import Any +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from uipath._cli._evals._runtime import UiPathEvalContext, UiPathEvalRuntime +from uipath.eval.evaluators import BaseEvaluator +from uipath.eval.models import NumericEvaluationResult class MockSpan: - """Mock span that captures attributes for testing.""" + """Mock span that captures attributes.""" - def __init__(self, name: str, attributes: Optional[Dict[str, Any]] = None): + def __init__(self, name: str, attributes: dict[str, Any] | None = None): self.name = name self.attributes = attributes or {} self._status = None - def set_status(self, status): + def set_status(self, status: Any) -> None: self._status = status + def __enter__(self) -> "MockSpan": + return self + + def __exit__(self, *args: Any) -> None: + pass -class SpanRecorder: - """Records all spans created during test execution.""" - def __init__(self): - self.spans: List[Dict[str, Any]] = [] - self._span_stack: List[MockSpan] = [] +class SpanCapturingTracer: + """A tracer that captures all created spans for verification.""" + def __init__(self) -> None: + self.captured_spans: list[dict[str, Any]] = [] + + @contextmanager def start_as_current_span( - self, name: str, attributes: Optional[Dict[str, Any]] = None + self, name: str, attributes: dict[str, Any] | None = None ): - """Mock tracer method that records span creation.""" - span_info = { - "name": name, - "attributes": dict(attributes) if attributes else {}, - "parent": self._span_stack[-1].name if self._span_stack else None, - } - self.spans.append(span_info) - - mock_span = MockSpan(name, attributes) - return _SpanContextManager(mock_span, self._span_stack) - - def get_spans_by_type(self, span_type: str) -> List[Dict[str, Any]]: - """Get all spans with the given span_type attribute.""" - return [s for s in self.spans if s["attributes"].get("span_type") == span_type] - - def get_span_by_name(self, name: str) -> Dict[str, Any] | None: + """Capture span creation and yield a mock span.""" + span_info = {"name": name, "attributes": dict(attributes) if attributes else {}} + self.captured_spans.append(span_info) + yield MockSpan(name, attributes) + + def get_spans_by_type(self, span_type: str) -> list[dict[str, Any]]: + """Get all captured spans with the given span_type.""" + return [ + s + for s in self.captured_spans + if s["attributes"].get("span_type") == span_type + ] + + def get_span_by_name(self, name: str) -> dict[str, Any] | None: """Get the first span with the given name.""" - for span in self.spans: + for span in self.captured_spans: if span["name"] == name: return span return None -class _SpanContextManager: - """Context manager for mock spans.""" - - def __init__(self, span: MockSpan, stack: List[MockSpan]): - self.span = span - self.stack = stack - - def __enter__(self): - self.stack.append(self.span) - return self.span - - def __exit__(self, *args): - self.stack.pop() - +def create_eval_context(**kwargs: Any) -> UiPathEvalContext: + """Helper to create UiPathEvalContext with specific attribute values.""" + context = UiPathEvalContext() + for key, value in kwargs.items(): + setattr(context, key, value) + return context + + +class TestEvalSetRunSpanCreation: + """Tests that verify EvalSetRun span is created correctly by the runtime.""" + + @pytest.fixture + def mock_trace_manager(self) -> MagicMock: + """Create a mock trace manager with a capturing tracer.""" + 
trace_manager = MagicMock() + self.capturing_tracer = SpanCapturingTracer() + trace_manager.tracer_provider.get_tracer.return_value = self.capturing_tracer + trace_manager.tracer_span_processors = [] + return trace_manager + + @pytest.fixture + def mock_factory(self) -> MagicMock: + """Create a mock runtime factory.""" + factory = MagicMock() + mock_runtime = AsyncMock() + mock_runtime.get_schema = AsyncMock(return_value=MagicMock()) + factory.new_runtime = AsyncMock(return_value=mock_runtime) + return factory + + @pytest.fixture + def mock_event_bus(self) -> MagicMock: + """Create a mock event bus.""" + event_bus = MagicMock() + event_bus.publish = AsyncMock() + return event_bus + + @pytest.mark.asyncio + async def test_execute_creates_eval_set_run_span( + self, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + ) -> None: + """Test that execute() creates the Evaluation Set Run span.""" + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + ) -class TestEvalSetRunSpanIntegration: - """Integration tests for Evaluation Set Run span.""" + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) - def test_eval_set_run_span_created_first(self): - """Test that Evaluation Set Run span is created as the root span.""" - recorder = SpanRecorder() + # Mock initiate_evaluation to return empty results + mock_eval_set = MagicMock() + mock_eval_set.name = "Test Eval Set" + mock_eval_set.evaluations = [] - # Simulate the span creation from _runtime.py:315-317 - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, + with patch.object( + runtime, + "initiate_evaluation", + new=AsyncMock(return_value=(mock_eval_set, [], iter([]))), ): - pass + try: + await runtime.execute() + except Exception: + pass # We just want to verify span creation - assert len(recorder.spans) == 1 - span = recorder.spans[0] + # Verify the span was created + eval_set_run_spans = self.capturing_tracer.get_spans_by_type("eval_set_run") + assert len(eval_set_run_spans) >= 1 + + span = eval_set_run_spans[0] assert span["name"] == "Evaluation Set Run" assert span["attributes"]["span_type"] == "eval_set_run" - assert span["parent"] is None - - def test_eval_set_run_span_with_run_id(self): - """Test that eval_set_run_id is included when provided.""" - recorder = SpanRecorder() - eval_set_run_id = "custom-run-123" - - span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} - span_attributes["eval_set_run_id"] = eval_set_run_id - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes=span_attributes, - ): - pass - - span = recorder.spans[0] - assert span["attributes"]["eval_set_run_id"] == "custom-run-123" + @pytest.mark.asyncio + async def test_execute_includes_eval_set_run_id_when_provided( + self, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + ) -> None: + """Test that eval_set_run_id is included in span when provided.""" + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + eval_set_run_id="custom-run-123", + ) -class TestEvaluationSpanIntegration: - """Integration tests for Evaluation span.""" + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) - def test_evaluation_span_is_child_of_eval_set_run(self): - """Test that Evaluation span is a 
child of Evaluation Set Run.""" - recorder = SpanRecorder() - execution_id = str(uuid.uuid4()) + mock_eval_set = MagicMock() + mock_eval_set.name = "Test Eval Set" + mock_eval_set.evaluations = [] - # Simulate the nested span creation - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, + with patch.object( + runtime, + "initiate_evaluation", + new=AsyncMock(return_value=(mock_eval_set, [], iter([]))), ): - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": execution_id, - "span_type": "evaluation", - "eval_item_id": "item-1", - "eval_item_name": "Test Item", - }, - ): + try: + await runtime.execute() + except Exception: pass - assert len(recorder.spans) == 2 - - eval_set_run_span = recorder.get_span_by_name("Evaluation Set Run") - evaluation_span = recorder.get_span_by_name("Evaluation") - - assert eval_set_run_span is not None - assert evaluation_span is not None - assert evaluation_span["parent"] == "Evaluation Set Run" - - def test_multiple_evaluation_spans_share_parent(self): - """Test that multiple Evaluation spans share the same parent.""" - recorder = SpanRecorder() - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, - ): - for i in range(3): - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": f"item-{i}", - "eval_item_name": f"Test Item {i}", - }, - ): - pass - - evaluation_spans = recorder.get_spans_by_type("evaluation") - assert len(evaluation_spans) == 3 - - for span in evaluation_spans: - assert span["parent"] == "Evaluation Set Run" - - -class TestEvaluatorSpanIntegration: - """Integration tests for Evaluator span.""" - - def test_evaluator_span_is_child_of_evaluation(self): - """Test that Evaluator span is a child of Evaluation.""" - recorder = SpanRecorder() - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, - ): - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": "item-1", - "eval_item_name": "Test Item", - }, - ): - with recorder.start_as_current_span( - "Evaluator: AccuracyEvaluator", - attributes={ - "span_type": "evaluator", - "evaluator_id": "accuracy-1", - "evaluator_name": "AccuracyEvaluator", - "eval_item_id": "item-1", - }, - ): - pass - - evaluator_span = recorder.spans[-1] - assert evaluator_span["name"] == "Evaluator: AccuracyEvaluator" - assert evaluator_span["parent"] == "Evaluation" - - def test_multiple_evaluator_spans_per_evaluation(self): - """Test that multiple Evaluator spans can be children of one Evaluation.""" - recorder = SpanRecorder() - evaluator_names = ["Accuracy", "Relevance", "Fluency"] - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, - ): - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": "item-1", - "eval_item_name": "Test Item", - }, - ): - for name in evaluator_names: - with recorder.start_as_current_span( - f"Evaluator: {name}", - attributes={ - "span_type": "evaluator", - "evaluator_id": f"{name.lower()}-1", - "evaluator_name": name, - "eval_item_id": "item-1", - }, - ): - pass - - evaluator_spans = recorder.get_spans_by_type("evaluator") - assert len(evaluator_spans) == 3 - - 
for span in evaluator_spans: - assert span["parent"] == "Evaluation" + span = self.capturing_tracer.get_spans_by_type("eval_set_run")[0] + assert span["attributes"]["eval_set_run_id"] == "custom-run-123" -class TestFullSpanHierarchy: - """Integration tests for the complete span hierarchy.""" +class TestEvaluationSpanCreation: + """Tests that verify Evaluation span is created correctly.""" + + @pytest.fixture + def capturing_tracer(self) -> SpanCapturingTracer: + return SpanCapturingTracer() + + @pytest.fixture + def mock_trace_manager(self, capturing_tracer: SpanCapturingTracer) -> MagicMock: + trace_manager = MagicMock() + trace_manager.tracer_provider.get_tracer.return_value = capturing_tracer + trace_manager.tracer_span_processors = [] + return trace_manager + + @pytest.fixture + def mock_factory(self) -> MagicMock: + factory = MagicMock() + mock_runtime = AsyncMock() + mock_runtime.get_schema = AsyncMock(return_value=MagicMock()) + factory.new_runtime = AsyncMock(return_value=mock_runtime) + return factory + + @pytest.fixture + def mock_event_bus(self) -> MagicMock: + event_bus = MagicMock() + event_bus.publish = AsyncMock() + return event_bus + + @pytest.fixture + def mock_eval_item(self) -> Any: + """Create a real EvaluationItem instance for testing.""" + from uipath._cli._evals._models._evaluation_set import EvaluationItem + + return EvaluationItem( + id="item-123", + name="Test Evaluation", + inputs={}, + evaluation_criterias={}, + ) - def test_complete_hierarchy_structure(self): - """Test the complete span hierarchy: EvalSetRun > Evaluation > Evaluator.""" - recorder = SpanRecorder() + @pytest.mark.asyncio + async def test_execute_eval_creates_evaluation_span( + self, + capturing_tracer: SpanCapturingTracer, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + mock_eval_item: Any, + ) -> None: + """Test that _execute_eval creates an Evaluation span with correct attributes.""" + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + ) - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run", "eval_set_run_id": "run-1"}, - ): - for i in range(2): - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": f"item-{i}", - "eval_item_name": f"Test Item {i}", - }, - ): - with recorder.start_as_current_span( - "Evaluator: TestEvaluator", - attributes={ - "span_type": "evaluator", - "evaluator_id": "test-eval", - "evaluator_name": "TestEvaluator", - "eval_item_id": f"item-{i}", - }, - ): - pass - - # Should have: 1 EvalSetRun + 2 Evaluation + 2 Evaluator = 5 spans - assert len(recorder.spans) == 5 - - eval_set_run_spans = recorder.get_spans_by_type("eval_set_run") - evaluation_spans = recorder.get_spans_by_type("evaluation") - evaluator_spans = recorder.get_spans_by_type("evaluator") - - assert len(eval_set_run_spans) == 1 - assert len(evaluation_spans) == 2 - assert len(evaluator_spans) == 2 - - def test_span_attributes_are_complete(self): - """Test that all spans have the required attributes.""" - recorder = SpanRecorder() - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run", "eval_set_run_id": "run-123"}, - ): - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": "exec-456", - "span_type": "evaluation", - "eval_item_id": "item-789", - "eval_item_name": "My Test", - }, - ): - with 
recorder.start_as_current_span( - "Evaluator: Accuracy", - attributes={ - "span_type": "evaluator", - "evaluator_id": "acc-1", - "evaluator_name": "Accuracy", - "eval_item_id": "item-789", - }, - ): - pass - - # Verify EvalSetRun span - eval_set_run = recorder.get_spans_by_type("eval_set_run")[0] - assert eval_set_run["attributes"]["eval_set_run_id"] == "run-123" - - # Verify Evaluation span - evaluation = recorder.get_spans_by_type("evaluation")[0] - assert evaluation["attributes"]["execution.id"] == "exec-456" - assert evaluation["attributes"]["eval_item_id"] == "item-789" - assert evaluation["attributes"]["eval_item_name"] == "My Test" - - # Verify Evaluator span - evaluator = recorder.get_spans_by_type("evaluator")[0] - assert evaluator["attributes"]["evaluator_id"] == "acc-1" - assert evaluator["attributes"]["evaluator_name"] == "Accuracy" - assert evaluator["attributes"]["eval_item_id"] == "item-789" - - -class TestSpanNaming: - """Tests for span naming conventions.""" - - def test_eval_set_run_span_name(self): - """Test that EvalSetRun span has correct name.""" - recorder = SpanRecorder() - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, - ): - pass + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) - assert recorder.spans[0]["name"] == "Evaluation Set Run" + # Mock execute_runtime to return a successful result + mock_execution_output = MagicMock() + mock_execution_output.result.output = {"result": 42} + mock_execution_output.result.status = "successful" + mock_execution_output.result.error = None + mock_execution_output.spans = [] + mock_execution_output.logs = [] - def test_evaluation_span_name(self): - """Test that Evaluation span has correct name.""" - recorder = SpanRecorder() + mock_runtime = AsyncMock() - with recorder.start_as_current_span( - "Evaluation", - attributes={"span_type": "evaluation"}, + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=mock_execution_output), ): - pass - - assert recorder.spans[0]["name"] == "Evaluation" + await runtime._execute_eval(mock_eval_item, [], mock_runtime) + + # Verify Evaluation span was created + evaluation_spans = capturing_tracer.get_spans_by_type("evaluation") + assert len(evaluation_spans) == 1 + + span = evaluation_spans[0] + assert span["name"] == "Evaluation" + assert span["attributes"]["span_type"] == "evaluation" + assert span["attributes"]["eval_item_id"] == "item-123" + assert span["attributes"]["eval_item_name"] == "Test Evaluation" + assert "execution.id" in span["attributes"] + + +class TestEvaluatorSpanCreation: + """Tests that verify Evaluator span is created correctly.""" + + @pytest.fixture + def capturing_tracer(self) -> SpanCapturingTracer: + return SpanCapturingTracer() + + @pytest.fixture + def mock_trace_manager(self, capturing_tracer: SpanCapturingTracer) -> MagicMock: + trace_manager = MagicMock() + trace_manager.tracer_provider.get_tracer.return_value = capturing_tracer + trace_manager.tracer_span_processors = [] + return trace_manager + + @pytest.fixture + def mock_factory(self) -> MagicMock: + factory = MagicMock() + return factory + + @pytest.fixture + def mock_event_bus(self) -> MagicMock: + event_bus = MagicMock() + event_bus.publish = AsyncMock() + return event_bus + + @pytest.fixture + def mock_evaluator(self) -> MagicMock: + evaluator = MagicMock(spec=BaseEvaluator) + evaluator.id = "accuracy-evaluator" + evaluator.name = 
"AccuracyEvaluator" + evaluator.validate_and_evaluate_criteria = AsyncMock( + return_value=NumericEvaluationResult(score=0.95, details="Good accuracy") + ) + return evaluator + + @pytest.fixture + def mock_eval_item(self) -> MagicMock: + eval_item = MagicMock() + eval_item.id = "eval-item-456" + eval_item.name = "Test Item" + eval_item.inputs = {"input": "test"} + eval_item.expected_agent_behavior = None + return eval_item + + @pytest.fixture + def mock_execution_output(self) -> MagicMock: + output = MagicMock() + output.result.output = {"result": 42} + output.spans = [] + return output + + @pytest.mark.asyncio + async def test_run_evaluator_creates_evaluator_span( + self, + capturing_tracer: SpanCapturingTracer, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + mock_evaluator: MagicMock, + mock_eval_item: MagicMock, + mock_execution_output: MagicMock, + ) -> None: + """Test that run_evaluator creates an Evaluator span with correct attributes.""" + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + ) - def test_evaluator_span_name_format(self): - """Test that Evaluator span name follows the pattern 'Evaluator: {name}'.""" - recorder = SpanRecorder() - evaluator_name = "MyCustomEvaluator" + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) - with recorder.start_as_current_span( - f"Evaluator: {evaluator_name}", - attributes={ - "span_type": "evaluator", - "evaluator_name": evaluator_name, - }, - ): - pass + await runtime.run_evaluator( + evaluator=mock_evaluator, + execution_output=mock_execution_output, + eval_item=mock_eval_item, + evaluation_criteria=None, + ) - span = recorder.spans[0] - assert span["name"] == "Evaluator: MyCustomEvaluator" - assert span["name"].startswith("Evaluator: ") + # Verify Evaluator span was created + evaluator_spans = capturing_tracer.get_spans_by_type("evaluator") + assert len(evaluator_spans) == 1 + + span = evaluator_spans[0] + assert span["name"] == "Evaluator: AccuracyEvaluator" + assert span["attributes"]["span_type"] == "evaluator" + assert span["attributes"]["evaluator_id"] == "accuracy-evaluator" + assert span["attributes"]["evaluator_name"] == "AccuracyEvaluator" + assert span["attributes"]["eval_item_id"] == "eval-item-456" + + @pytest.mark.asyncio + async def test_multiple_evaluators_create_multiple_spans( + self, + capturing_tracer: SpanCapturingTracer, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + mock_eval_item: MagicMock, + mock_execution_output: MagicMock, + ) -> None: + """Test that running multiple evaluators creates multiple spans.""" + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + ) + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) -class TestExecutionIdTracking: - """Tests for execution.id tracking in spans.""" + evaluator_names = ["Accuracy", "Relevance", "Fluency"] + for name in evaluator_names: + evaluator = MagicMock(spec=BaseEvaluator) + evaluator.id = f"{name.lower()}-id" + evaluator.name = name + evaluator.validate_and_evaluate_criteria = AsyncMock( + return_value=NumericEvaluationResult(score=0.9) + ) + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=mock_execution_output, + eval_item=mock_eval_item, + evaluation_criteria=None, + ) + + evaluator_spans = 
capturing_tracer.get_spans_by_type("evaluator") + assert len(evaluator_spans) == 3 - def test_each_evaluation_has_unique_execution_id(self): + span_names = [s["name"] for s in evaluator_spans] + assert "Evaluator: Accuracy" in span_names + assert "Evaluator: Relevance" in span_names + assert "Evaluator: Fluency" in span_names + + +class TestSpanAttributeValues: + """Tests for verifying specific span attribute values.""" + + @pytest.fixture + def capturing_tracer(self) -> SpanCapturingTracer: + return SpanCapturingTracer() + + @pytest.fixture + def mock_trace_manager(self, capturing_tracer: SpanCapturingTracer) -> MagicMock: + trace_manager = MagicMock() + trace_manager.tracer_provider.get_tracer.return_value = capturing_tracer + trace_manager.tracer_span_processors = [] + return trace_manager + + @pytest.fixture + def mock_factory(self) -> MagicMock: + factory = MagicMock() + return factory + + @pytest.fixture + def mock_event_bus(self) -> MagicMock: + event_bus = MagicMock() + event_bus.publish = AsyncMock() + return event_bus + + @pytest.mark.asyncio + async def test_evaluation_span_has_unique_execution_id( + self, + capturing_tracer: SpanCapturingTracer, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + ) -> None: """Test that each Evaluation span gets a unique execution.id.""" - recorder = SpanRecorder() - execution_ids = [] - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, - ): - for i in range(3): - exec_id = str(uuid.uuid4()) - execution_ids.append(exec_id) - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": exec_id, - "span_type": "evaluation", - "eval_item_id": f"item-{i}", - "eval_item_name": f"Item {i}", - }, - ): - pass - - # Verify all execution IDs are unique - assert len(set(execution_ids)) == 3 - - # Verify each evaluation span has its execution.id - evaluation_spans = recorder.get_spans_by_type("evaluation") - for i, span in enumerate(evaluation_spans): - assert span["attributes"]["execution.id"] == execution_ids[i] - - def test_eval_set_run_does_not_have_execution_id(self): - """Test that EvalSetRun span does NOT have execution.id. - - This is intentional to prevent ID propagation to child spans. 
- """ - recorder = SpanRecorder() - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, - ): - pass + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + ) - eval_set_run = recorder.spans[0] - assert "execution.id" not in eval_set_run["attributes"] + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) + mock_runtime = AsyncMock() + mock_execution_output = MagicMock() + mock_execution_output.result.output = {} + mock_execution_output.result.status = "successful" + mock_execution_output.result.error = None + mock_execution_output.spans = [] + mock_execution_output.logs = [] + + from uipath._cli._evals._models._evaluation_set import EvaluationItem + + for i in range(3): + eval_item = EvaluationItem( + id=f"item-{i}", + name=f"Test {i}", + inputs={}, + evaluation_criterias={}, + ) + + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=mock_execution_output), + ): + await runtime._execute_eval(eval_item, [], mock_runtime) -class TestEvaluatorSpanEvalItemId: - """Tests for eval_item_id in evaluator spans.""" + # Get execution IDs from spans + evaluation_spans = capturing_tracer.get_spans_by_type("evaluation") + execution_ids = [s["attributes"]["execution.id"] for s in evaluation_spans] - def test_evaluator_span_has_eval_item_id(self): - """Test that Evaluator span includes the eval_item_id.""" - recorder = SpanRecorder() - eval_item_id = "item-specific-123" + # All execution IDs should be unique + assert len(set(execution_ids)) == 3 - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": eval_item_id, - "eval_item_name": "Test", - }, - ): - with recorder.start_as_current_span( - "Evaluator: Test", - attributes={ - "span_type": "evaluator", - "evaluator_id": "test-1", - "evaluator_name": "Test", - "eval_item_id": eval_item_id, - }, - ): - pass + @pytest.mark.asyncio + async def test_evaluator_span_inherits_eval_item_id( + self, + capturing_tracer: SpanCapturingTracer, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + ) -> None: + """Test that Evaluator span contains the same eval_item_id as its parent Evaluation.""" + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + ) - evaluator_span = recorder.get_spans_by_type("evaluator")[0] - assert evaluator_span["attributes"]["eval_item_id"] == eval_item_id - - def test_evaluator_and_evaluation_share_eval_item_id(self): - """Test that Evaluator and Evaluation spans share the same eval_item_id.""" - recorder = SpanRecorder() - eval_item_id = "shared-item-456" - - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": eval_item_id, - "eval_item_name": "Test", - }, - ): - with recorder.start_as_current_span( - "Evaluator: Test", - attributes={ - "span_type": "evaluator", - "evaluator_id": "test-1", - "evaluator_name": "Test", - "eval_item_id": eval_item_id, - }, - ): - pass + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) - evaluation_span = recorder.get_spans_by_type("evaluation")[0] - evaluator_span = recorder.get_spans_by_type("evaluator")[0] + eval_item = MagicMock() + eval_item.id = 
"shared-item-id-789" + eval_item.name = "Test" + eval_item.inputs = {} + eval_item.expected_agent_behavior = None + + mock_execution_output = MagicMock() + mock_execution_output.result.output = {} + mock_execution_output.spans = [] + + evaluator = MagicMock(spec=BaseEvaluator) + evaluator.id = "test-evaluator" + evaluator.name = "TestEvaluator" + evaluator.validate_and_evaluate_criteria = AsyncMock( + return_value=NumericEvaluationResult(score=1.0) + ) - assert ( - evaluation_span["attributes"]["eval_item_id"] - == evaluator_span["attributes"]["eval_item_id"] + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=mock_execution_output, + eval_item=eval_item, + evaluation_criteria=None, ) + + evaluator_span = capturing_tracer.get_spans_by_type("evaluator")[0] + assert evaluator_span["attributes"]["eval_item_id"] == "shared-item-id-789"