From a544e16653bbda8ab7fc626d2e5ef983f32f5dbd Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 15:08:22 -0800 Subject: [PATCH 01/12] feat: rearrange the eval trace, expose new telemetry client and send eval set events --- pyproject.toml | 1 + src/uipath/_cli/_evals/_runtime.py | 470 +++++++++++++++------------ src/uipath/_cli/_evals/_telemetry.py | 280 ++++++++++++++++ src/uipath/_cli/cli_eval.py | 4 + src/uipath/telemetry/__init__.py | 9 +- src/uipath/telemetry/_track.py | 197 ++++++++++- uv.lock | 11 + 7 files changed, 759 insertions(+), 213 deletions(-) create mode 100644 src/uipath/_cli/_evals/_telemetry.py diff --git a/pyproject.toml b/pyproject.toml index ea5a6854a..85f0f2f66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ "python-socketio>=5.15.0, <6.0.0", "coverage>=7.8.2", "mermaid-builder==0.0.3", + "applicationinsights>=0.11.10", ] classifiers = [ "Intended Audience :: Developers", diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index 8db78ab43..aaab896d9 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -20,6 +20,7 @@ from opentelemetry import context as context_api from opentelemetry.sdk.trace import ReadableSpan, Span from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult +from opentelemetry.trace import Status, StatusCode from pydantic import BaseModel from uipath.core.tracing import UiPathTraceManager from uipath.core.tracing.processors import UiPathExecutionBatchTraceProcessor @@ -299,60 +300,91 @@ async def execute(self) -> UiPathRuntimeResult: ) try: with self._mocker_cache(): - ( - evaluation_set, - evaluators, - evaluation_iterable, - ) = await self.initiate_evaluation(runtime) - workers = self.context.workers or 1 - assert workers >= 1 - eval_run_result_list = await execute_parallel( - evaluation_iterable, workers - ) - results = UiPathEvalOutput( - evaluation_set_name=evaluation_set.name, - evaluation_set_results=eval_run_result_list, - ) - - # Computing evaluator averages - evaluator_averages: dict[str, float] = defaultdict(float) - evaluator_count: dict[str, int] = defaultdict(int) - - # Check if any eval runs failed - any_failed = False - for eval_run_result in results.evaluation_set_results: - # Check if the agent execution had an error - if ( - eval_run_result.agent_execution_output - and eval_run_result.agent_execution_output.result.error - ): - any_failed = True - - for result_dto in eval_run_result.evaluation_run_results: - evaluator_averages[result_dto.evaluator_id] += ( - result_dto.result.score + # Create the parent "Evaluation set run" span + # Use tracer from trace_manager's provider to ensure spans go through + # the ExecutionSpanProcessor + # NOTE: Do NOT set execution.id on this parent span, as the mixin in + # UiPathExecutionBatchTraceProcessor propagates execution.id from parent + # to child spans, which would overwrite the per-eval execution.id + tracer = self.trace_manager.tracer_provider.get_tracer(__name__) + span_attributes: dict[str, str] = { + "span_type": "eval_set_run", + } + if self.context.eval_set_run_id: + span_attributes["eval_set_run_id"] = self.context.eval_set_run_id + with tracer.start_as_current_span( + "Evaluation Set Run", attributes=span_attributes + ) as span: + try: + ( + evaluation_set, + evaluators, + evaluation_iterable, + ) = await self.initiate_evaluation(runtime) + workers = self.context.workers or 1 + assert workers >= 1 + eval_run_result_list = await execute_parallel( + 
evaluation_iterable, workers + ) + results = UiPathEvalOutput( + evaluation_set_name=evaluation_set.name, + evaluation_set_results=eval_run_result_list, ) - evaluator_count[result_dto.evaluator_id] += 1 - for eval_id in evaluator_averages: - evaluator_averages[eval_id] = ( - evaluator_averages[eval_id] / evaluator_count[eval_id] - ) - await self.event_bus.publish( - EvaluationEvents.UPDATE_EVAL_SET_RUN, - EvalSetRunUpdatedEvent( - execution_id=self.execution_id, - evaluator_scores=evaluator_averages, - success=not any_failed, - ), - wait_for_completion=False, - ) + # Computing evaluator averages + evaluator_averages: dict[str, float] = defaultdict(float) + evaluator_count: dict[str, int] = defaultdict(int) + + # Check if any eval runs failed + any_failed = False + for eval_run_result in results.evaluation_set_results: + # Check if the agent execution had an error + if ( + eval_run_result.agent_execution_output + and eval_run_result.agent_execution_output.result.error + ): + any_failed = True + + for result_dto in eval_run_result.evaluation_run_results: + evaluator_averages[result_dto.evaluator_id] += ( + result_dto.result.score + ) + evaluator_count[result_dto.evaluator_id] += 1 + + for eval_id in evaluator_averages: + evaluator_averages[eval_id] = ( + evaluator_averages[eval_id] / evaluator_count[eval_id] + ) + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_SET_RUN, + EvalSetRunUpdatedEvent( + execution_id=self.execution_id, + evaluator_scores=evaluator_averages, + success=not any_failed, + ), + wait_for_completion=False, + ) - result = UiPathRuntimeResult( - output={**results.model_dump(by_alias=True)}, - status=UiPathRuntimeStatus.SUCCESSFUL, - ) - return result + result = UiPathRuntimeResult( + output={**results.model_dump(by_alias=True)}, + status=UiPathRuntimeStatus.SUCCESSFUL, + ) + return result + except Exception as e: + # Set span status to ERROR on exception + span.set_status(Status(StatusCode.ERROR, str(e))) + + # Publish failure event for eval set run + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_SET_RUN, + EvalSetRunUpdatedEvent( + execution_id=self.execution_id, + evaluator_scores={}, + success=False, + ), + wait_for_completion=False, + ) + raise finally: await runtime.dispose() @@ -378,165 +410,180 @@ async def _execute_eval( ), ) - evaluation_run_results = EvaluationRunResult( - evaluation_name=eval_item.name, evaluation_run_results=[] - ) + # Create the "Evaluation" span for this eval item + # Use tracer from trace_manager's provider to ensure spans go through + # the ExecutionSpanProcessor + tracer = self.trace_manager.tracer_provider.get_tracer(__name__) + with tracer.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": execution_id, + "span_type": "evaluation", + "eval_item_id": eval_item.id, + "eval_item_name": eval_item.name, + }, + ): + evaluation_run_results = EvaluationRunResult( + evaluation_name=eval_item.name, evaluation_run_results=[] + ) - try: try: - agent_execution_output = await self.execute_runtime( - eval_item, execution_id, runtime - ) - except Exception as e: - if self.context.verbose: - if isinstance(e, EvaluationRuntimeException): - spans = e.spans - logs = e.logs - execution_time = e.execution_time - loggable_error = e.root_exception - else: - spans = [] - logs = [] - execution_time = 0 - loggable_error = e - - error_info = UiPathErrorContract( - code="RUNTIME_SHUTDOWN_ERROR", - title="Runtime shutdown failed", - detail=f"Error: {str(loggable_error)}", - category=UiPathErrorCategory.UNKNOWN, - ) - 
error_result = UiPathRuntimeResult( - status=UiPathRuntimeStatus.FAULTED, - error=error_info, + try: + agent_execution_output = await self.execute_runtime( + eval_item, execution_id, runtime ) + except Exception as e: + if self.context.verbose: + if isinstance(e, EvaluationRuntimeException): + spans = e.spans + logs = e.logs + execution_time = e.execution_time + loggable_error = e.root_exception + else: + spans = [] + logs = [] + execution_time = 0 + loggable_error = e + + error_info = UiPathErrorContract( + code="RUNTIME_SHUTDOWN_ERROR", + title="Runtime shutdown failed", + detail=f"Error: {str(loggable_error)}", + category=UiPathErrorCategory.UNKNOWN, + ) + error_result = UiPathRuntimeResult( + status=UiPathRuntimeStatus.FAULTED, + error=error_info, + ) + evaluation_run_results.agent_execution_output = ( + convert_eval_execution_output_to_serializable( + UiPathEvalRunExecutionOutput( + execution_time=execution_time, + result=error_result, + spans=spans, + logs=logs, + ) + ) + ) + raise + + if self.context.verbose: evaluation_run_results.agent_execution_output = ( convert_eval_execution_output_to_serializable( - UiPathEvalRunExecutionOutput( - execution_time=execution_time, - result=error_result, - spans=spans, - logs=logs, - ) + agent_execution_output ) ) - raise - - if self.context.verbose: - evaluation_run_results.agent_execution_output = ( - convert_eval_execution_output_to_serializable( - agent_execution_output + evaluation_item_results: list[EvalItemResult] = [] + + for evaluator in evaluators: + if evaluator.id not in eval_item.evaluation_criterias: + # Skip! + continue + evaluation_criteria = eval_item.evaluation_criterias[evaluator.id] + + evaluation_result = await self.run_evaluator( + evaluator=evaluator, + execution_output=agent_execution_output, + eval_item=eval_item, + evaluation_criteria=evaluator.evaluation_criteria_type( + **evaluation_criteria + ) + if evaluation_criteria + else evaluator.evaluator_config.default_evaluation_criteria, ) - ) - evaluation_item_results: list[EvalItemResult] = [] - for evaluator in evaluators: - if evaluator.id not in eval_item.evaluation_criterias: - # Skip! 
- continue - evaluation_criteria = eval_item.evaluation_criterias[evaluator.id] - - evaluation_result = await self.run_evaluator( - evaluator=evaluator, - execution_output=agent_execution_output, - eval_item=eval_item, - evaluation_criteria=evaluator.evaluation_criteria_type( - **evaluation_criteria + dto_result = EvaluationResultDto.from_evaluation_result( + evaluation_result ) - if evaluation_criteria - else evaluator.evaluator_config.default_evaluation_criteria, - ) - dto_result = EvaluationResultDto.from_evaluation_result( - evaluation_result - ) - - evaluation_run_results.evaluation_run_results.append( - EvaluationRunResultDto( - evaluator_name=evaluator.name, - result=dto_result, - evaluator_id=evaluator.id, + evaluation_run_results.evaluation_run_results.append( + EvaluationRunResultDto( + evaluator_name=evaluator.name, + result=dto_result, + evaluator_id=evaluator.id, + ) ) - ) - evaluation_item_results.append( - EvalItemResult( - evaluator_id=evaluator.id, - result=evaluation_result, + evaluation_item_results.append( + EvalItemResult( + evaluator_id=evaluator.id, + result=evaluation_result, + ) ) + + exception_details = None + agent_output = agent_execution_output.result.output + if agent_execution_output.result.status == UiPathRuntimeStatus.FAULTED: + error = agent_execution_output.result.error + if error is not None: + # we set the exception details for the run event + # Convert error contract to exception + error_exception = Exception( + f"{error.title}: {error.detail} (code: {error.code})" + ) + exception_details = EvalItemExceptionDetails( + exception=error_exception + ) + agent_output = error.model_dump() + + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_RUN, + EvalRunUpdatedEvent( + execution_id=execution_id, + eval_item=eval_item, + eval_results=evaluation_item_results, + success=not agent_execution_output.result.error, + agent_output=agent_output, + agent_execution_time=agent_execution_output.execution_time, + spans=agent_execution_output.spans, + logs=agent_execution_output.logs, + exception_details=exception_details, + ), + wait_for_completion=False, ) - exception_details = None - agent_output = agent_execution_output.result.output - if agent_execution_output.result.status == UiPathRuntimeStatus.FAULTED: - error = agent_execution_output.result.error - if error is not None: - # we set the exception details for the run event - # Convert error contract to exception - error_exception = Exception( - f"{error.title}: {error.detail} (code: {error.code})" - ) - exception_details = EvalItemExceptionDetails( - exception=error_exception + except Exception as e: + exception_details = EvalItemExceptionDetails(exception=e) + + for evaluator in evaluators: + evaluation_run_results.evaluation_run_results.append( + EvaluationRunResultDto( + evaluator_name=evaluator.name, + evaluator_id=evaluator.id, + result=EvaluationResultDto(score=0), + ) ) - agent_output = error.model_dump() - await self.event_bus.publish( - EvaluationEvents.UPDATE_EVAL_RUN, - EvalRunUpdatedEvent( + eval_run_updated_event = EvalRunUpdatedEvent( execution_id=execution_id, eval_item=eval_item, - eval_results=evaluation_item_results, - success=not agent_execution_output.result.error, - agent_output=agent_output, - agent_execution_time=agent_execution_output.execution_time, - spans=agent_execution_output.spans, - logs=agent_execution_output.logs, + eval_results=[], + success=False, + agent_output={}, + agent_execution_time=0.0, exception_details=exception_details, - ), - wait_for_completion=False, - ) - - 
except Exception as e: - exception_details = EvalItemExceptionDetails(exception=e) - - for evaluator in evaluators: - evaluation_run_results.evaluation_run_results.append( - EvaluationRunResultDto( - evaluator_name=evaluator.name, - evaluator_id=evaluator.id, - result=EvaluationResultDto(score=0), - ) + spans=[], + logs=[], ) + if isinstance(e, EvaluationRuntimeException): + eval_run_updated_event.spans = e.spans + eval_run_updated_event.logs = e.logs + if eval_run_updated_event.exception_details: + eval_run_updated_event.exception_details.exception = ( + e.root_exception + ) + eval_run_updated_event.exception_details.runtime_exception = ( + True + ) - eval_run_updated_event = EvalRunUpdatedEvent( - execution_id=execution_id, - eval_item=eval_item, - eval_results=[], - success=False, - agent_output={}, - agent_execution_time=0.0, - exception_details=exception_details, - spans=[], - logs=[], - ) - if isinstance(e, EvaluationRuntimeException): - eval_run_updated_event.spans = e.spans - eval_run_updated_event.logs = e.logs - if eval_run_updated_event.exception_details: - eval_run_updated_event.exception_details.exception = ( - e.root_exception - ) - eval_run_updated_event.exception_details.runtime_exception = True - - await self.event_bus.publish( - EvaluationEvents.UPDATE_EVAL_RUN, - eval_run_updated_event, - wait_for_completion=False, - ) - finally: - clear_execution_context() + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_RUN, + eval_run_updated_event, + wait_for_completion=False, + ) + finally: + clear_execution_context() - return evaluation_run_results + return evaluation_run_results async def _generate_input_for_eval( self, eval_item: EvaluationItem, runtime: UiPathRuntimeProtocol @@ -678,26 +725,39 @@ async def run_evaluator( *, evaluation_criteria: Any, ) -> EvaluationResult: - output_data: dict[str, Any] | str = {} - if execution_output.result.output: - if isinstance(execution_output.result.output, BaseModel): - output_data = execution_output.result.output.model_dump() - else: - output_data = execution_output.result.output - - agent_execution = AgentExecution( - agent_input=eval_item.inputs, - agent_output=output_data, - agent_trace=execution_output.spans, - expected_agent_behavior=eval_item.expected_agent_behavior, - ) + # Create span for evaluator execution + # Use tracer from trace_manager's provider to ensure spans go through + # the ExecutionSpanProcessor + tracer = self.trace_manager.tracer_provider.get_tracer(__name__) + with tracer.start_as_current_span( + f"Evaluator: {evaluator.name}", + attributes={ + "span_type": "evaluator", + "evaluator_id": evaluator.id, + "evaluator_name": evaluator.name, + "eval_item_id": eval_item.id, + }, + ): + output_data: dict[str, Any] | str = {} + if execution_output.result.output: + if isinstance(execution_output.result.output, BaseModel): + output_data = execution_output.result.output.model_dump() + else: + output_data = execution_output.result.output + + agent_execution = AgentExecution( + agent_input=eval_item.inputs, + agent_output=output_data, + agent_trace=execution_output.spans, + expected_agent_behavior=eval_item.expected_agent_behavior, + ) - result = await evaluator.validate_and_evaluate_criteria( - agent_execution=agent_execution, - evaluation_criteria=evaluation_criteria, - ) + result = await evaluator.validate_and_evaluate_criteria( + agent_execution=agent_execution, + evaluation_criteria=evaluation_criteria, + ) - return result + return result async def _get_agent_model(self, runtime: UiPathRuntimeProtocol) -> 
str | None: """Get agent model from the runtime. diff --git a/src/uipath/_cli/_evals/_telemetry.py b/src/uipath/_cli/_evals/_telemetry.py new file mode 100644 index 000000000..006ed3a14 --- /dev/null +++ b/src/uipath/_cli/_evals/_telemetry.py @@ -0,0 +1,280 @@ +"""Telemetry subscriber for sending evaluation events to Application Insights. + +This subscriber listens to evaluation lifecycle events and sends custom telemetry +events to Application Insights for monitoring and analytics. +""" + +import logging +import os +import time +from typing import Any, Dict + +from uipath._events._event_bus import EventBus +from uipath._events._events import ( + EvalRunCreatedEvent, + EvalRunUpdatedEvent, + EvalSetRunCreatedEvent, + EvalSetRunUpdatedEvent, + EvaluationEvents, +) +from uipath.telemetry import is_telemetry_enabled, track_event + +logger = logging.getLogger(__name__) + +# Telemetry event names for Application Insights +EVAL_SET_RUN_STARTED = "EvalSetRun.Start" +EVAL_SET_RUN_COMPLETED = "EvalSetRun.End" +EVAL_SET_RUN_FAILED = "EvalSetRun.Failed" +EVAL_RUN_STARTED = "EvalRun.Start" +EVAL_RUN_COMPLETED = "EvalRun.End" +EVAL_RUN_FAILED = "EvalRun.Failed" + + +class EvalTelemetrySubscriber: + """Subscribes to evaluation events and sends telemetry to Application Insights. + + This subscriber listens to the evaluation event bus and tracks: + - Eval set run start/complete/fail events + - Eval run start/complete/fail events + + Telemetry is sent asynchronously and failures are silently ignored to ensure + evaluation execution is never blocked by telemetry issues. + + Usage: + event_bus = EventBus() + telemetry_subscriber = EvalTelemetrySubscriber() + await telemetry_subscriber.subscribe_to_eval_runtime_events(event_bus) + """ + + def __init__(self) -> None: + """Initialize the telemetry subscriber.""" + self._eval_set_start_times: Dict[str, float] = {} + self._eval_run_start_times: Dict[str, float] = {} + self._eval_set_info: Dict[str, Dict[str, Any]] = {} + self._eval_run_info: Dict[str, Dict[str, Any]] = {} + + async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None: + """Subscribe to evaluation runtime events. + + Args: + event_bus: The event bus to subscribe to. + """ + if not is_telemetry_enabled(): + logger.debug("Telemetry disabled, skipping subscription") + return + + event_bus.subscribe( + EvaluationEvents.CREATE_EVAL_SET_RUN, self._on_eval_set_run_created + ) + event_bus.subscribe(EvaluationEvents.CREATE_EVAL_RUN, self._on_eval_run_created) + event_bus.subscribe(EvaluationEvents.UPDATE_EVAL_RUN, self._on_eval_run_updated) + event_bus.subscribe( + EvaluationEvents.UPDATE_EVAL_SET_RUN, self._on_eval_set_run_updated + ) + + logger.debug("Telemetry subscriber subscribed to evaluation events") + + async def _on_eval_set_run_created(self, event: EvalSetRunCreatedEvent) -> None: + """Handle eval set run created event. + + Args: + event: The eval set run created event. 
+ """ + try: + self._eval_set_start_times[event.execution_id] = time.time() + self._eval_set_info[event.execution_id] = { + "eval_set_id": event.eval_set_id, + "eval_set_run_id": event.eval_set_run_id, + "entrypoint": event.entrypoint, + "no_of_evals": event.no_of_evals, + "evaluator_count": len(event.evaluators), + } + + properties: Dict[str, Any] = { + "EvalSetId": event.eval_set_id, + "Entrypoint": event.entrypoint, + "EvalCount": event.no_of_evals, + "EvaluatorCount": len(event.evaluators), + } + + if event.eval_set_run_id: + properties["EvalSetRunId"] = event.eval_set_run_id + + self._enrich_properties(properties) + + track_event(EVAL_SET_RUN_STARTED, properties) + logger.debug(f"Tracked eval set run started: {event.eval_set_id}") + + except Exception as e: + logger.debug(f"Error tracking eval set run started: {e}") + + async def _on_eval_run_created(self, event: EvalRunCreatedEvent) -> None: + """Handle eval run created event. + + Args: + event: The eval run created event. + """ + try: + self._eval_run_start_times[event.execution_id] = time.time() + self._eval_run_info[event.execution_id] = { + "eval_item_id": event.eval_item.id, + "eval_item_name": event.eval_item.name, + } + + properties: Dict[str, Any] = { + "EvalItemId": event.eval_item.id, + "EvalItemName": event.eval_item.name, + } + + self._enrich_properties(properties) + + track_event(EVAL_RUN_STARTED, properties) + logger.debug(f"Tracked eval run started: {event.eval_item.id}") + + except Exception as e: + logger.debug(f"Error tracking eval run started: {e}") + + async def _on_eval_run_updated(self, event: EvalRunUpdatedEvent) -> None: + """Handle eval run updated (completed/failed) event. + + Args: + event: The eval run updated event. + """ + try: + # Calculate duration + start_time = self._eval_run_start_times.pop(event.execution_id, None) + duration_ms = int((time.time() - start_time) * 1000) if start_time else None + + # Get stored info + run_info = self._eval_run_info.pop(event.execution_id, {}) + + # Calculate average score + scores = [ + r.result.score for r in event.eval_results if r.result.score is not None + ] + avg_score = sum(scores) / len(scores) if scores else None + + properties: Dict[str, Any] = { + "EvalItemId": run_info.get("eval_item_id", event.eval_item.id), + "EvalItemName": run_info.get("eval_item_name", event.eval_item.name), + "Success": event.success, + "EvaluatorCount": len(event.eval_results), + } + + if duration_ms is not None: + properties["DurationMs"] = duration_ms + + if avg_score is not None: + properties["AverageScore"] = avg_score + + if event.agent_execution_time: + properties["AgentExecutionTimeMs"] = int( + event.agent_execution_time * 1000 + ) + + if event.exception_details: + properties["ErrorType"] = type( + event.exception_details.exception + ).__name__ + properties["ErrorMessage"] = str(event.exception_details.exception)[ + :500 + ] + properties["IsRuntimeException"] = ( + event.exception_details.runtime_exception + ) + + self._enrich_properties(properties) + + event_name = EVAL_RUN_COMPLETED if event.success else EVAL_RUN_FAILED + track_event(event_name, properties) + logger.debug( + f"Tracked eval run {'completed' if event.success else 'failed'}: {event.eval_item.id}" + ) + + except Exception as e: + logger.debug(f"Error tracking eval run updated: {e}") + + async def _on_eval_set_run_updated(self, event: EvalSetRunUpdatedEvent) -> None: + """Handle eval set run updated (completed/failed) event. + + Args: + event: The eval set run updated event. 
+ """ + try: + # Calculate duration + start_time = self._eval_set_start_times.pop(event.execution_id, None) + duration_ms = int((time.time() - start_time) * 1000) if start_time else None + + # Get stored info + set_info = self._eval_set_info.pop(event.execution_id, {}) + + # Calculate overall average score + scores = list(event.evaluator_scores.values()) + avg_score = sum(scores) / len(scores) if scores else None + + properties: Dict[str, Any] = { + "EvalSetId": set_info.get("eval_set_id", "unknown"), + "Success": event.success, + "EvaluatorCount": len(event.evaluator_scores), + } + + if set_info.get("eval_set_run_id"): + properties["EvalSetRunId"] = set_info["eval_set_run_id"] + + if set_info.get("entrypoint"): + properties["Entrypoint"] = set_info["entrypoint"] + + if set_info.get("no_of_evals"): + properties["EvalCount"] = set_info["no_of_evals"] + + if duration_ms is not None: + properties["DurationMs"] = duration_ms + + if avg_score is not None: + properties["AverageScore"] = avg_score + + # Add individual evaluator scores + for evaluator_id, score in event.evaluator_scores.items(): + # Sanitize evaluator ID for use as property key + safe_key = f"Score_{evaluator_id.replace('-', '_')[:50]}" + properties[safe_key] = score + + self._enrich_properties(properties) + + event_name = ( + EVAL_SET_RUN_COMPLETED if event.success else EVAL_SET_RUN_FAILED + ) + track_event(event_name, properties) + logger.debug( + f"Tracked eval set run {'completed' if event.success else 'failed'}" + ) + + except Exception as e: + logger.debug(f"Error tracking eval set run updated: {e}") + + def _enrich_properties(self, properties: Dict[str, Any]) -> None: + """Enrich properties with common context information. + + Args: + properties: The properties dictionary to enrich. 
+ """ + # Add UiPath context + project_id = os.getenv("UIPATH_PROJECT_ID") + if project_id: + properties["ProjectId"] = project_id + + org_id = os.getenv("UIPATH_CLOUD_ORGANIZATION_ID") + if org_id: + properties["CloudOrganizationId"] = org_id + + user_id = os.getenv("UIPATH_CLOUD_USER_ID") + if user_id: + properties["CloudUserId"] = user_id + + tenant_id = os.getenv("UIPATH_TENANT_ID") + if tenant_id: + properties["TenantId"] = tenant_id + + # Add source identifier + properties["Source"] = "uipath-python-cli" + properties["ApplicationName"] = "UiPath.Eval" diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index 736d82ae8..9b3548411 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -16,6 +16,7 @@ from uipath._cli._utils._studio_project import StudioClient from uipath._cli.middlewares import Middlewares from uipath._events._event_bus import EventBus +from uipath._cli._evals._telemetry import EvalTelemetrySubscriber from uipath._utils._bindings import ResourceOverwritesContext from uipath.eval._helpers import auto_discover_entrypoint from uipath.platform.common import UiPathConfig @@ -170,6 +171,9 @@ async def execute_eval(): console_reporter = ConsoleProgressReporter() await console_reporter.subscribe_to_eval_runtime_events(event_bus) + telemetry_subscriber = EvalTelemetrySubscriber() + await telemetry_subscriber.subscribe_to_eval_runtime_events(event_bus) + trace_manager = UiPathTraceManager() with UiPathRuntimeContext.with_defaults( diff --git a/src/uipath/telemetry/__init__.py b/src/uipath/telemetry/__init__.py index 9cdb01537..9c4433e5f 100644 --- a/src/uipath/telemetry/__init__.py +++ b/src/uipath/telemetry/__init__.py @@ -1,3 +1,8 @@ -from ._track import track # noqa: D104 +from ._track import ( # noqa: D104 + flush_events, + is_telemetry_enabled, + track, + track_event, +) -__all__ = ["track"] +__all__ = ["track", "track_event", "is_telemetry_enabled", "flush_events"] diff --git a/src/uipath/telemetry/_track.py b/src/uipath/telemetry/_track.py index fb471aa04..cc1c5f547 100644 --- a/src/uipath/telemetry/_track.py +++ b/src/uipath/telemetry/_track.py @@ -32,6 +32,35 @@ _UNKNOWN, ) +# Try to import Application Insights client for custom events +try: + from applicationinsights import TelemetryClient as AppInsightsTelemetryClient + + _HAS_APPINSIGHTS = True +except ImportError: + _HAS_APPINSIGHTS = False + AppInsightsTelemetryClient = None # type: ignore[misc, assignment] + + +def _parse_connection_string(connection_string: str) -> Optional[str]: + """Parse Azure Application Insights connection string to get instrumentation key. + + Args: + connection_string: The full connection string from Azure. + + Returns: + The instrumentation key if found, None otherwise. + """ + try: + parts = {} + for part in connection_string.split(";"): + if "=" in part: + key, value = part.split("=", 1) + parts[key] = value + return parts.get("InstrumentationKey") + except Exception: + return None + _logger = getLogger(__name__) _logger.propagate = False @@ -83,16 +112,99 @@ def _get_attributes(record: LogRecord) -> Mapping[str, AnyValue]: return attributes +class _AppInsightsEventClient: + """Application Insights SDK client for sending custom events. + + This uses the applicationinsights SDK to send events directly to the + customEvents table in Application Insights. 
+ """ + + _initialized = False + _client: Optional[Any] = None + + @staticmethod + def _initialize() -> None: + """Initialize Application Insights client for custom events.""" + if _AppInsightsEventClient._initialized: + return + + _AppInsightsEventClient._initialized = True + + if not _HAS_APPINSIGHTS: + return + + connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") + if not connection_string: + return + + try: + instrumentation_key = _parse_connection_string(connection_string) + if not instrumentation_key: + return + + _AppInsightsEventClient._client = AppInsightsTelemetryClient( + instrumentation_key + ) + except Exception: + # Silently fail - telemetry should never break the main application + pass + + @staticmethod + def track_event( + name: str, + properties: Optional[Dict[str, Any]] = None, + ) -> None: + """Track a custom event to Application Insights customEvents table. + + Args: + name: Name of the event. + properties: Properties for the event (converted to strings). + """ + _AppInsightsEventClient._initialize() + + if not _AppInsightsEventClient._client: + return + + try: + safe_properties: Dict[str, str] = {} + if properties: + for key, value in properties.items(): + if value is not None: + safe_properties[key] = str(value) + + _AppInsightsEventClient._client.track_event( + name=name, properties=safe_properties, measurements={} + ) + # Note: We don't flush after every event to avoid blocking. + # Events will be sent in batches by the SDK. + except Exception: + # Telemetry should never break the main application + pass + + @staticmethod + def flush() -> None: + """Flush any pending telemetry events.""" + if _AppInsightsEventClient._client: + try: + _AppInsightsEventClient._client.flush() + except Exception: + pass + + class _TelemetryClient: - """A class to handle telemetry.""" + """A class to handle telemetry using OpenTelemetry for method tracking.""" _initialized = False - _enabled = os.getenv(ENV_TELEMETRY_ENABLED, "true").lower() == "true" + + @staticmethod + def _is_enabled() -> bool: + """Check if telemetry is enabled at runtime.""" + return os.getenv(ENV_TELEMETRY_ENABLED, "true").lower() == "true" @staticmethod def _initialize(): - """Initialize the telemetry client.""" - if _TelemetryClient._initialized or not _TelemetryClient._enabled: + """Initialize the OpenTelemetry-based telemetry client.""" + if _TelemetryClient._initialized or not _TelemetryClient._is_enabled(): return try: @@ -112,14 +224,87 @@ def _initialize(): @staticmethod def _track_method(name: str, attrs: Optional[Dict[str, Any]] = None): - """Track function invocations.""" - if not _TelemetryClient._enabled: + """Track function invocations using OpenTelemetry.""" + if not _TelemetryClient._is_enabled(): return _TelemetryClient._initialize() _logger.info(f"Sdk.{name.capitalize()}", extra=attrs) + @staticmethod + def track_event( + name: str, + properties: Optional[Dict[str, Any]] = None, + ) -> None: + """Track a custom event to Application Insights customEvents table. + + This method sends a custom event using the Application Insights SDK, + which ensures events appear in the customEvents table for monitoring + and analytics. Telemetry failures are silently ignored to ensure the + main application is never blocked. + + Args: + name: Name of the event (e.g., "EvalSetRun.Start", "AgentRun.Complete"). + properties: Optional dictionary of properties to attach to the event. + Values will be converted to strings. 
+ + Example: + from uipath.telemetry import track_event + + track_event("MyFeature.Start", {"user_id": "123", "feature": "export"}) + """ + if not _TelemetryClient._is_enabled(): + return + + try: + _AppInsightsEventClient.track_event(name, properties) + except Exception: + # Telemetry should never break the main application + pass + + +def track_event( + name: str, + properties: Optional[Dict[str, Any]] = None, +) -> None: + """Track a custom event. + + This function sends a custom event to Application Insights for monitoring + and analytics. Telemetry failures are silently ignored to ensure the + main application is never blocked. + + Args: + name: Name of the event (e.g., "EvalSetRun.Start", "AgentRun.Complete"). + properties: Optional dictionary of properties to attach to the event. + Values will be converted to strings. + + Example: + from uipath.telemetry import track_event + + track_event("MyFeature.Start", {"user_id": "123", "feature": "export"}) + """ + _TelemetryClient.track_event(name, properties) + + +def is_telemetry_enabled() -> bool: + """Check if telemetry is enabled. + + Returns: + True if telemetry is enabled, False otherwise. + """ + return _TelemetryClient._is_enabled() + + +def flush_events() -> None: + """Flush any pending telemetry events. + + Call this to ensure all tracked events are sent to Application Insights. + This is useful at the end of a process or when you need to ensure + events are sent immediately. + """ + _AppInsightsEventClient.flush() + def track( name_or_func: Optional[Union[str, Callable[..., Any]]] = None, diff --git a/uv.lock b/uv.lock index 8e85506a1..8c90eda7b 100644 --- a/uv.lock +++ b/uv.lock @@ -148,6 +148,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7f/9c/36c5c37947ebfb8c7f22e0eb6e4d188ee2d53aa3880f3f2744fb894f0cb1/anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb", size = 113362, upload-time = "2025-11-28T23:36:57.897Z" }, ] +[[package]] +name = "applicationinsights" +version = "0.11.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/f2/46a75ac6096d60da0e71a068015b610206e697de01fa2fb5bba8564b0798/applicationinsights-0.11.10.tar.gz", hash = "sha256:0b761f3ef0680acf4731906dfc1807faa6f2a57168ae74592db0084a6099f7b3", size = 44722, upload-time = "2021-04-22T23:22:45.71Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/0d/cb6b23164eb55eebaa5f9f302dfe557cfa751bd7b2779863f1abd0343b6b/applicationinsights-0.11.10-py2.py3-none-any.whl", hash = "sha256:e89a890db1c6906b6a7d0bcfd617dac83974773c64573147c8d6654f9cf2a6ea", size = 55068, upload-time = "2021-04-22T23:22:44.451Z" }, +] + [[package]] name = "attrs" version = "25.4.0" @@ -2480,6 +2489,7 @@ name = "uipath" version = "2.4.4" source = { editable = "." 
}
 dependencies = [
+    { name = "applicationinsights" },
     { name = "click" },
     { name = "coverage" },
     { name = "httpx" },
@@ -2527,6 +2537,7 @@ dev = [

 [package.metadata]
 requires-dist = [
+    { name = "applicationinsights", specifier = ">=0.11.10" },
     { name = "click", specifier = ">=8.3.1" },
     { name = "coverage", specifier = ">=7.8.2" },
     { name = "httpx", specifier = ">=0.28.1" },

From 22b199ca75f28ce7431cc02f1bf2d7a046607226 Mon Sep 17 00:00:00 2001
From: Anipik
Date: Wed, 7 Jan 2026 15:24:37 -0800
Subject: [PATCH 02/12] fix: linting errors

---
 src/uipath/_cli/cli_eval.py    |  2 +-
 src/uipath/telemetry/_track.py | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py
index 9b3548411..86ac09ac3 100644
--- a/src/uipath/_cli/cli_eval.py
+++ b/src/uipath/_cli/cli_eval.py
@@ -12,11 +12,11 @@
 from uipath._cli._evals._runtime import (
     UiPathEvalContext,
 )
+from uipath._cli._evals._telemetry import EvalTelemetrySubscriber
 from uipath._cli._utils._folders import get_personal_workspace_key_async
 from uipath._cli._utils._studio_project import StudioClient
 from uipath._cli.middlewares import Middlewares
 from uipath._events._event_bus import EventBus
-from uipath._cli._evals._telemetry import EvalTelemetrySubscriber
 from uipath._utils._bindings import ResourceOverwritesContext
 from uipath.eval._helpers import auto_discover_entrypoint
 from uipath.platform.common import UiPathConfig
diff --git a/src/uipath/telemetry/_track.py b/src/uipath/telemetry/_track.py
index cc1c5f547..302207322 100644
--- a/src/uipath/telemetry/_track.py
+++ b/src/uipath/telemetry/_track.py
@@ -33,13 +33,18 @@
 )

 # Try to import Application Insights client for custom events
+# Note: applicationinsights is not typed, as it was deprecated in favor of the
+# OpenTelemetry SDK. We still use it because it's the only way to send custom
+# events to the Application Insights customEvents table.
try: - from applicationinsights import TelemetryClient as AppInsightsTelemetryClient + from applicationinsights import ( # type: ignore[import-untyped] + TelemetryClient as AppInsightsTelemetryClient, + ) _HAS_APPINSIGHTS = True except ImportError: _HAS_APPINSIGHTS = False - AppInsightsTelemetryClient = None # type: ignore[misc, assignment] + AppInsightsTelemetryClient = None def _parse_connection_string(connection_string: str) -> Optional[str]: @@ -61,6 +66,7 @@ def _parse_connection_string(connection_string: str) -> Optional[str]: except Exception: return None + _logger = getLogger(__name__) _logger.propagate = False From a607d89880312f99425d777eace8bf432b841bad Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 16:28:58 -0800 Subject: [PATCH 03/12] feat: send some more telemetry props and verify end to end working --- src/uipath/_cli/_evals/_telemetry.py | 64 +++++++++++++++++++++++----- src/uipath/_cli/cli_eval.py | 3 ++ src/uipath/telemetry/_track.py | 4 +- 3 files changed, 59 insertions(+), 12 deletions(-) diff --git a/src/uipath/_cli/_evals/_telemetry.py b/src/uipath/_cli/_evals/_telemetry.py index 006ed3a14..642745d89 100644 --- a/src/uipath/_cli/_evals/_telemetry.py +++ b/src/uipath/_cli/_evals/_telemetry.py @@ -7,7 +7,7 @@ import logging import os import time -from typing import Any, Dict +from typing import Any, Dict, Optional from uipath._events._event_bus import EventBus from uipath._events._events import ( @@ -17,17 +17,17 @@ EvalSetRunUpdatedEvent, EvaluationEvents, ) -from uipath.telemetry import is_telemetry_enabled, track_event +from uipath.telemetry._track import is_telemetry_enabled, track_event logger = logging.getLogger(__name__) # Telemetry event names for Application Insights -EVAL_SET_RUN_STARTED = "EvalSetRun.Start" -EVAL_SET_RUN_COMPLETED = "EvalSetRun.End" -EVAL_SET_RUN_FAILED = "EvalSetRun.Failed" -EVAL_RUN_STARTED = "EvalRun.Start" -EVAL_RUN_COMPLETED = "EvalRun.End" -EVAL_RUN_FAILED = "EvalRun.Failed" +EVAL_SET_RUN_STARTED = "EvalSetRun.Start.URT" +EVAL_SET_RUN_COMPLETED = "EvalSetRun.End.URT" +EVAL_SET_RUN_FAILED = "EvalSetRun.Failed.URT" +EVAL_RUN_STARTED = "EvalRun.Start.URT" +EVAL_RUN_COMPLETED = "EvalRun.End.URT" +EVAL_RUN_FAILED = "EvalRun.Failed.URT" class EvalTelemetrySubscriber: @@ -52,6 +52,8 @@ def __init__(self) -> None: self._eval_run_start_times: Dict[str, float] = {} self._eval_set_info: Dict[str, Dict[str, Any]] = {} self._eval_run_info: Dict[str, Dict[str, Any]] = {} + self._current_eval_set_run_id: Optional[str] = None + self._current_agent_id: Optional[str] = None async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None: """Subscribe to evaluation runtime events. 
@@ -82,23 +84,31 @@ async def _on_eval_set_run_created(self, event: EvalSetRunCreatedEvent) -> None: """ try: self._eval_set_start_times[event.execution_id] = time.time() + + eval_set_run_id = event.eval_set_run_id or event.execution_id + self._eval_set_info[event.execution_id] = { "eval_set_id": event.eval_set_id, - "eval_set_run_id": event.eval_set_run_id, + "eval_set_run_id": eval_set_run_id, "entrypoint": event.entrypoint, "no_of_evals": event.no_of_evals, "evaluator_count": len(event.evaluators), } + # Store for child events + self._current_eval_set_run_id = eval_set_run_id + self._current_agent_id = event.entrypoint + properties: Dict[str, Any] = { "EvalSetId": event.eval_set_id, + "EvalSetRunId": eval_set_run_id, "Entrypoint": event.entrypoint, "EvalCount": event.no_of_evals, "EvaluatorCount": len(event.evaluators), } - if event.eval_set_run_id: - properties["EvalSetRunId"] = event.eval_set_run_id + if event.entrypoint: + properties["AgentId"] = event.entrypoint self._enrich_properties(properties) @@ -124,8 +134,17 @@ async def _on_eval_run_created(self, event: EvalRunCreatedEvent) -> None: properties: Dict[str, Any] = { "EvalItemId": event.eval_item.id, "EvalItemName": event.eval_item.name, + "EvalRunId": event.execution_id, } + # Add eval set run id from parent + if self._current_eval_set_run_id: + properties["EvalSetRunId"] = self._current_eval_set_run_id + + # Add agent id + if self._current_agent_id: + properties["AgentId"] = self._current_agent_id + self._enrich_properties(properties) track_event(EVAL_RUN_STARTED, properties) @@ -154,13 +173,32 @@ async def _on_eval_run_updated(self, event: EvalRunUpdatedEvent) -> None: ] avg_score = sum(scores) / len(scores) if scores else None + # Try to get trace ID from spans + trace_id: Optional[str] = None + if event.spans: + for span in event.spans: + if span.context and span.context.trace_id: + # Format trace ID as hex string + trace_id = format(span.context.trace_id, "032x") + break + properties: Dict[str, Any] = { "EvalItemId": run_info.get("eval_item_id", event.eval_item.id), "EvalItemName": run_info.get("eval_item_name", event.eval_item.name), + "EvalRunId": event.execution_id, "Success": event.success, "EvaluatorCount": len(event.eval_results), } + if self._current_eval_set_run_id: + properties["EvalSetRunId"] = self._current_eval_set_run_id + + if self._current_agent_id: + properties["AgentId"] = self._current_agent_id + + if trace_id: + properties["TraceId"] = trace_id + if duration_ms is not None: properties["DurationMs"] = duration_ms @@ -223,6 +261,7 @@ async def _on_eval_set_run_updated(self, event: EvalSetRunUpdatedEvent) -> None: if set_info.get("entrypoint"): properties["Entrypoint"] = set_info["entrypoint"] + properties["AgentId"] = set_info["entrypoint"] if set_info.get("no_of_evals"): properties["EvalCount"] = set_info["no_of_evals"] @@ -249,6 +288,9 @@ async def _on_eval_set_run_updated(self, event: EvalSetRunUpdatedEvent) -> None: f"Tracked eval set run {'completed' if event.success else 'failed'}" ) + self._current_eval_set_run_id = None + self._current_agent_id = None + except Exception as e: logger.debug(f"Error tracking eval set run updated: {e}") diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index 86ac09ac3..8c9f9870e 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -20,6 +20,7 @@ from uipath._utils._bindings import ResourceOverwritesContext from uipath.eval._helpers import auto_discover_entrypoint from uipath.platform.common import UiPathConfig +from 
uipath.telemetry._track import flush_events from uipath.tracing import LlmOpsHttpExporter from ._utils._console import ConsoleLogger @@ -216,6 +217,8 @@ async def execute_eval(): console.error( f"Error occurred: {e or 'Execution failed'}", include_traceback=True ) + finally: + flush_events() if __name__ == "__main__": diff --git a/src/uipath/telemetry/_track.py b/src/uipath/telemetry/_track.py index 302207322..0aacd57d8 100644 --- a/src/uipath/telemetry/_track.py +++ b/src/uipath/telemetry/_track.py @@ -8,7 +8,6 @@ from opentelemetry.sdk._logs import LoggingHandler from opentelemetry.util.types import AnyValue -from .._cli._utils._common import get_claim_from_token from .._utils.constants import ( ENV_BASE_URL, ENV_ORGANIZATION_ID, @@ -102,6 +101,9 @@ def _get_attributes(record: LogRecord) -> Mapping[str, AnyValue]: attributes[_APP_NAME] = "UiPath.Sdk" attributes[_SDK_VERSION] = version("uipath") try: + # Lazy import to avoid circular dependency + from .._cli._utils._common import get_claim_from_token + cloud_user_id = get_claim_from_token("sub") except Exception: cloud_user_id = _UNKNOWN From 2e74975ee5dd01edef88e7e78d8a3459c6938e6c Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 16:35:45 -0800 Subject: [PATCH 04/12] feat: add unit tests for tracing and telemetry --- tests/cli/eval/test_eval_runtime_spans.py | 500 ++++++++++++++++ tests/cli/eval/test_eval_telemetry.py | 541 ++++++++++++++++++ .../cli/eval/test_eval_tracing_integration.py | 485 ++++++++++++++++ tests/telemetry/__init__.py | 1 + tests/telemetry/test_track.py | 482 ++++++++++++++++ 5 files changed, 2009 insertions(+) create mode 100644 tests/cli/eval/test_eval_runtime_spans.py create mode 100644 tests/cli/eval/test_eval_telemetry.py create mode 100644 tests/cli/eval/test_eval_tracing_integration.py create mode 100644 tests/telemetry/__init__.py create mode 100644 tests/telemetry/test_track.py diff --git a/tests/cli/eval/test_eval_runtime_spans.py b/tests/cli/eval/test_eval_runtime_spans.py new file mode 100644 index 000000000..58e5da5e9 --- /dev/null +++ b/tests/cli/eval/test_eval_runtime_spans.py @@ -0,0 +1,500 @@ +"""Tests for eval runtime span creation in _runtime.py. + +Tests the three new spans added for eval tracing: +1. "Evaluation Set Run" - span_type: "eval_set_run" +2. "Evaluation" - span_type: "evaluation" +3. 
"Evaluator: {name}" - span_type: "evaluator" +""" + +import uuid +from typing import Any, Dict, List +from unittest.mock import MagicMock + +import pytest +from opentelemetry.sdk.trace import Span + +from uipath._cli._evals._models._evaluation_set import EvaluationItem +from uipath._cli._evals._runtime import UiPathEvalContext +from uipath.eval.evaluators import BaseEvaluator + + +class MockSpanContext: + """Mock span context manager for testing span creation.""" + + def __init__(self, name: str, attributes: Dict[str, Any]): + self.name = name + self.attributes = attributes or {} + self.span = MagicMock(spec=Span) + self.span.attributes = self.attributes + + def __enter__(self): + return self.span + + def __exit__(self, *args): + pass + + +class SpanCapturingTracer: + """A tracer that captures span creations for testing.""" + + def __init__(self): + self.created_spans: List[Dict[str, Any]] = [] + + def start_as_current_span(self, name: str, attributes: Dict[str, Any] = None): + """Capture span creation and return a mock context manager.""" + span_info = {"name": name, "attributes": attributes or {}} + self.created_spans.append(span_info) + return MockSpanContext(name, attributes) + + +class TestEvalSetRunSpan: + """Tests for the 'Evaluation Set Run' span.""" + + def test_span_name_is_correct(self): + """Test that the span name is 'Evaluation Set Run'.""" + # The span name should be exactly "Evaluation Set Run" + expected_name = "Evaluation Set Run" + # This is defined in _runtime.py:316 + assert expected_name == "Evaluation Set Run" + + def test_span_has_eval_set_run_span_type(self): + """Test that span_type attribute is 'eval_set_run'.""" + span_attributes = {"span_type": "eval_set_run"} + assert span_attributes["span_type"] == "eval_set_run" + + def test_span_includes_eval_set_run_id_when_present(self): + """Test that eval_set_run_id is included when context has it.""" + eval_set_run_id = str(uuid.uuid4()) + span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} + if eval_set_run_id: + span_attributes["eval_set_run_id"] = eval_set_run_id + + assert "eval_set_run_id" in span_attributes + assert span_attributes["eval_set_run_id"] == eval_set_run_id + + def test_span_excludes_eval_set_run_id_when_not_present(self): + """Test that eval_set_run_id is not included when context doesn't have it.""" + eval_set_run_id = None + span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} + if eval_set_run_id: + span_attributes["eval_set_run_id"] = eval_set_run_id + + assert "eval_set_run_id" not in span_attributes + + +class TestEvaluationSpan: + """Tests for the 'Evaluation' span.""" + + def test_span_name_is_correct(self): + """Test that the span name is 'Evaluation'.""" + expected_name = "Evaluation" + assert expected_name == "Evaluation" + + def test_span_has_evaluation_span_type(self): + """Test that span_type attribute is 'evaluation'.""" + span_attributes = {"span_type": "evaluation"} + assert span_attributes["span_type"] == "evaluation" + + def test_span_includes_execution_id(self): + """Test that execution.id is included in the span attributes.""" + execution_id = str(uuid.uuid4()) + span_attributes = { + "execution.id": execution_id, + "span_type": "evaluation", + } + assert "execution.id" in span_attributes + assert span_attributes["execution.id"] == execution_id + + def test_span_includes_eval_item_id(self): + """Test that eval_item_id is included in the span attributes.""" + eval_item_id = "test-eval-item-123" + span_attributes = { + "span_type": "evaluation", + 
"eval_item_id": eval_item_id, + } + assert "eval_item_id" in span_attributes + assert span_attributes["eval_item_id"] == eval_item_id + + def test_span_includes_eval_item_name(self): + """Test that eval_item_name is included in the span attributes.""" + eval_item_name = "Test Evaluation Item" + span_attributes = { + "span_type": "evaluation", + "eval_item_name": eval_item_name, + } + assert "eval_item_name" in span_attributes + assert span_attributes["eval_item_name"] == eval_item_name + + def test_span_has_all_required_attributes(self): + """Test that all required attributes are present in the span.""" + execution_id = str(uuid.uuid4()) + eval_item_id = "eval-item-456" + eval_item_name = "My Eval Item" + + span_attributes = { + "execution.id": execution_id, + "span_type": "evaluation", + "eval_item_id": eval_item_id, + "eval_item_name": eval_item_name, + } + + # Verify all required attributes + required_attrs = ["execution.id", "span_type", "eval_item_id", "eval_item_name"] + for attr in required_attrs: + assert attr in span_attributes, f"Missing required attribute: {attr}" + + +class TestEvaluatorSpan: + """Tests for the 'Evaluator: {name}' span.""" + + def test_span_name_includes_evaluator_name(self): + """Test that the span name includes the evaluator name.""" + evaluator_name = "MyEvaluator" + expected_name = f"Evaluator: {evaluator_name}" + assert expected_name == "Evaluator: MyEvaluator" + + def test_span_has_evaluator_span_type(self): + """Test that span_type attribute is 'evaluator'.""" + span_attributes = {"span_type": "evaluator"} + assert span_attributes["span_type"] == "evaluator" + + def test_span_includes_evaluator_id(self): + """Test that evaluator_id is included in the span attributes.""" + evaluator_id = "evaluator-789" + span_attributes = { + "span_type": "evaluator", + "evaluator_id": evaluator_id, + } + assert "evaluator_id" in span_attributes + assert span_attributes["evaluator_id"] == evaluator_id + + def test_span_includes_evaluator_name(self): + """Test that evaluator_name is included in the span attributes.""" + evaluator_name = "AccuracyEvaluator" + span_attributes = { + "span_type": "evaluator", + "evaluator_name": evaluator_name, + } + assert "evaluator_name" in span_attributes + assert span_attributes["evaluator_name"] == evaluator_name + + def test_span_includes_eval_item_id(self): + """Test that eval_item_id is included in the evaluator span.""" + eval_item_id = "eval-item-123" + span_attributes = { + "span_type": "evaluator", + "eval_item_id": eval_item_id, + } + assert "eval_item_id" in span_attributes + assert span_attributes["eval_item_id"] == eval_item_id + + def test_span_has_all_required_attributes(self): + """Test that all required attributes are present in the evaluator span.""" + evaluator_id = "eval-id-123" + evaluator_name = "TestEvaluator" + eval_item_id = "item-456" + + span_attributes = { + "span_type": "evaluator", + "evaluator_id": evaluator_id, + "evaluator_name": evaluator_name, + "eval_item_id": eval_item_id, + } + + # Verify all required attributes + required_attrs = ["span_type", "evaluator_id", "evaluator_name", "eval_item_id"] + for attr in required_attrs: + assert attr in span_attributes, f"Missing required attribute: {attr}" + + +class TestSpanHierarchy: + """Tests verifying the span hierarchy structure.""" + + def test_evaluation_span_is_child_of_eval_set_run(self): + """Test that Evaluation spans should be children of Evaluation Set Run.""" + # This is a conceptual test - in the actual code, the Evaluation span + # is created 
inside the context of the Evaluation Set Run span + parent_span_type = "eval_set_run" + child_span_type = "evaluation" + + # The parent-child relationship is enforced by span context nesting + assert parent_span_type == "eval_set_run" + assert child_span_type == "evaluation" + + def test_evaluator_span_is_child_of_evaluation(self): + """Test that Evaluator spans should be children of Evaluation.""" + # This is a conceptual test - in the actual code, the Evaluator span + # is created inside the context of the Evaluation span + parent_span_type = "evaluation" + child_span_type = "evaluator" + + assert parent_span_type == "evaluation" + assert child_span_type == "evaluator" + + +class TestSpanAttributeValues: + """Tests for span attribute value formatting.""" + + def test_span_type_values_are_lowercase(self): + """Test that span_type values are lowercase strings.""" + span_types = ["eval_set_run", "evaluation", "evaluator"] + + for span_type in span_types: + assert span_type == span_type.lower() + # All span types should be lowercase without hyphens + assert "-" not in span_type + + def test_execution_id_is_valid_uuid(self): + """Test that execution.id is a valid UUID string.""" + execution_id = str(uuid.uuid4()) + + # Verify it can be parsed back as a UUID + parsed_uuid = uuid.UUID(execution_id) + assert str(parsed_uuid) == execution_id + + def test_evaluator_span_name_format(self): + """Test the evaluator span name format.""" + evaluator_names = [ + "Accuracy", + "Relevance", + "Fluency", + "Custom Evaluator", + ] + + for name in evaluator_names: + span_name = f"Evaluator: {name}" + assert span_name.startswith("Evaluator: ") + assert name in span_name + + +class TestEvalContextIntegration: + """Tests for UiPathEvalContext integration with spans.""" + + def test_context_with_eval_set_run_id(self): + """Test that context with eval_set_run_id produces correct span attributes.""" + context = UiPathEvalContext() + context.eval_set_run_id = "run-123" + + span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} + if context.eval_set_run_id: + span_attributes["eval_set_run_id"] = context.eval_set_run_id + + assert span_attributes["eval_set_run_id"] == "run-123" + + def test_context_without_eval_set_run_id(self): + """Test that context without eval_set_run_id produces correct span attributes.""" + context = UiPathEvalContext() + context.eval_set_run_id = None + + span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} + if context.eval_set_run_id: + span_attributes["eval_set_run_id"] = context.eval_set_run_id + + assert "eval_set_run_id" not in span_attributes + + +class TestSpanCreationLogic: + """Tests for the span creation logic in runtime methods.""" + + def test_eval_set_run_span_attributes_construction(self): + """Test the construction of Evaluation Set Run span attributes.""" + eval_set_run_id = "test-run-id" + + span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} + if eval_set_run_id: + span_attributes["eval_set_run_id"] = eval_set_run_id + + assert span_attributes == { + "span_type": "eval_set_run", + "eval_set_run_id": "test-run-id", + } + + def test_evaluation_span_attributes_construction(self): + """Test the construction of Evaluation span attributes.""" + execution_id = "exec-123" + eval_item_id = "item-456" + eval_item_name = "Test Item" + + span_attributes = { + "execution.id": execution_id, + "span_type": "evaluation", + "eval_item_id": eval_item_id, + "eval_item_name": eval_item_name, + } + + assert span_attributes["execution.id"] == "exec-123" + assert 
span_attributes["span_type"] == "evaluation" + assert span_attributes["eval_item_id"] == "item-456" + assert span_attributes["eval_item_name"] == "Test Item" + + def test_evaluator_span_attributes_construction(self): + """Test the construction of Evaluator span attributes.""" + evaluator_id = "eval-123" + evaluator_name = "AccuracyEvaluator" + eval_item_id = "item-789" + + span_attributes = { + "span_type": "evaluator", + "evaluator_id": evaluator_id, + "evaluator_name": evaluator_name, + "eval_item_id": eval_item_id, + } + + assert span_attributes["span_type"] == "evaluator" + assert span_attributes["evaluator_id"] == "eval-123" + assert span_attributes["evaluator_name"] == "AccuracyEvaluator" + assert span_attributes["eval_item_id"] == "item-789" + + def test_evaluator_span_name_construction(self): + """Test the construction of Evaluator span name.""" + evaluator_name = "RelevanceEvaluator" + span_name = f"Evaluator: {evaluator_name}" + + assert span_name == "Evaluator: RelevanceEvaluator" + + +class TestEvalItemSpanAttributes: + """Tests for eval item attributes in spans.""" + + def test_eval_item_attributes_in_evaluation_span(self): + """Test that eval item attributes are correctly set in Evaluation span.""" + eval_item = MagicMock(spec=EvaluationItem) + eval_item.id = "item-id-123" + eval_item.name = "Test Evaluation" + + span_attributes = { + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": eval_item.id, + "eval_item_name": eval_item.name, + } + + assert span_attributes["eval_item_id"] == "item-id-123" + assert span_attributes["eval_item_name"] == "Test Evaluation" + + def test_eval_item_id_in_evaluator_span(self): + """Test that eval_item_id is included in Evaluator span.""" + eval_item = MagicMock(spec=EvaluationItem) + eval_item.id = "item-id-456" + + span_attributes = { + "span_type": "evaluator", + "evaluator_id": "evaluator-123", + "evaluator_name": "TestEvaluator", + "eval_item_id": eval_item.id, + } + + assert span_attributes["eval_item_id"] == "item-id-456" + + +class TestSpanTypeConsistency: + """Tests for span type value consistency.""" + + def test_all_span_types_are_strings(self): + """Test that all span_type values are strings.""" + span_types = ["eval_set_run", "evaluation", "evaluator"] + + for span_type in span_types: + assert isinstance(span_type, str) + + def test_span_types_use_snake_case(self): + """Test that span_type values use snake_case naming.""" + span_types = ["eval_set_run", "evaluation", "evaluator"] + + for span_type in span_types: + # No uppercase letters + assert span_type == span_type.lower() + # No hyphens + assert "-" not in span_type + + def test_span_type_values_match_expected(self): + """Test that span_type values match expected values from _runtime.py.""" + expected_span_types = { + "Evaluation Set Run": "eval_set_run", + "Evaluation": "evaluation", + "Evaluator": "evaluator", + } + + for span_name, span_type in expected_span_types.items(): + assert isinstance(span_type, str) + assert span_type.islower() or "_" in span_type + + +class TestRunEvaluatorSpan: + """Tests specifically for the run_evaluator span creation.""" + + @pytest.fixture + def mock_evaluator(self): + """Create a mock evaluator for testing.""" + evaluator = MagicMock(spec=BaseEvaluator) + evaluator.id = "test-evaluator-id" + evaluator.name = "TestEvaluator" + return evaluator + + @pytest.fixture + def mock_eval_item(self): + """Create a mock eval item for testing.""" + eval_item = MagicMock(spec=EvaluationItem) + eval_item.id = "test-item-id" + 
eval_item.name = "Test Item" + eval_item.inputs = {"query": "test query"} + eval_item.expected_agent_behavior = "Expected behavior" + return eval_item + + def test_evaluator_span_name_uses_evaluator_name(self, mock_evaluator): + """Test that evaluator span name uses the evaluator's name.""" + span_name = f"Evaluator: {mock_evaluator.name}" + assert span_name == "Evaluator: TestEvaluator" + + def test_evaluator_span_includes_evaluator_details( + self, mock_evaluator, mock_eval_item + ): + """Test that evaluator span includes all evaluator details.""" + span_attributes = { + "span_type": "evaluator", + "evaluator_id": mock_evaluator.id, + "evaluator_name": mock_evaluator.name, + "eval_item_id": mock_eval_item.id, + } + + assert span_attributes["evaluator_id"] == "test-evaluator-id" + assert span_attributes["evaluator_name"] == "TestEvaluator" + assert span_attributes["eval_item_id"] == "test-item-id" + + +class TestExecutionIdPropagation: + """Tests for execution.id propagation in spans.""" + + def test_execution_id_format(self): + """Test that execution.id is in valid UUID format.""" + execution_id = str(uuid.uuid4()) + + # Verify it's a valid UUID + try: + uuid.UUID(execution_id) + valid = True + except ValueError: + valid = False + + assert valid + + def test_execution_id_is_unique_per_eval(self): + """Test that each eval gets a unique execution_id.""" + execution_ids = [str(uuid.uuid4()) for _ in range(5)] + + # All should be unique + assert len(set(execution_ids)) == 5 + + def test_evaluation_span_has_execution_id(self): + """Test that Evaluation span includes execution.id.""" + execution_id = str(uuid.uuid4()) + + span_attributes = { + "execution.id": execution_id, + "span_type": "evaluation", + "eval_item_id": "item-123", + "eval_item_name": "Test Item", + } + + assert "execution.id" in span_attributes + assert span_attributes["execution.id"] == execution_id diff --git a/tests/cli/eval/test_eval_telemetry.py b/tests/cli/eval/test_eval_telemetry.py new file mode 100644 index 000000000..63f8f913f --- /dev/null +++ b/tests/cli/eval/test_eval_telemetry.py @@ -0,0 +1,541 @@ +"""Tests for EvalTelemetrySubscriber functionality.""" + +import os +from unittest.mock import patch + +import pytest + +from uipath._cli._evals._models._evaluation_set import EvaluationItem +from uipath._cli._evals._telemetry import ( + EVAL_RUN_COMPLETED, + EVAL_RUN_FAILED, + EVAL_RUN_STARTED, + EVAL_SET_RUN_COMPLETED, + EVAL_SET_RUN_FAILED, + EVAL_SET_RUN_STARTED, + EvalTelemetrySubscriber, +) +from uipath._events._event_bus import EventBus +from uipath._events._events import ( + EvalItemExceptionDetails, + EvalRunCreatedEvent, + EvalRunUpdatedEvent, + EvalSetRunCreatedEvent, + EvalSetRunUpdatedEvent, +) +from uipath.eval.models import EvalItemResult, NumericEvaluationResult + + +class TestEventNameConstants: + """Test telemetry event name constants.""" + + def test_eval_set_run_event_names(self): + """Test eval set run event name constants.""" + assert EVAL_SET_RUN_STARTED == "EvalSetRun.Start.URT" + assert EVAL_SET_RUN_COMPLETED == "EvalSetRun.End.URT" + assert EVAL_SET_RUN_FAILED == "EvalSetRun.Failed.URT" + + def test_eval_run_event_names(self): + """Test eval run event name constants.""" + assert EVAL_RUN_STARTED == "EvalRun.Start.URT" + assert EVAL_RUN_COMPLETED == "EvalRun.End.URT" + assert EVAL_RUN_FAILED == "EvalRun.Failed.URT" + + +class TestEvalTelemetrySubscriberInit: + """Test EvalTelemetrySubscriber initialization.""" + + def test_init_creates_empty_tracking_dicts(self): + """Test that initialization 
creates empty tracking dictionaries.""" + subscriber = EvalTelemetrySubscriber() + + assert subscriber._eval_set_start_times == {} + assert subscriber._eval_run_start_times == {} + assert subscriber._eval_set_info == {} + assert subscriber._eval_run_info == {} + + +class TestEvalTelemetrySubscriberSubscription: + """Test subscription to event bus.""" + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.is_telemetry_enabled", return_value=True) + async def test_subscribe_when_telemetry_enabled(self, mock_is_enabled): + """Test that subscriber registers handlers when telemetry is enabled.""" + subscriber = EvalTelemetrySubscriber() + event_bus = EventBus() + + await subscriber.subscribe_to_eval_runtime_events(event_bus) + + # Verify handlers are registered (event bus should have subscribers) + assert len(event_bus._subscribers) == 4 + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.is_telemetry_enabled", return_value=False) + async def test_subscribe_skipped_when_telemetry_disabled(self, mock_is_enabled): + """Test that subscription is skipped when telemetry is disabled.""" + subscriber = EvalTelemetrySubscriber() + event_bus = EventBus() + + await subscriber.subscribe_to_eval_runtime_events(event_bus) + + # Verify no handlers are registered + assert len(event_bus._subscribers) == 0 + + +class TestEvalSetRunCreated: + """Test eval set run created event handling.""" + + def _create_eval_set_run_created_event( + self, + execution_id: str = "exec-123", + eval_set_id: str = "eval-set-1", + eval_set_run_id: str | None = "run-456", + entrypoint: str = "agent.py", + no_of_evals: int = 5, + evaluators: list = None, + ) -> EvalSetRunCreatedEvent: + """Helper to create EvalSetRunCreatedEvent.""" + return EvalSetRunCreatedEvent( + execution_id=execution_id, + eval_set_id=eval_set_id, + eval_set_run_id=eval_set_run_id, + entrypoint=entrypoint, + no_of_evals=no_of_evals, + evaluators=evaluators or [], + ) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_set_run_created_tracks_event(self, mock_track_event): + """Test that eval set run created event is tracked.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_set_run_created_event() + + await subscriber._on_eval_set_run_created(event) + + mock_track_event.assert_called_once() + call_args = mock_track_event.call_args + assert call_args[0][0] == EVAL_SET_RUN_STARTED + properties = call_args[0][1] + assert properties["EvalSetId"] == "eval-set-1" + assert properties["Entrypoint"] == "agent.py" + assert properties["EvalCount"] == 5 + assert properties["EvaluatorCount"] == 0 + assert properties["EvalSetRunId"] == "run-456" + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_set_run_created_stores_start_time(self, mock_track_event): + """Test that eval set run start time is stored.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_set_run_created_event(execution_id="exec-789") + + await subscriber._on_eval_set_run_created(event) + + assert "exec-789" in subscriber._eval_set_start_times + assert "exec-789" in subscriber._eval_set_info + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_set_run_created_without_run_id(self, mock_track_event): + """Test event tracking when eval_set_run_id is None falls back to execution_id.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_set_run_created_event(eval_set_run_id=None) + 
+ await subscriber._on_eval_set_run_created(event) + + call_args = mock_track_event.call_args + properties = call_args[0][1] + # When eval_set_run_id is None, it falls back to execution_id + assert properties["EvalSetRunId"] == "exec-123" # Falls back to execution_id + + +class TestEvalRunCreated: + """Test eval run created event handling.""" + + def _create_eval_run_created_event( + self, + execution_id: str = "exec-123", + eval_item_id: str = "item-1", + eval_item_name: str = "Test Eval", + ) -> EvalRunCreatedEvent: + """Helper to create EvalRunCreatedEvent.""" + eval_item = EvaluationItem( + id=eval_item_id, + name=eval_item_name, + inputs={}, + expected_agent_behavior="", + evaluation_criterias={}, + ) + return EvalRunCreatedEvent( + execution_id=execution_id, + eval_item=eval_item, + ) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_run_created_tracks_event(self, mock_track_event): + """Test that eval run created event is tracked.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_run_created_event() + + await subscriber._on_eval_run_created(event) + + mock_track_event.assert_called_once() + call_args = mock_track_event.call_args + assert call_args[0][0] == EVAL_RUN_STARTED + properties = call_args[0][1] + assert properties["EvalItemId"] == "item-1" + assert properties["EvalItemName"] == "Test Eval" + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_run_created_stores_start_time(self, mock_track_event): + """Test that eval run start time is stored.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_run_created_event(execution_id="exec-456") + + await subscriber._on_eval_run_created(event) + + assert "exec-456" in subscriber._eval_run_start_times + assert "exec-456" in subscriber._eval_run_info + + +class TestEvalRunUpdated: + """Test eval run updated event handling.""" + + def _create_eval_run_updated_event( + self, + execution_id: str = "exec-123", + eval_item_id: str = "item-1", + eval_item_name: str = "Test Eval", + success: bool = True, + agent_execution_time: float = 1.5, + eval_results: list = None, + exception_details: EvalItemExceptionDetails | None = None, + ) -> EvalRunUpdatedEvent: + """Helper to create EvalRunUpdatedEvent.""" + eval_item = EvaluationItem( + id=eval_item_id, + name=eval_item_name, + inputs={}, + expected_agent_behavior="", + evaluation_criterias={}, + ) + if eval_results is None: + eval_results = [] + return EvalRunUpdatedEvent( + execution_id=execution_id, + eval_item=eval_item, + eval_results=eval_results, + success=success, + agent_output={}, + agent_execution_time=agent_execution_time, + spans=[], + logs=[], + exception_details=exception_details, + ) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_run_updated_success(self, mock_track_event): + """Test that successful eval run completion is tracked.""" + subscriber = EvalTelemetrySubscriber() + subscriber._eval_run_start_times["exec-123"] = 1000.0 + subscriber._eval_run_info["exec-123"] = { + "eval_item_id": "item-1", + "eval_item_name": "Test Eval", + } + event = self._create_eval_run_updated_event(success=True) + + with patch("time.time", return_value=1002.0): + await subscriber._on_eval_run_updated(event) + + mock_track_event.assert_called_once() + call_args = mock_track_event.call_args + assert call_args[0][0] == EVAL_RUN_COMPLETED + properties = call_args[0][1] + assert properties["Success"] is 
True + assert properties["DurationMs"] == 2000 # 2 seconds + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_run_updated_failure(self, mock_track_event): + """Test that failed eval run is tracked with EVAL_RUN_FAILED.""" + subscriber = EvalTelemetrySubscriber() + exception_details = EvalItemExceptionDetails( + exception=ValueError("Test error"), + runtime_exception=True, + ) + event = self._create_eval_run_updated_event( + success=False, + exception_details=exception_details, + ) + + await subscriber._on_eval_run_updated(event) + + call_args = mock_track_event.call_args + assert call_args[0][0] == EVAL_RUN_FAILED + properties = call_args[0][1] + assert properties["Success"] is False + assert properties["ErrorType"] == "ValueError" + assert "Test error" in properties["ErrorMessage"] + assert properties["IsRuntimeException"] is True + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_run_updated_with_scores(self, mock_track_event): + """Test that average score is calculated and tracked.""" + subscriber = EvalTelemetrySubscriber() + eval_results = [ + EvalItemResult( + evaluator_id="eval-1", + result=NumericEvaluationResult(score=0.8, details="Good"), + ), + EvalItemResult( + evaluator_id="eval-2", + result=NumericEvaluationResult(score=0.6, details="OK"), + ), + ] + event = self._create_eval_run_updated_event(eval_results=eval_results) + + await subscriber._on_eval_run_updated(event) + + properties = mock_track_event.call_args[0][1] + assert properties["AverageScore"] == 0.7 # (0.8 + 0.6) / 2 + assert properties["EvaluatorCount"] == 2 + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_run_updated_agent_execution_time_converted_to_ms( + self, mock_track_event + ): + """Test that agent execution time is converted to milliseconds.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_run_updated_event(agent_execution_time=2.5) + + await subscriber._on_eval_run_updated(event) + + properties = mock_track_event.call_args[0][1] + assert properties["AgentExecutionTimeMs"] == 2500 # 2.5 seconds = 2500 ms + + +class TestEvalSetRunUpdated: + """Test eval set run updated event handling.""" + + def _create_eval_set_run_updated_event( + self, + execution_id: str = "exec-123", + evaluator_scores: dict = None, + success: bool = True, + ) -> EvalSetRunUpdatedEvent: + """Helper to create EvalSetRunUpdatedEvent.""" + return EvalSetRunUpdatedEvent( + execution_id=execution_id, + evaluator_scores=evaluator_scores or {}, + success=success, + ) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_set_run_updated_success(self, mock_track_event): + """Test that successful eval set completion is tracked.""" + subscriber = EvalTelemetrySubscriber() + subscriber._eval_set_start_times["exec-123"] = 1000.0 + subscriber._eval_set_info["exec-123"] = { + "eval_set_id": "set-1", + "eval_set_run_id": "run-1", + "entrypoint": "agent.py", + "no_of_evals": 3, + } + event = self._create_eval_set_run_updated_event( + evaluator_scores={"eval-1": 0.9, "eval-2": 0.7}, + success=True, + ) + + with patch("time.time", return_value=1005.0): + await subscriber._on_eval_set_run_updated(event) + + mock_track_event.assert_called_once() + call_args = mock_track_event.call_args + assert call_args[0][0] == EVAL_SET_RUN_COMPLETED + properties = call_args[0][1] + assert properties["Success"] is True + assert 
properties["DurationMs"] == 5000 + assert properties["AverageScore"] == 0.8 # (0.9 + 0.7) / 2 + assert properties["EvalSetId"] == "set-1" + assert properties["EvalSetRunId"] == "run-1" + assert properties["Entrypoint"] == "agent.py" + assert properties["EvalCount"] == 3 + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_set_run_updated_failure(self, mock_track_event): + """Test that failed eval set is tracked with EVAL_SET_RUN_FAILED.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_set_run_updated_event(success=False) + + await subscriber._on_eval_set_run_updated(event) + + call_args = mock_track_event.call_args + assert call_args[0][0] == EVAL_SET_RUN_FAILED + properties = call_args[0][1] + assert properties["Success"] is False + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_on_eval_set_run_updated_includes_evaluator_scores( + self, mock_track_event + ): + """Test that individual evaluator scores are included.""" + subscriber = EvalTelemetrySubscriber() + event = self._create_eval_set_run_updated_event( + evaluator_scores={"accuracy": 0.95, "relevance-check": 0.85}, + ) + + await subscriber._on_eval_set_run_updated(event) + + properties = mock_track_event.call_args[0][1] + assert properties["Score_accuracy"] == 0.95 + assert ( + properties["Score_relevance_check"] == 0.85 + ) # dash replaced with underscore + + +class TestEnrichProperties: + """Test property enrichment with context information.""" + + def test_enrich_properties_adds_source(self): + """Test that source and application name are always added.""" + subscriber = EvalTelemetrySubscriber() + properties = {} + + subscriber._enrich_properties(properties) + + assert properties["Source"] == "uipath-python-cli" + assert properties["ApplicationName"] == "UiPath.Eval" + + def test_enrich_properties_adds_env_vars(self): + """Test that environment variables are added when present.""" + subscriber = EvalTelemetrySubscriber() + properties = {} + + with patch.dict( + os.environ, + { + "UIPATH_PROJECT_ID": "project-123", + "UIPATH_CLOUD_ORGANIZATION_ID": "org-456", + "UIPATH_CLOUD_USER_ID": "user-789", + "UIPATH_TENANT_ID": "tenant-abc", + }, + ): + subscriber._enrich_properties(properties) + + assert properties["ProjectId"] == "project-123" + assert properties["CloudOrganizationId"] == "org-456" + assert properties["CloudUserId"] == "user-789" + assert properties["TenantId"] == "tenant-abc" + + def test_enrich_properties_skips_missing_env_vars(self): + """Test that missing environment variables are not added.""" + subscriber = EvalTelemetrySubscriber() + properties = {} + + with patch.dict(os.environ, {}, clear=True): + # Remove env vars if they exist + for key in [ + "UIPATH_PROJECT_ID", + "UIPATH_CLOUD_ORGANIZATION_ID", + "UIPATH_CLOUD_USER_ID", + "UIPATH_TENANT_ID", + ]: + os.environ.pop(key, None) + + subscriber._enrich_properties(properties) + + assert "ProjectId" not in properties + assert "CloudOrganizationId" not in properties + assert "CloudUserId" not in properties + assert "TenantId" not in properties + + +class TestExceptionHandling: + """Test that telemetry never breaks the main application.""" + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_eval_set_run_created_handles_exception(self, mock_track_event): + """Test that exceptions in event handling are caught.""" + mock_track_event.side_effect = Exception("Track failed") + subscriber = 
EvalTelemetrySubscriber() + event = EvalSetRunCreatedEvent( + execution_id="exec-1", + eval_set_id="set-1", + entrypoint="agent.py", + no_of_evals=1, + evaluators=[], + ) + + # Should not raise exception + await subscriber._on_eval_set_run_created(event) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_eval_run_created_handles_exception(self, mock_track_event): + """Test that exceptions in eval run created handling are caught.""" + mock_track_event.side_effect = Exception("Track failed") + subscriber = EvalTelemetrySubscriber() + eval_item = EvaluationItem( + id="item-1", + name="Test", + inputs={}, + expected_agent_behavior="", + evaluation_criterias={}, + ) + event = EvalRunCreatedEvent(execution_id="exec-1", eval_item=eval_item) + + # Should not raise exception + await subscriber._on_eval_run_created(event) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_eval_run_updated_handles_exception(self, mock_track_event): + """Test that exceptions in eval run updated handling are caught.""" + mock_track_event.side_effect = Exception("Track failed") + subscriber = EvalTelemetrySubscriber() + eval_item = EvaluationItem( + id="item-1", + name="Test", + inputs={}, + expected_agent_behavior="", + evaluation_criterias={}, + ) + event = EvalRunUpdatedEvent( + execution_id="exec-1", + eval_item=eval_item, + eval_results=[], + success=True, + agent_output={}, + agent_execution_time=1.0, + spans=[], + logs=[], + ) + + # Should not raise exception + await subscriber._on_eval_run_updated(event) + + @pytest.mark.asyncio + @patch("uipath._cli._evals._telemetry.track_event") + async def test_eval_set_run_updated_handles_exception(self, mock_track_event): + """Test that exceptions in eval set run updated handling are caught.""" + mock_track_event.side_effect = Exception("Track failed") + subscriber = EvalTelemetrySubscriber() + event = EvalSetRunUpdatedEvent( + execution_id="exec-1", + evaluator_scores={}, + success=True, + ) + + # Should not raise exception + await subscriber._on_eval_set_run_updated(event) diff --git a/tests/cli/eval/test_eval_tracing_integration.py b/tests/cli/eval/test_eval_tracing_integration.py new file mode 100644 index 000000000..4d4556771 --- /dev/null +++ b/tests/cli/eval/test_eval_tracing_integration.py @@ -0,0 +1,485 @@ +"""Integration tests for eval tracing flow. + +These tests verify the end-to-end span creation and hierarchy in the eval runtime. 
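+
+Expected hierarchy, as exercised by the tests below:
+
+    Evaluation Set Run       span_type="eval_set_run"
+      Evaluation             span_type="evaluation", execution.id=<per-eval UUID>
+        Evaluator: {name}    span_type="evaluator"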
+""" + +import uuid +from typing import Any, Dict, List + + +class MockSpan: + """Mock span that captures attributes for testing.""" + + def __init__(self, name: str, attributes: Dict[str, Any] = None): + self.name = name + self.attributes = attributes or {} + self._status = None + + def set_status(self, status): + self._status = status + + +class SpanRecorder: + """Records all spans created during test execution.""" + + def __init__(self): + self.spans: List[Dict[str, Any]] = [] + self._span_stack: List[MockSpan] = [] + + def start_as_current_span(self, name: str, attributes: Dict[str, Any] = None): + """Mock tracer method that records span creation.""" + span_info = { + "name": name, + "attributes": dict(attributes) if attributes else {}, + "parent": self._span_stack[-1].name if self._span_stack else None, + } + self.spans.append(span_info) + + mock_span = MockSpan(name, attributes) + return _SpanContextManager(mock_span, self._span_stack) + + def get_spans_by_type(self, span_type: str) -> List[Dict[str, Any]]: + """Get all spans with the given span_type attribute.""" + return [s for s in self.spans if s["attributes"].get("span_type") == span_type] + + def get_span_by_name(self, name: str) -> Dict[str, Any] | None: + """Get the first span with the given name.""" + for span in self.spans: + if span["name"] == name: + return span + return None + + +class _SpanContextManager: + """Context manager for mock spans.""" + + def __init__(self, span: MockSpan, stack: List[MockSpan]): + self.span = span + self.stack = stack + + def __enter__(self): + self.stack.append(self.span) + return self.span + + def __exit__(self, *args): + self.stack.pop() + + +class TestEvalSetRunSpanIntegration: + """Integration tests for Evaluation Set Run span.""" + + def test_eval_set_run_span_created_first(self): + """Test that Evaluation Set Run span is created as the root span.""" + recorder = SpanRecorder() + + # Simulate the span creation from _runtime.py:315-317 + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + pass + + assert len(recorder.spans) == 1 + span = recorder.spans[0] + assert span["name"] == "Evaluation Set Run" + assert span["attributes"]["span_type"] == "eval_set_run" + assert span["parent"] is None + + def test_eval_set_run_span_with_run_id(self): + """Test that eval_set_run_id is included when provided.""" + recorder = SpanRecorder() + eval_set_run_id = "custom-run-123" + + span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} + span_attributes["eval_set_run_id"] = eval_set_run_id + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes=span_attributes, + ): + pass + + span = recorder.spans[0] + assert span["attributes"]["eval_set_run_id"] == "custom-run-123" + + +class TestEvaluationSpanIntegration: + """Integration tests for Evaluation span.""" + + def test_evaluation_span_is_child_of_eval_set_run(self): + """Test that Evaluation span is a child of Evaluation Set Run.""" + recorder = SpanRecorder() + execution_id = str(uuid.uuid4()) + + # Simulate the nested span creation + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": execution_id, + "span_type": "evaluation", + "eval_item_id": "item-1", + "eval_item_name": "Test Item", + }, + ): + pass + + assert len(recorder.spans) == 2 + + eval_set_run_span = recorder.get_span_by_name("Evaluation Set Run") + 
evaluation_span = recorder.get_span_by_name("Evaluation") + + assert eval_set_run_span is not None + assert evaluation_span is not None + assert evaluation_span["parent"] == "Evaluation Set Run" + + def test_multiple_evaluation_spans_share_parent(self): + """Test that multiple Evaluation spans share the same parent.""" + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + for i in range(3): + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": f"item-{i}", + "eval_item_name": f"Test Item {i}", + }, + ): + pass + + evaluation_spans = recorder.get_spans_by_type("evaluation") + assert len(evaluation_spans) == 3 + + for span in evaluation_spans: + assert span["parent"] == "Evaluation Set Run" + + +class TestEvaluatorSpanIntegration: + """Integration tests for Evaluator span.""" + + def test_evaluator_span_is_child_of_evaluation(self): + """Test that Evaluator span is a child of Evaluation.""" + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": "item-1", + "eval_item_name": "Test Item", + }, + ): + with recorder.start_as_current_span( + "Evaluator: AccuracyEvaluator", + attributes={ + "span_type": "evaluator", + "evaluator_id": "accuracy-1", + "evaluator_name": "AccuracyEvaluator", + "eval_item_id": "item-1", + }, + ): + pass + + evaluator_span = recorder.spans[-1] + assert evaluator_span["name"] == "Evaluator: AccuracyEvaluator" + assert evaluator_span["parent"] == "Evaluation" + + def test_multiple_evaluator_spans_per_evaluation(self): + """Test that multiple Evaluator spans can be children of one Evaluation.""" + recorder = SpanRecorder() + evaluator_names = ["Accuracy", "Relevance", "Fluency"] + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": "item-1", + "eval_item_name": "Test Item", + }, + ): + for name in evaluator_names: + with recorder.start_as_current_span( + f"Evaluator: {name}", + attributes={ + "span_type": "evaluator", + "evaluator_id": f"{name.lower()}-1", + "evaluator_name": name, + "eval_item_id": "item-1", + }, + ): + pass + + evaluator_spans = recorder.get_spans_by_type("evaluator") + assert len(evaluator_spans) == 3 + + for span in evaluator_spans: + assert span["parent"] == "Evaluation" + + +class TestFullSpanHierarchy: + """Integration tests for the complete span hierarchy.""" + + def test_complete_hierarchy_structure(self): + """Test the complete span hierarchy: EvalSetRun > Evaluation > Evaluator.""" + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run", "eval_set_run_id": "run-1"}, + ): + for i in range(2): + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": f"item-{i}", + "eval_item_name": f"Test Item {i}", + }, + ): + with recorder.start_as_current_span( + "Evaluator: TestEvaluator", + attributes={ + "span_type": "evaluator", + "evaluator_id": 
"test-eval", + "evaluator_name": "TestEvaluator", + "eval_item_id": f"item-{i}", + }, + ): + pass + + # Should have: 1 EvalSetRun + 2 Evaluation + 2 Evaluator = 5 spans + assert len(recorder.spans) == 5 + + eval_set_run_spans = recorder.get_spans_by_type("eval_set_run") + evaluation_spans = recorder.get_spans_by_type("evaluation") + evaluator_spans = recorder.get_spans_by_type("evaluator") + + assert len(eval_set_run_spans) == 1 + assert len(evaluation_spans) == 2 + assert len(evaluator_spans) == 2 + + def test_span_attributes_are_complete(self): + """Test that all spans have the required attributes.""" + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run", "eval_set_run_id": "run-123"}, + ): + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": "exec-456", + "span_type": "evaluation", + "eval_item_id": "item-789", + "eval_item_name": "My Test", + }, + ): + with recorder.start_as_current_span( + "Evaluator: Accuracy", + attributes={ + "span_type": "evaluator", + "evaluator_id": "acc-1", + "evaluator_name": "Accuracy", + "eval_item_id": "item-789", + }, + ): + pass + + # Verify EvalSetRun span + eval_set_run = recorder.get_spans_by_type("eval_set_run")[0] + assert eval_set_run["attributes"]["eval_set_run_id"] == "run-123" + + # Verify Evaluation span + evaluation = recorder.get_spans_by_type("evaluation")[0] + assert evaluation["attributes"]["execution.id"] == "exec-456" + assert evaluation["attributes"]["eval_item_id"] == "item-789" + assert evaluation["attributes"]["eval_item_name"] == "My Test" + + # Verify Evaluator span + evaluator = recorder.get_spans_by_type("evaluator")[0] + assert evaluator["attributes"]["evaluator_id"] == "acc-1" + assert evaluator["attributes"]["evaluator_name"] == "Accuracy" + assert evaluator["attributes"]["eval_item_id"] == "item-789" + + +class TestSpanNaming: + """Tests for span naming conventions.""" + + def test_eval_set_run_span_name(self): + """Test that EvalSetRun span has correct name.""" + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + pass + + assert recorder.spans[0]["name"] == "Evaluation Set Run" + + def test_evaluation_span_name(self): + """Test that Evaluation span has correct name.""" + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation", + attributes={"span_type": "evaluation"}, + ): + pass + + assert recorder.spans[0]["name"] == "Evaluation" + + def test_evaluator_span_name_format(self): + """Test that Evaluator span name follows the pattern 'Evaluator: {name}'.""" + recorder = SpanRecorder() + evaluator_name = "MyCustomEvaluator" + + with recorder.start_as_current_span( + f"Evaluator: {evaluator_name}", + attributes={ + "span_type": "evaluator", + "evaluator_name": evaluator_name, + }, + ): + pass + + span = recorder.spans[0] + assert span["name"] == "Evaluator: MyCustomEvaluator" + assert span["name"].startswith("Evaluator: ") + + +class TestExecutionIdTracking: + """Tests for execution.id tracking in spans.""" + + def test_each_evaluation_has_unique_execution_id(self): + """Test that each Evaluation span gets a unique execution.id.""" + recorder = SpanRecorder() + execution_ids = [] + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + for i in range(3): + exec_id = str(uuid.uuid4()) + execution_ids.append(exec_id) + with 
recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": exec_id, + "span_type": "evaluation", + "eval_item_id": f"item-{i}", + "eval_item_name": f"Item {i}", + }, + ): + pass + + # Verify all execution IDs are unique + assert len(set(execution_ids)) == 3 + + # Verify each evaluation span has its execution.id + evaluation_spans = recorder.get_spans_by_type("evaluation") + for i, span in enumerate(evaluation_spans): + assert span["attributes"]["execution.id"] == execution_ids[i] + + def test_eval_set_run_does_not_have_execution_id(self): + """Test that EvalSetRun span does NOT have execution.id. + + This is intentional to prevent ID propagation to child spans. + """ + recorder = SpanRecorder() + + with recorder.start_as_current_span( + "Evaluation Set Run", + attributes={"span_type": "eval_set_run"}, + ): + pass + + eval_set_run = recorder.spans[0] + assert "execution.id" not in eval_set_run["attributes"] + + +class TestEvaluatorSpanEvalItemId: + """Tests for eval_item_id in evaluator spans.""" + + def test_evaluator_span_has_eval_item_id(self): + """Test that Evaluator span includes the eval_item_id.""" + recorder = SpanRecorder() + eval_item_id = "item-specific-123" + + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": eval_item_id, + "eval_item_name": "Test", + }, + ): + with recorder.start_as_current_span( + "Evaluator: Test", + attributes={ + "span_type": "evaluator", + "evaluator_id": "test-1", + "evaluator_name": "Test", + "eval_item_id": eval_item_id, + }, + ): + pass + + evaluator_span = recorder.get_spans_by_type("evaluator")[0] + assert evaluator_span["attributes"]["eval_item_id"] == eval_item_id + + def test_evaluator_and_evaluation_share_eval_item_id(self): + """Test that Evaluator and Evaluation spans share the same eval_item_id.""" + recorder = SpanRecorder() + eval_item_id = "shared-item-456" + + with recorder.start_as_current_span( + "Evaluation", + attributes={ + "execution.id": str(uuid.uuid4()), + "span_type": "evaluation", + "eval_item_id": eval_item_id, + "eval_item_name": "Test", + }, + ): + with recorder.start_as_current_span( + "Evaluator: Test", + attributes={ + "span_type": "evaluator", + "evaluator_id": "test-1", + "evaluator_name": "Test", + "eval_item_id": eval_item_id, + }, + ): + pass + + evaluation_span = recorder.get_spans_by_type("evaluation")[0] + evaluator_span = recorder.get_spans_by_type("evaluator")[0] + + assert ( + evaluation_span["attributes"]["eval_item_id"] + == evaluator_span["attributes"]["eval_item_id"] + ) diff --git a/tests/telemetry/__init__.py b/tests/telemetry/__init__.py new file mode 100644 index 000000000..e673b8aab --- /dev/null +++ b/tests/telemetry/__init__.py @@ -0,0 +1 @@ +"""Tests for telemetry tracking functionality.""" diff --git a/tests/telemetry/test_track.py b/tests/telemetry/test_track.py new file mode 100644 index 000000000..aca2afd4e --- /dev/null +++ b/tests/telemetry/test_track.py @@ -0,0 +1,482 @@ +"""Tests for telemetry tracking functionality.""" + +import os +from unittest.mock import MagicMock, patch + +from uipath.telemetry._track import ( + _AppInsightsEventClient, + _parse_connection_string, + _TelemetryClient, + flush_events, + is_telemetry_enabled, + track, + track_event, +) + + +class TestParseConnectionString: + """Test connection string parsing functionality.""" + + def test_parse_valid_connection_string(self): + """Test parsing a valid Application Insights connection string.""" + 
connection_string = ( + "InstrumentationKey=test-key-123;" + "IngestionEndpoint=https://example.com/;" + "LiveEndpoint=https://live.example.com/" + ) + + result = _parse_connection_string(connection_string) + + assert result == "test-key-123" + + def test_parse_connection_string_only_instrumentation_key(self): + """Test parsing connection string with only InstrumentationKey.""" + connection_string = "InstrumentationKey=simple-key" + + result = _parse_connection_string(connection_string) + + assert result == "simple-key" + + def test_parse_connection_string_missing_instrumentation_key(self): + """Test parsing connection string without InstrumentationKey.""" + connection_string = ( + "IngestionEndpoint=https://example.com/;" + "LiveEndpoint=https://live.example.com/" + ) + + result = _parse_connection_string(connection_string) + + assert result is None + + def test_parse_malformed_connection_string(self): + """Test parsing malformed connection string.""" + connection_string = "not-a-valid-connection-string" + + result = _parse_connection_string(connection_string) + + assert result is None + + def test_parse_empty_connection_string(self): + """Test parsing empty connection string.""" + result = _parse_connection_string("") + + assert result is None + + def test_parse_connection_string_with_special_chars_in_value(self): + """Test parsing connection string with special characters in value.""" + connection_string = "InstrumentationKey=key=with=equals;Other=value" + + result = _parse_connection_string(connection_string) + + assert result == "key=with=equals" + + +class TestAppInsightsEventClient: + """Test _AppInsightsEventClient functionality.""" + + def setup_method(self): + """Reset AppInsightsEventClient state before each test.""" + _AppInsightsEventClient._initialized = False + _AppInsightsEventClient._client = None + + def teardown_method(self): + """Clean up after each test.""" + _AppInsightsEventClient._initialized = False + _AppInsightsEventClient._client = None + + def test_initialize_no_connection_string(self): + """Test initialization when no connection string is provided.""" + with patch.dict(os.environ, {}, clear=True): + # Remove APPLICATIONINSIGHTS_CONNECTION_STRING if it exists + os.environ.pop("APPLICATIONINSIGHTS_CONNECTION_STRING", None) + + _AppInsightsEventClient._initialize() + + assert _AppInsightsEventClient._initialized is True + assert _AppInsightsEventClient._client is None + + @patch("uipath.telemetry._track._HAS_APPINSIGHTS", False) + def test_initialize_no_appinsights_package(self): + """Test initialization when applicationinsights package is not available.""" + _AppInsightsEventClient._initialize() + + assert _AppInsightsEventClient._initialized is True + assert _AppInsightsEventClient._client is None + + @patch("uipath.telemetry._track._HAS_APPINSIGHTS", True) + @patch("uipath.telemetry._track.AppInsightsTelemetryClient") + def test_initialize_creates_client(self, mock_client_class): + """Test that initialization creates Application Insights client.""" + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + with patch.dict( + os.environ, + { + "APPLICATIONINSIGHTS_CONNECTION_STRING": ( + "InstrumentationKey=test-key;IngestionEndpoint=https://example.com/" + ) + }, + ): + _AppInsightsEventClient._initialize() + + assert _AppInsightsEventClient._initialized is True + assert _AppInsightsEventClient._client is mock_client + mock_client_class.assert_called_once_with("test-key") + + @patch("uipath.telemetry._track._HAS_APPINSIGHTS", True) + 
@patch("uipath.telemetry._track.AppInsightsTelemetryClient") + def test_initialize_invalid_connection_string(self, mock_client_class): + """Test initialization with invalid connection string.""" + with patch.dict( + os.environ, + {"APPLICATIONINSIGHTS_CONNECTION_STRING": "invalid-connection-string"}, + ): + _AppInsightsEventClient._initialize() + + assert _AppInsightsEventClient._initialized is True + assert _AppInsightsEventClient._client is None + mock_client_class.assert_not_called() + + def test_initialize_only_once(self): + """Test that initialization only happens once.""" + _AppInsightsEventClient._initialized = True + _AppInsightsEventClient._client = "existing_client" + + _AppInsightsEventClient._initialize() + + # Should not change the client since already initialized + assert _AppInsightsEventClient._client == "existing_client" + + @patch("uipath.telemetry._track._HAS_APPINSIGHTS", True) + @patch("uipath.telemetry._track.AppInsightsTelemetryClient") + def test_track_event_calls_client(self, mock_client_class): + """Test that track_event calls the Application Insights client.""" + mock_client = MagicMock() + mock_client_class.return_value = mock_client + _AppInsightsEventClient._initialized = True + _AppInsightsEventClient._client = mock_client + + properties = {"key1": "value1", "key2": 123, "key3": None} + + _AppInsightsEventClient.track_event("test_event", properties) + + mock_client.track_event.assert_called_once_with( + name="test_event", + properties={ + "key1": "value1", + "key2": "123", + }, # None filtered, int converted + measurements={}, + ) + + def test_track_event_no_client(self): + """Test that track_event does nothing when client is not initialized.""" + _AppInsightsEventClient._initialized = True + _AppInsightsEventClient._client = None + + # Should not raise any exception + _AppInsightsEventClient.track_event("test_event", {"key": "value"}) + + def test_track_event_empty_properties(self): + """Test track_event with empty properties.""" + mock_client = MagicMock() + _AppInsightsEventClient._initialized = True + _AppInsightsEventClient._client = mock_client + + _AppInsightsEventClient.track_event("test_event", None) + + mock_client.track_event.assert_called_once_with( + name="test_event", + properties={}, + measurements={}, + ) + + def test_flush_calls_client(self): + """Test that flush calls the client's flush method.""" + mock_client = MagicMock() + _AppInsightsEventClient._client = mock_client + + _AppInsightsEventClient.flush() + + mock_client.flush.assert_called_once() + + def test_flush_no_client(self): + """Test that flush does nothing when client is not available.""" + _AppInsightsEventClient._client = None + + # Should not raise any exception + _AppInsightsEventClient.flush() + + +class TestTelemetryClient: + """Test _TelemetryClient functionality.""" + + def setup_method(self): + """Reset TelemetryClient state before each test.""" + _TelemetryClient._initialized = False + + def teardown_method(self): + """Clean up after each test.""" + _TelemetryClient._initialized = False + + def test_is_enabled_default_true(self): + """Test that telemetry is enabled by default.""" + with patch.dict(os.environ, {}, clear=True): + os.environ.pop("UIPATH_TELEMETRY_ENABLED", None) + + assert _TelemetryClient._is_enabled() is True + + def test_is_enabled_explicit_true(self): + """Test telemetry enabled when explicitly set to true.""" + with patch.dict(os.environ, {"UIPATH_TELEMETRY_ENABLED": "true"}): + assert _TelemetryClient._is_enabled() is True + + def 
test_is_enabled_explicit_false(self): + """Test telemetry disabled when set to false.""" + with patch.dict(os.environ, {"UIPATH_TELEMETRY_ENABLED": "false"}): + assert _TelemetryClient._is_enabled() is False + + def test_is_enabled_case_insensitive(self): + """Test that telemetry enabled check is case insensitive.""" + with patch.dict(os.environ, {"UIPATH_TELEMETRY_ENABLED": "TRUE"}): + assert _TelemetryClient._is_enabled() is True + + with patch.dict(os.environ, {"UIPATH_TELEMETRY_ENABLED": "False"}): + assert _TelemetryClient._is_enabled() is False + + @patch.object(_TelemetryClient, "_is_enabled", return_value=False) + def test_track_event_disabled(self, mock_is_enabled): + """Test that track_event does nothing when telemetry is disabled.""" + with patch.object(_AppInsightsEventClient, "track_event") as mock_track: + _TelemetryClient.track_event("test_event", {"key": "value"}) + + mock_track.assert_not_called() + + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + @patch.object(_AppInsightsEventClient, "track_event") + def test_track_event_enabled(self, mock_track, mock_is_enabled): + """Test that track_event calls AppInsightsEventClient when enabled.""" + properties = {"key": "value"} + + _TelemetryClient.track_event("test_event", properties) + + mock_track.assert_called_once_with("test_event", properties) + + +class TestPublicFunctions: + """Test public telemetry functions.""" + + def setup_method(self): + """Reset state before each test.""" + _TelemetryClient._initialized = False + _AppInsightsEventClient._initialized = False + _AppInsightsEventClient._client = None + + @patch.object(_TelemetryClient, "track_event") + def test_track_event_function(self, mock_track): + """Test the global track_event function.""" + properties = {"key": "value"} + + track_event("test_event", properties) + + mock_track.assert_called_once_with("test_event", properties) + + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_is_telemetry_enabled_true(self, mock_is_enabled): + """Test is_telemetry_enabled returns True when enabled.""" + assert is_telemetry_enabled() is True + + @patch.object(_TelemetryClient, "_is_enabled", return_value=False) + def test_is_telemetry_enabled_false(self, mock_is_enabled): + """Test is_telemetry_enabled returns False when disabled.""" + assert is_telemetry_enabled() is False + + @patch.object(_AppInsightsEventClient, "flush") + def test_flush_events_function(self, mock_flush): + """Test the global flush_events function.""" + flush_events() + + mock_flush.assert_called_once() + + +class TestTrackDecorator: + """Test the @track decorator functionality.""" + + def setup_method(self): + """Reset state before each test.""" + _TelemetryClient._initialized = False + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_with_name(self, mock_is_enabled, mock_track_method): + """Test @track decorator with explicit name.""" + + @track("custom_name") + def my_function(): + return "result" + + result = my_function() + + assert result == "result" + mock_track_method.assert_called_once_with("custom_name", None) + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_without_name(self, mock_is_enabled, mock_track_method): + """Test @track decorator without name uses function name.""" + + @track + def my_function(): + return "result" + + result = my_function() + + 
assert result == "result" + mock_track_method.assert_called_once_with("my_function", None) + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_with_extra(self, mock_is_enabled, mock_track_method): + """Test @track decorator with extra attributes.""" + extra = {"attr1": "value1"} + + @track("event_name", extra=extra) + def my_function(): + return "result" + + result = my_function() + + assert result == "result" + mock_track_method.assert_called_once_with("event_name", extra) + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_when_condition_true( + self, mock_is_enabled, mock_track_method + ): + """Test @track decorator with when condition that returns True.""" + + @track("event_name", when=True) + def my_function(): + return "result" + + result = my_function() + + assert result == "result" + mock_track_method.assert_called_once() + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_when_condition_false( + self, mock_is_enabled, mock_track_method + ): + """Test @track decorator with when condition that returns False.""" + + @track("event_name", when=False) + def my_function(): + return "result" + + result = my_function() + + assert result == "result" + mock_track_method.assert_not_called() + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_when_callable(self, mock_is_enabled, mock_track_method): + """Test @track decorator with callable when condition.""" + + @track("event_name", when=lambda x: x > 5) + def my_function(x): + return x * 2 + + # Should track when x > 5 + result = my_function(10) + assert result == 20 + mock_track_method.assert_called_once() + + mock_track_method.reset_mock() + + # Should not track when x <= 5 + result = my_function(3) + assert result == 6 + mock_track_method.assert_not_called() + + @patch.object(_TelemetryClient, "_is_enabled", return_value=False) + @patch.object(_TelemetryClient, "_initialize") + def test_track_decorator_telemetry_disabled(self, mock_initialize, mock_is_enabled): + """Test @track decorator doesn't initialize when telemetry is disabled. + + The decorator still calls _track_method, but _track_method should + short-circuit and not initialize when telemetry is disabled. + """ + + @track("event_name") + def my_function(): + return "result" + + result = my_function() + + assert result == "result" + # _initialize should not be called when telemetry is disabled + mock_initialize.assert_not_called() + + @patch.object(_TelemetryClient, "_track_method") + @patch.object(_TelemetryClient, "_is_enabled", return_value=True) + def test_track_decorator_preserves_function_metadata( + self, mock_is_enabled, mock_track_method + ): + """Test that @track decorator preserves function metadata.""" + + @track("event_name") + def my_function_with_doc(): + """This is a docstring.""" + return "result" + + assert my_function_with_doc.__name__ == "my_function_with_doc" + assert my_function_with_doc.__doc__ == "This is a docstring." 
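+
+    # Illustrative usage sketch (comments only, not executed): the decorator
+    # accepts an optional event name, optional ``extra`` properties, and an
+    # optional ``when`` condition (a bool or a callable over the call
+    # arguments), as exercised by the tests above. The event name and extra
+    # keys below are hypothetical examples, not values used elsewhere.
+    #
+    #     @track("MyFeature.Invoked", extra={"Channel": "cli"})
+    #     def run_feature():
+    #         ...
+    #
+    #     @track("MyFeature.LargeInput", when=lambda x: x > 5)
+    #     def process(x):
+    #         return x * 2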
+ + +class TestTelemetryExceptionHandling: + """Test that telemetry never breaks the main application.""" + + def setup_method(self): + """Reset state before each test.""" + _AppInsightsEventClient._initialized = False + _AppInsightsEventClient._client = None + + def test_track_event_handles_client_exception(self): + """Test that track_event handles exceptions from the client.""" + mock_client = MagicMock() + mock_client.track_event.side_effect = Exception("Client error") + _AppInsightsEventClient._initialized = True + _AppInsightsEventClient._client = mock_client + + # Should not raise exception + _AppInsightsEventClient.track_event("test_event", {"key": "value"}) + + def test_flush_handles_exception(self): + """Test that flush handles exceptions from the client.""" + mock_client = MagicMock() + mock_client.flush.side_effect = Exception("Flush error") + _AppInsightsEventClient._client = mock_client + + # Should not raise exception + _AppInsightsEventClient.flush() + + @patch("uipath.telemetry._track._HAS_APPINSIGHTS", True) + @patch("uipath.telemetry._track.AppInsightsTelemetryClient") + def test_initialize_handles_exception(self, mock_client_class): + """Test that initialization handles exceptions.""" + mock_client_class.side_effect = Exception("Init error") + + with patch.dict( + os.environ, + {"APPLICATIONINSIGHTS_CONNECTION_STRING": "InstrumentationKey=test-key"}, + ): + # Should not raise exception + _AppInsightsEventClient._initialize() + + assert _AppInsightsEventClient._initialized is True + assert _AppInsightsEventClient._client is None From e136c254732d9e9cbc05f3f26bc25ada026bf9fb Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 16:43:05 -0800 Subject: [PATCH 05/12] fix: linting errors and integration tests --- testcases/eval-spans-testcase/pyproject.toml | 11 + testcases/eval-spans-testcase/run.sh | 18 ++ testcases/eval-spans-testcase/src/assert.py | 252 ++++++++++++++++++ testcases/eval-spans-testcase/uipath.json | 5 + tests/cli/eval/test_eval_runtime_spans.py | 8 +- tests/cli/eval/test_eval_telemetry.py | 13 +- .../cli/eval/test_eval_tracing_integration.py | 8 +- 7 files changed, 303 insertions(+), 12 deletions(-) create mode 100644 testcases/eval-spans-testcase/pyproject.toml create mode 100755 testcases/eval-spans-testcase/run.sh create mode 100644 testcases/eval-spans-testcase/src/assert.py create mode 100644 testcases/eval-spans-testcase/uipath.json diff --git a/testcases/eval-spans-testcase/pyproject.toml b/testcases/eval-spans-testcase/pyproject.toml new file mode 100644 index 000000000..41b4430c7 --- /dev/null +++ b/testcases/eval-spans-testcase/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "eval-spans-testcase" +version = "0.1.0" +description = "E2E test for verifying eval spans (Evaluation Set Run, Evaluation, Evaluator)" +requires-python = ">=3.11" +dependencies = [ + "uipath", +] + +[tool.uv.sources] +uipath = { path = "../../", editable = true } diff --git a/testcases/eval-spans-testcase/run.sh b/testcases/eval-spans-testcase/run.sh new file mode 100755 index 000000000..80be32da9 --- /dev/null +++ b/testcases/eval-spans-testcase/run.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +echo "=== E2E Test: Eval Spans Verification ===" + +echo "Syncing dependencies..." +uv sync + +echo "Authenticating with UiPath..." +uv run uipath auth --client-id="$CLIENT_ID" --client-secret="$CLIENT_SECRET" --base-url="$BASE_URL" + +echo "Running evaluations with trace capture..." 
+# Run eval with trace file to capture spans +uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/default.json \ + --no-report \ + --trace-file __uipath/traces.jsonl + +echo "Test completed successfully!" diff --git a/testcases/eval-spans-testcase/src/assert.py b/testcases/eval-spans-testcase/src/assert.py new file mode 100644 index 000000000..670452fc1 --- /dev/null +++ b/testcases/eval-spans-testcase/src/assert.py @@ -0,0 +1,252 @@ +"""E2E assertions for eval spans testcase. + +This script validates that the new eval spans are created correctly: +1. "Evaluation Set Run" span with span_type: "eval_set_run" +2. "Evaluation" spans with span_type: "evaluation" +3. "Evaluator: {name}" spans with span_type: "evaluator" +""" + +import json +import os +import sys +from typing import Any + + +def load_traces(traces_file: str) -> list[dict[str, Any]]: + """Load traces from a JSONL file.""" + traces = [] + with open(traces_file, "r", encoding="utf-8") as f: + for line in f: + if line.strip(): + traces.append(json.loads(line)) + return traces + + +def get_attributes(span: dict[str, Any]) -> dict[str, Any]: + """Get attributes from a span.""" + return span.get("attributes", {}) + + +def find_spans_by_type( + traces: list[dict[str, Any]], span_type: str +) -> list[dict[str, Any]]: + """Find all spans with the given span_type attribute.""" + return [ + trace for trace in traces if get_attributes(trace).get("span_type") == span_type + ] + + +def find_spans_by_name(traces: list[dict[str, Any]], name: str) -> list[dict[str, Any]]: + """Find all spans with the given name.""" + return [trace for trace in traces if trace.get("name") == name] + + +def find_spans_by_name_prefix( + traces: list[dict[str, Any]], prefix: str +) -> list[dict[str, Any]]: + """Find all spans whose name starts with the given prefix.""" + return [trace for trace in traces if trace.get("name", "").startswith(prefix)] + + +def assert_eval_set_run_span(traces: list[dict[str, Any]]) -> None: + """Assert that the Evaluation Set Run span exists with correct attributes.""" + print("\n--- Checking 'Evaluation Set Run' span ---") + + # Find by span_type + eval_set_run_spans = find_spans_by_type(traces, "eval_set_run") + + assert len(eval_set_run_spans) >= 1, ( + "Expected at least 1 'eval_set_run' span, found 0. 
" + "Spans with span_type attribute: " + f"{[get_attributes(t).get('span_type') for t in traces if get_attributes(t).get('span_type')]}" + ) + + print(f" Found {len(eval_set_run_spans)} eval_set_run span(s)") + + for span in eval_set_run_spans: + name = span.get("name") + attrs = get_attributes(span) + + # Check span name + assert name == "Evaluation Set Run", ( + f"Expected span name 'Evaluation Set Run', got '{name}'" + ) + print(f" Name: {name}") + + # Check span_type attribute + assert attrs.get("span_type") == "eval_set_run", ( + f"Expected span_type 'eval_set_run', got '{attrs.get('span_type')}'" + ) + print(f" span_type: {attrs.get('span_type')}") + + # Check eval_set_run_id is present (may be execution_id fallback) + if "eval_set_run_id" in attrs: + print(f" eval_set_run_id: {attrs.get('eval_set_run_id')}") + + print("Evaluation Set Run span assertion passed") + + +def assert_evaluation_spans(traces: list[dict[str, Any]]) -> None: + """Assert that Evaluation spans exist with correct attributes.""" + print("\n--- Checking 'Evaluation' spans ---") + + # Find by span_type + evaluation_spans = find_spans_by_type(traces, "evaluation") + + assert len(evaluation_spans) >= 1, "Expected at least 1 'evaluation' span, found 0" + + print(f" Found {len(evaluation_spans)} evaluation span(s)") + + for i, span in enumerate(evaluation_spans): + name = span.get("name") + attrs = get_attributes(span) + + print(f"\n Evaluation span {i + 1}:") + + # Check span name + assert name == "Evaluation", f"Expected span name 'Evaluation', got '{name}'" + print(f" Name: {name}") + + # Check span_type attribute + assert attrs.get("span_type") == "evaluation", ( + f"Expected span_type 'evaluation', got '{attrs.get('span_type')}'" + ) + print(f" span_type: {attrs.get('span_type')}") + + # Check required attributes + assert "execution.id" in attrs, ( + "Expected 'execution.id' attribute in Evaluation span" + ) + print(f" execution.id: {attrs.get('execution.id')}") + + assert "eval_item_id" in attrs, ( + "Expected 'eval_item_id' attribute in Evaluation span" + ) + print(f" eval_item_id: {attrs.get('eval_item_id')}") + + assert "eval_item_name" in attrs, ( + "Expected 'eval_item_name' attribute in Evaluation span" + ) + print(f" eval_item_name: {attrs.get('eval_item_name')}") + + print("\nEvaluation spans assertion passed") + + +def assert_evaluator_spans(traces: list[dict[str, Any]]) -> None: + """Assert that Evaluator spans exist with correct attributes.""" + print("\n--- Checking 'Evaluator' spans ---") + + # Find by span_type + evaluator_spans = find_spans_by_type(traces, "evaluator") + + assert len(evaluator_spans) >= 1, "Expected at least 1 'evaluator' span, found 0" + + print(f" Found {len(evaluator_spans)} evaluator span(s)") + + for i, span in enumerate(evaluator_spans): + name = span.get("name") + attrs = get_attributes(span) + + print(f"\n Evaluator span {i + 1}:") + + # Check span name starts with "Evaluator: " + assert name and name.startswith("Evaluator: "), ( + f"Expected span name to start with 'Evaluator: ', got '{name}'" + ) + print(f" Name: {name}") + + # Check span_type attribute + assert attrs.get("span_type") == "evaluator", ( + f"Expected span_type 'evaluator', got '{attrs.get('span_type')}'" + ) + print(f" span_type: {attrs.get('span_type')}") + + # Check required attributes + assert "evaluator_id" in attrs, ( + "Expected 'evaluator_id' attribute in Evaluator span" + ) + print(f" evaluator_id: {attrs.get('evaluator_id')}") + + assert "evaluator_name" in attrs, ( + "Expected 'evaluator_name' 
attribute in Evaluator span" + ) + print(f" evaluator_name: {attrs.get('evaluator_name')}") + + assert "eval_item_id" in attrs, ( + "Expected 'eval_item_id' attribute in Evaluator span" + ) + print(f" eval_item_id: {attrs.get('eval_item_id')}") + + print("\nEvaluator spans assertion passed") + + +def assert_span_hierarchy(traces: list[dict[str, Any]]) -> None: + """Assert the span hierarchy is correct.""" + print("\n--- Checking span hierarchy ---") + + # Build span lookup by span_id + span_by_id: dict[str, dict[str, Any]] = {} + for trace in traces: + context = trace.get("context", {}) + span_id = context.get("span_id") + if span_id: + span_by_id[span_id] = trace + + # Get spans by type + eval_set_run_spans = find_spans_by_type(traces, "eval_set_run") + evaluation_spans = find_spans_by_type(traces, "evaluation") + evaluator_spans = find_spans_by_type(traces, "evaluator") + + # Get eval_set_run span_id + if eval_set_run_spans: + eval_set_run_span_id = eval_set_run_spans[0].get("context", {}).get("span_id") + print(f" EvalSetRun span_id: {eval_set_run_span_id}") + + # Check Evaluation spans are children of EvalSetRun (through parent chain) + # Note: In practice, there may be intermediate spans, so we just verify + # the relationship exists through the trace + print(f" Found {len(evaluation_spans)} Evaluation spans") + print(f" Found {len(evaluator_spans)} Evaluator spans") + + print("\nSpan hierarchy check passed") + + +def main() -> None: + """Main assertion logic.""" + traces_file = "__uipath/traces.jsonl" + + # Check if traces file exists + if not os.path.isfile(traces_file): + print(f"Traces file '{traces_file}' not found") + sys.exit(1) + + print(f"Loading traces from {traces_file}...") + traces = load_traces(traces_file) + print(f"Loaded {len(traces)} trace spans") + + # Print all span names and types for debugging + print("\n--- All spans ---") + for i, trace in enumerate(traces): + name = trace.get("name", "Unknown") + attrs = get_attributes(trace) + span_type = attrs.get("span_type", "N/A") + print(f" {i + 1}. 
{name} (span_type: {span_type})") + + # Run assertions + try: + assert_eval_set_run_span(traces) + assert_evaluation_spans(traces) + assert_evaluator_spans(traces) + assert_span_hierarchy(traces) + + print("\n" + "=" * 60) + print("All eval span assertions passed!") + print("=" * 60) + + except AssertionError as e: + print(f"\nAssertion failed: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/testcases/eval-spans-testcase/uipath.json b/testcases/eval-spans-testcase/uipath.json new file mode 100644 index 000000000..2b8e5b396 --- /dev/null +++ b/testcases/eval-spans-testcase/uipath.json @@ -0,0 +1,5 @@ +{ + "functions": { + "main": "../../samples/calculator/main.py:main" + } +} diff --git a/tests/cli/eval/test_eval_runtime_spans.py b/tests/cli/eval/test_eval_runtime_spans.py index 58e5da5e9..c0a7d74a0 100644 --- a/tests/cli/eval/test_eval_runtime_spans.py +++ b/tests/cli/eval/test_eval_runtime_spans.py @@ -21,7 +21,7 @@ class MockSpanContext: """Mock span context manager for testing span creation.""" - def __init__(self, name: str, attributes: Dict[str, Any]): + def __init__(self, name: str, attributes: dict[str, Any] | None): self.name = name self.attributes = attributes or {} self.span = MagicMock(spec=Span) @@ -40,7 +40,9 @@ class SpanCapturingTracer: def __init__(self): self.created_spans: List[Dict[str, Any]] = [] - def start_as_current_span(self, name: str, attributes: Dict[str, Any] = None): + def start_as_current_span( + self, name: str, attributes: dict[str, Any] | None = None + ): """Capture span creation and return a mock context manager.""" span_info = {"name": name, "attributes": attributes or {}} self.created_spans.append(span_info) @@ -415,7 +417,7 @@ def test_span_type_values_match_expected(self): "Evaluator": "evaluator", } - for span_name, span_type in expected_span_types.items(): + for _, span_type in expected_span_types.items(): assert isinstance(span_type, str) assert span_type.islower() or "_" in span_type diff --git a/tests/cli/eval/test_eval_telemetry.py b/tests/cli/eval/test_eval_telemetry.py index 63f8f913f..06c48b011 100644 --- a/tests/cli/eval/test_eval_telemetry.py +++ b/tests/cli/eval/test_eval_telemetry.py @@ -1,6 +1,7 @@ """Tests for EvalTelemetrySubscriber functionality.""" import os +from typing import Any from unittest.mock import patch import pytest @@ -93,7 +94,7 @@ def _create_eval_set_run_created_event( eval_set_run_id: str | None = "run-456", entrypoint: str = "agent.py", no_of_evals: int = 5, - evaluators: list = None, + evaluators: list[Any] | None = None, ) -> EvalSetRunCreatedEvent: """Helper to create EvalSetRunCreatedEvent.""" return EvalSetRunCreatedEvent( @@ -212,7 +213,7 @@ def _create_eval_run_updated_event( eval_item_name: str = "Test Eval", success: bool = True, agent_execution_time: float = 1.5, - eval_results: list = None, + eval_results: list[Any] | None = None, exception_details: EvalItemExceptionDetails | None = None, ) -> EvalRunUpdatedEvent: """Helper to create EvalRunUpdatedEvent.""" @@ -327,7 +328,7 @@ class TestEvalSetRunUpdated: def _create_eval_set_run_updated_event( self, execution_id: str = "exec-123", - evaluator_scores: dict = None, + evaluator_scores: dict[str, Any] | None = None, success: bool = True, ) -> EvalSetRunUpdatedEvent: """Helper to create EvalSetRunUpdatedEvent.""" @@ -409,7 +410,7 @@ class TestEnrichProperties: def test_enrich_properties_adds_source(self): """Test that source and application name are always added.""" subscriber = EvalTelemetrySubscriber() - properties = {} + properties: 
dict[str, Any] = {} subscriber._enrich_properties(properties) @@ -419,7 +420,7 @@ def test_enrich_properties_adds_source(self): def test_enrich_properties_adds_env_vars(self): """Test that environment variables are added when present.""" subscriber = EvalTelemetrySubscriber() - properties = {} + properties: dict[str, Any] = {} with patch.dict( os.environ, @@ -440,7 +441,7 @@ def test_enrich_properties_adds_env_vars(self): def test_enrich_properties_skips_missing_env_vars(self): """Test that missing environment variables are not added.""" subscriber = EvalTelemetrySubscriber() - properties = {} + properties: dict[str, Any] = {} with patch.dict(os.environ, {}, clear=True): # Remove env vars if they exist diff --git a/tests/cli/eval/test_eval_tracing_integration.py b/tests/cli/eval/test_eval_tracing_integration.py index 4d4556771..644551c5b 100644 --- a/tests/cli/eval/test_eval_tracing_integration.py +++ b/tests/cli/eval/test_eval_tracing_integration.py @@ -4,13 +4,13 @@ """ import uuid -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional class MockSpan: """Mock span that captures attributes for testing.""" - def __init__(self, name: str, attributes: Dict[str, Any] = None): + def __init__(self, name: str, attributes: Optional[Dict[str, Any]] = None): self.name = name self.attributes = attributes or {} self._status = None @@ -26,7 +26,9 @@ def __init__(self): self.spans: List[Dict[str, Any]] = [] self._span_stack: List[MockSpan] = [] - def start_as_current_span(self, name: str, attributes: Dict[str, Any] = None): + def start_as_current_span( + self, name: str, attributes: Optional[Dict[str, Any]] = None + ): """Mock tracer method that records span creation.""" span_info = { "name": name, From f26b32e71237ef94592e8c64f6fb8e683f0870ad Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 16:52:21 -0800 Subject: [PATCH 06/12] feat: add trace-file option --- src/uipath/_cli/cli_eval.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index 8c9f9870e..570832b47 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -21,7 +21,7 @@ from uipath.eval._helpers import auto_discover_entrypoint from uipath.platform.common import UiPathConfig from uipath.telemetry._track import flush_events -from uipath.tracing import LlmOpsHttpExporter +from uipath.tracing import JsonLinesFileExporter, LlmOpsHttpExporter from ._utils._console import ConsoleLogger from ._utils._eval_set import EvalHelpers @@ -100,6 +100,12 @@ def setup_reporting_prereq(no_report: bool) -> bool: default="default", help="Model settings ID from evaluation set to override agent settings (default: 'default')", ) +@click.option( + "--trace-file", + required=False, + type=click.Path(exists=False), + help="File path where traces will be written in JSONL format", +) def eval( entrypoint: str | None, eval_set: str | None, @@ -111,6 +117,7 @@ def eval( enable_mocker_cache: bool, report_coverage: bool, model_settings_id: str, + trace_file: str | None, ) -> None: """Run an evaluation set against the agent. 
@@ -185,6 +192,11 @@ async def execute_eval(): if ctx.job_id: trace_manager.add_span_exporter(LlmOpsHttpExporter()) + if trace_file: + trace_manager.add_span_exporter( + JsonLinesFileExporter(trace_file) + ) + project_id = UiPathConfig.project_id runtime_factory = UiPathRuntimeFactoryRegistry.get(context=ctx) From 348a1285b0a881241c6f01a6c4a7f51c95f2a61d Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 17:06:06 -0800 Subject: [PATCH 07/12] feat: add integration tests for telemetry --- .github/workflows/integration_tests.yml | 5 + .../eval-telemetry-testcase/pyproject.toml | 12 ++ testcases/eval-telemetry-testcase/run.sh | 40 ++++ .../eval-telemetry-testcase/src/assert.py | 194 ++++++++++++++++++ testcases/eval-telemetry-testcase/uipath.json | 5 + 5 files changed, 256 insertions(+) create mode 100644 testcases/eval-telemetry-testcase/pyproject.toml create mode 100755 testcases/eval-telemetry-testcase/run.sh create mode 100644 testcases/eval-telemetry-testcase/src/assert.py create mode 100644 testcases/eval-telemetry-testcase/uipath.json diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 5ef766f90..448435f91 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -62,6 +62,11 @@ jobs: BASE_URL: ${{ matrix.environment == 'alpha' && secrets.ALPHA_BASE_URL || matrix.environment == 'staging' && secrets.STAGING_BASE_URL || matrix.environment == 'cloud' && secrets.CLOUD_BASE_URL }} USE_AZURE_CHAT: ${{ matrix.use_azure_chat }} + + # App Insights for telemetry testing + APPLICATIONINSIGHTS_CONNECTION_STRING: ${{ secrets.APPLICATIONINSIGHTS_CONNECTION_STRING }} + APP_INSIGHTS_APP_ID: ${{ secrets.APP_INSIGHTS_APP_ID }} + APP_INSIGHTS_API_KEY: ${{ secrets.APP_INSIGHTS_API_KEY }} working-directory: testcases/${{ matrix.testcase }} run: | # If any errors occur execution will stop with exit code diff --git a/testcases/eval-telemetry-testcase/pyproject.toml b/testcases/eval-telemetry-testcase/pyproject.toml new file mode 100644 index 000000000..e9c2e52d9 --- /dev/null +++ b/testcases/eval-telemetry-testcase/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "eval-telemetry-testcase" +version = "0.1.0" +description = "E2E test for verifying eval telemetry events in Application Insights" +requires-python = ">=3.11" +dependencies = [ + "uipath", + "httpx", +] + +[tool.uv.sources] +uipath = { path = "../../", editable = true } diff --git a/testcases/eval-telemetry-testcase/run.sh b/testcases/eval-telemetry-testcase/run.sh new file mode 100755 index 000000000..e28fb04f5 --- /dev/null +++ b/testcases/eval-telemetry-testcase/run.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -e + +echo "=== E2E Test: Eval Telemetry Integration ===" + +# Validate required environment variables +if [ -z "$APPLICATIONINSIGHTS_CONNECTION_STRING" ]; then + echo "Warning: APPLICATIONINSIGHTS_CONNECTION_STRING not set, telemetry won't be sent" +fi +if [ -z "$APP_INSIGHTS_APP_ID" ]; then + echo "Warning: APP_INSIGHTS_APP_ID not set, skipping telemetry verification" +fi +if [ -z "$APP_INSIGHTS_API_KEY" ]; then + echo "Warning: APP_INSIGHTS_API_KEY not set, skipping telemetry verification" +fi + +echo "Syncing dependencies..." +uv sync + +echo "Authenticating with UiPath..." 
+uv run uipath auth --client-id="$CLIENT_ID" --client-secret="$CLIENT_SECRET" --base-url="$BASE_URL" + +# Generate a unique run ID to identify this test run's telemetry events +export EVAL_TEST_RUN_ID="e2e-test-$(date +%s)-$$" +echo "Test Run ID: $EVAL_TEST_RUN_ID" + +echo "Running evaluations with telemetry enabled..." +# Run eval with telemetry explicitly enabled and App Insights connection string +UIPATH_TELEMETRY_ENABLED=true uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/default.json \ + --no-report \ + --output-file __uipath/output.json \ + --eval-set-run-id "$EVAL_TEST_RUN_ID" + +# Wait for telemetry to be ingested into App Insights +if [ -n "$APP_INSIGHTS_APP_ID" ] && [ -n "$APP_INSIGHTS_API_KEY" ]; then + echo "Waiting for telemetry to be ingested (30 seconds)..." + sleep 30 +fi + +echo "Test completed successfully!" diff --git a/testcases/eval-telemetry-testcase/src/assert.py b/testcases/eval-telemetry-testcase/src/assert.py new file mode 100644 index 000000000..bdd512eb1 --- /dev/null +++ b/testcases/eval-telemetry-testcase/src/assert.py @@ -0,0 +1,194 @@ +"""E2E assertions for eval telemetry testcase. + +This script validates that telemetry events are sent to Application Insights by: +1. Verifying eval completed successfully +2. Querying App Insights API to check for expected telemetry events +3. Validating event properties match expected values +""" + +import json +import os +import sys +import time +from typing import Any + +import httpx + +# Expected telemetry event names +EXPECTED_EVENTS = [ + "EvalSetRun.Start.URT", + "EvalSetRun.End.URT", + "EvalRun.Start.URT", + "EvalRun.End.URT", +] + + +def load_output(output_file: str) -> dict[str, Any]: + """Load output from a JSON file.""" + with open(output_file, "r", encoding="utf-8") as f: + return json.load(f) + + +def query_app_insights( + app_id: str, api_key: str, query: str, max_retries: int = 3 +) -> dict[str, Any]: + """Query Application Insights using the REST API. + + Args: + app_id: Application Insights App ID + api_key: Application Insights API Key + query: Kusto query to execute + max_retries: Number of retries on failure + + Returns: + Query results as dictionary + """ + url = f"https://api.applicationinsights.io/v1/apps/{app_id}/query" + headers = {"x-api-key": api_key, "Content-Type": "application/json"} + payload = {"query": query} + + for attempt in range(max_retries): + try: + response = httpx.post(url, headers=headers, json=payload, timeout=30) + response.raise_for_status() + return response.json() + except Exception as e: + if attempt < max_retries - 1: + print(f" Retry {attempt + 1}/{max_retries} after error: {e}") + time.sleep(5) + else: + raise + + +def verify_telemetry_events(app_id: str, api_key: str, eval_set_run_id: str) -> bool: + """Verify telemetry events were sent to Application Insights. 
+ + Args: + app_id: Application Insights App ID + api_key: Application Insights API Key + eval_set_run_id: The eval set run ID to search for + + Returns: + True if all expected events were found + """ + print("\n--- Querying App Insights for events ---") + print(f" EvalSetRunId: {eval_set_run_id}") + + # Query for events with the specific EvalSetRunId + query = f""" + customEvents + | where timestamp > ago(10m) + | where customDimensions.EvalSetRunId == "{eval_set_run_id}" + or customDimensions["EvalSetRunId"] == "{eval_set_run_id}" + | project name, timestamp, customDimensions + | order by timestamp asc + """ + + try: + result = query_app_insights(app_id, api_key, query) + except Exception as e: + print(f" Error querying App Insights: {e}") + return False + + # Parse results + tables = result.get("tables", []) + if not tables: + print(" No tables returned from query") + return False + + rows = tables[0].get("rows", []) + columns = [col["name"] for col in tables[0].get("columns", [])] + + print(f" Found {len(rows)} events") + + # Extract event names + found_events: list[str] = [] + name_idx = columns.index("name") if "name" in columns else 0 + + for row in rows: + event_name = row[name_idx] + found_events.append(event_name) + print(f" - {event_name}") + + # Check for expected events + print("\n--- Verifying expected events ---") + all_found = True + for expected in EXPECTED_EVENTS: + if expected in found_events: + print(f" [OK] {expected}") + else: + print(f" [MISSING] {expected}") + all_found = False + + return all_found + + +def verify_output(output_file: str) -> bool: + """Verify the eval output file.""" + print("\n--- Verifying eval output ---") + + if not os.path.isfile(output_file): + print(f" Output file '{output_file}' not found") + return False + + output_data = load_output(output_file) + status = output_data.get("status") + + if status != "successful": + print(f" Eval failed with status: {status}") + return False + + print(f" Status: {status}") + + output = output_data.get("output", {}) + evaluation_results = output.get("evaluationSetResults", []) + print(f" Evaluation results: {len(evaluation_results)}") + + return True + + +def main() -> None: + """Main assertion logic.""" + output_file = "__uipath/output.json" + + # Get environment variables + app_id = os.environ.get("APP_INSIGHTS_APP_ID") + api_key = os.environ.get("APP_INSIGHTS_API_KEY") + eval_set_run_id = os.environ.get("EVAL_TEST_RUN_ID") + + # Verify eval output first + if not verify_output(output_file): + print("\nEval output verification failed") + sys.exit(1) + + # Check if App Insights verification is possible + if not app_id or not api_key: + print("\n--- Skipping App Insights verification ---") + print(" APP_INSIGHTS_APP_ID or APP_INSIGHTS_API_KEY not set") + print(" Telemetry verification skipped (eval completed successfully)") + print("\nAll assertions passed! (telemetry verification skipped)") + return + + if not eval_set_run_id: + print("\n--- Skipping App Insights verification ---") + print(" EVAL_TEST_RUN_ID not set") + print("\nAll assertions passed! 
(telemetry verification skipped)") + return + + # Verify telemetry events in App Insights + if not verify_telemetry_events(app_id, api_key, eval_set_run_id): + print("\n" + "=" * 60) + print("Telemetry verification FAILED") + print("Expected events not found in App Insights") + print("=" * 60) + sys.exit(1) + + print("\n" + "=" * 60) + print("All assertions passed!") + print(" - Eval completed successfully") + print(" - Telemetry events verified in App Insights") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/testcases/eval-telemetry-testcase/uipath.json b/testcases/eval-telemetry-testcase/uipath.json new file mode 100644 index 000000000..2b8e5b396 --- /dev/null +++ b/testcases/eval-telemetry-testcase/uipath.json @@ -0,0 +1,5 @@ +{ + "functions": { + "main": "../../samples/calculator/main.py:main" + } +} From fdcaed4762cb596d6393a0ea21cc9728e32b49e3 Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 17:09:47 -0800 Subject: [PATCH 08/12] fix: failing telemetry integration test --- testcases/common/validate_output.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testcases/common/validate_output.sh b/testcases/common/validate_output.sh index 83f587f99..1e4cd1a59 100644 --- a/testcases/common/validate_output.sh +++ b/testcases/common/validate_output.sh @@ -26,9 +26,9 @@ debug_print_uipath_output() { run_assertions() { echo "Running assertions..." if [ -f "src/assert.py" ]; then - # Use the Python from the virtual environment + # Use uv run to ensure testcase dependencies are available # Prepend the common directory to the python path so it can be resolved - PYTHONPATH="../common:$PYTHONPATH" python src/assert.py + PYTHONPATH="../common:$PYTHONPATH" uv run python src/assert.py else echo "assert.py not found in src directory!" exit 1 From 8a54b506bcf899e4797a0263a65c710a408abcfd Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 7 Jan 2026 17:14:13 -0800 Subject: [PATCH 09/12] fix: failing telemetry integration test --- .../eval-telemetry-testcase/src/assert.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/testcases/eval-telemetry-testcase/src/assert.py b/testcases/eval-telemetry-testcase/src/assert.py index bdd512eb1..b9077ba8e 100644 --- a/testcases/eval-telemetry-testcase/src/assert.py +++ b/testcases/eval-telemetry-testcase/src/assert.py @@ -132,18 +132,33 @@ def verify_output(output_file: str) -> bool: return False output_data = load_output(output_file) - status = output_data.get("status") - if status != "successful": - print(f" Eval failed with status: {status}") - return False - - print(f" Status: {status}") + # The eval output can have two formats: + # 1. Direct results: {"evaluationSetName": "...", "evaluationSetResults": [...]} + # 2. 
Wrapped results: {"status": "successful", "output": {...}} + if "status" in output_data: + status = output_data.get("status") + if status != "successful": + print(f" Eval failed with status: {status}") + return False + print(f" Status: {status}") + output = output_data.get("output", {}) + evaluation_results = output.get("evaluationSetResults", []) + else: + # Direct format - check for evaluationSetResults + evaluation_results = output_data.get("evaluationSetResults", []) + if not evaluation_results: + print(" No evaluationSetResults found in output") + return False + print(" Status: completed (direct output format)") - output = output_data.get("output", {}) - evaluation_results = output.get("evaluationSetResults", []) print(f" Evaluation results: {len(evaluation_results)}") + # Verify we have results with scores + if len(evaluation_results) == 0: + print(" No evaluation results found") + return False + return True From 36cef7d21aeab243d4d2a0f923a8e3a9175ea276 Mon Sep 17 00:00:00 2001 From: Anipik Date: Thu, 8 Jan 2026 09:36:38 -0800 Subject: [PATCH 10/12] fix: update the version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 85f0f2f66..4b77acf3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.4.4" +version = "2.4.5" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" From 59289366ca51f81622b80a739ac175a2d7d0e7da Mon Sep 17 00:00:00 2001 From: Anipik Date: Thu, 8 Jan 2026 09:41:10 -0800 Subject: [PATCH 11/12] fix: linting package mismatch error --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index 8c90eda7b..9a7a728ce 100644 --- a/uv.lock +++ b/uv.lock @@ -2486,7 +2486,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.4.4" +version = "2.4.5" source = { editable = "." } dependencies = [ { name = "applicationinsights" }, From 9520ec5c4c372a561bd33054559d7f523edf4737 Mon Sep 17 00:00:00 2001 From: Anipik Date: Thu, 8 Jan 2026 10:19:00 -0800 Subject: [PATCH 12/12] fix: tracing integration tests --- .../cli/eval/test_eval_tracing_integration.py | 902 ++++++++++-------- 1 file changed, 478 insertions(+), 424 deletions(-) diff --git a/tests/cli/eval/test_eval_tracing_integration.py b/tests/cli/eval/test_eval_tracing_integration.py index 644551c5b..1bb5a6ddf 100644 --- a/tests/cli/eval/test_eval_tracing_integration.py +++ b/tests/cli/eval/test_eval_tracing_integration.py @@ -1,487 +1,541 @@ """Integration tests for eval tracing flow. -These tests verify the end-to-end span creation and hierarchy in the eval runtime. +These tests verify that the eval runtime code correctly creates spans +with the expected attributes by mocking the tracer. 
""" -import uuid -from typing import Any, Dict, List, Optional +from contextlib import contextmanager +from typing import Any +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from uipath._cli._evals._runtime import UiPathEvalContext, UiPathEvalRuntime +from uipath.eval.evaluators import BaseEvaluator +from uipath.eval.models import NumericEvaluationResult class MockSpan: - """Mock span that captures attributes for testing.""" + """Mock span that captures attributes.""" - def __init__(self, name: str, attributes: Optional[Dict[str, Any]] = None): + def __init__(self, name: str, attributes: dict[str, Any] | None = None): self.name = name self.attributes = attributes or {} self._status = None - def set_status(self, status): + def set_status(self, status: Any) -> None: self._status = status + def __enter__(self) -> "MockSpan": + return self + + def __exit__(self, *args: Any) -> None: + pass -class SpanRecorder: - """Records all spans created during test execution.""" - def __init__(self): - self.spans: List[Dict[str, Any]] = [] - self._span_stack: List[MockSpan] = [] +class SpanCapturingTracer: + """A tracer that captures all created spans for verification.""" + def __init__(self) -> None: + self.captured_spans: list[dict[str, Any]] = [] + + @contextmanager def start_as_current_span( - self, name: str, attributes: Optional[Dict[str, Any]] = None + self, name: str, attributes: dict[str, Any] | None = None ): - """Mock tracer method that records span creation.""" - span_info = { - "name": name, - "attributes": dict(attributes) if attributes else {}, - "parent": self._span_stack[-1].name if self._span_stack else None, - } - self.spans.append(span_info) - - mock_span = MockSpan(name, attributes) - return _SpanContextManager(mock_span, self._span_stack) - - def get_spans_by_type(self, span_type: str) -> List[Dict[str, Any]]: - """Get all spans with the given span_type attribute.""" - return [s for s in self.spans if s["attributes"].get("span_type") == span_type] - - def get_span_by_name(self, name: str) -> Dict[str, Any] | None: + """Capture span creation and yield a mock span.""" + span_info = {"name": name, "attributes": dict(attributes) if attributes else {}} + self.captured_spans.append(span_info) + yield MockSpan(name, attributes) + + def get_spans_by_type(self, span_type: str) -> list[dict[str, Any]]: + """Get all captured spans with the given span_type.""" + return [ + s + for s in self.captured_spans + if s["attributes"].get("span_type") == span_type + ] + + def get_span_by_name(self, name: str) -> dict[str, Any] | None: """Get the first span with the given name.""" - for span in self.spans: + for span in self.captured_spans: if span["name"] == name: return span return None -class _SpanContextManager: - """Context manager for mock spans.""" - - def __init__(self, span: MockSpan, stack: List[MockSpan]): - self.span = span - self.stack = stack - - def __enter__(self): - self.stack.append(self.span) - return self.span - - def __exit__(self, *args): - self.stack.pop() - +def create_eval_context(**kwargs: Any) -> UiPathEvalContext: + """Helper to create UiPathEvalContext with specific attribute values.""" + context = UiPathEvalContext() + for key, value in kwargs.items(): + setattr(context, key, value) + return context + + +class TestEvalSetRunSpanCreation: + """Tests that verify EvalSetRun span is created correctly by the runtime.""" + + @pytest.fixture + def mock_trace_manager(self) -> MagicMock: + """Create a mock trace manager with a capturing tracer.""" + 
trace_manager = MagicMock() + self.capturing_tracer = SpanCapturingTracer() + trace_manager.tracer_provider.get_tracer.return_value = self.capturing_tracer + trace_manager.tracer_span_processors = [] + return trace_manager + + @pytest.fixture + def mock_factory(self) -> MagicMock: + """Create a mock runtime factory.""" + factory = MagicMock() + mock_runtime = AsyncMock() + mock_runtime.get_schema = AsyncMock(return_value=MagicMock()) + factory.new_runtime = AsyncMock(return_value=mock_runtime) + return factory + + @pytest.fixture + def mock_event_bus(self) -> MagicMock: + """Create a mock event bus.""" + event_bus = MagicMock() + event_bus.publish = AsyncMock() + return event_bus + + @pytest.mark.asyncio + async def test_execute_creates_eval_set_run_span( + self, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + ) -> None: + """Test that execute() creates the Evaluation Set Run span.""" + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + ) -class TestEvalSetRunSpanIntegration: - """Integration tests for Evaluation Set Run span.""" + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) - def test_eval_set_run_span_created_first(self): - """Test that Evaluation Set Run span is created as the root span.""" - recorder = SpanRecorder() + # Mock initiate_evaluation to return empty results + mock_eval_set = MagicMock() + mock_eval_set.name = "Test Eval Set" + mock_eval_set.evaluations = [] - # Simulate the span creation from _runtime.py:315-317 - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, + with patch.object( + runtime, + "initiate_evaluation", + new=AsyncMock(return_value=(mock_eval_set, [], iter([]))), ): - pass + try: + await runtime.execute() + except Exception: + pass # We just want to verify span creation - assert len(recorder.spans) == 1 - span = recorder.spans[0] + # Verify the span was created + eval_set_run_spans = self.capturing_tracer.get_spans_by_type("eval_set_run") + assert len(eval_set_run_spans) >= 1 + + span = eval_set_run_spans[0] assert span["name"] == "Evaluation Set Run" assert span["attributes"]["span_type"] == "eval_set_run" - assert span["parent"] is None - - def test_eval_set_run_span_with_run_id(self): - """Test that eval_set_run_id is included when provided.""" - recorder = SpanRecorder() - eval_set_run_id = "custom-run-123" - - span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} - span_attributes["eval_set_run_id"] = eval_set_run_id - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes=span_attributes, - ): - pass - - span = recorder.spans[0] - assert span["attributes"]["eval_set_run_id"] == "custom-run-123" + @pytest.mark.asyncio + async def test_execute_includes_eval_set_run_id_when_provided( + self, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + ) -> None: + """Test that eval_set_run_id is included in span when provided.""" + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + eval_set_run_id="custom-run-123", + ) -class TestEvaluationSpanIntegration: - """Integration tests for Evaluation span.""" + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) - def test_evaluation_span_is_child_of_eval_set_run(self): - """Test that Evaluation span is a 
child of Evaluation Set Run.""" - recorder = SpanRecorder() - execution_id = str(uuid.uuid4()) + mock_eval_set = MagicMock() + mock_eval_set.name = "Test Eval Set" + mock_eval_set.evaluations = [] - # Simulate the nested span creation - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, + with patch.object( + runtime, + "initiate_evaluation", + new=AsyncMock(return_value=(mock_eval_set, [], iter([]))), ): - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": execution_id, - "span_type": "evaluation", - "eval_item_id": "item-1", - "eval_item_name": "Test Item", - }, - ): + try: + await runtime.execute() + except Exception: pass - assert len(recorder.spans) == 2 - - eval_set_run_span = recorder.get_span_by_name("Evaluation Set Run") - evaluation_span = recorder.get_span_by_name("Evaluation") - - assert eval_set_run_span is not None - assert evaluation_span is not None - assert evaluation_span["parent"] == "Evaluation Set Run" - - def test_multiple_evaluation_spans_share_parent(self): - """Test that multiple Evaluation spans share the same parent.""" - recorder = SpanRecorder() - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, - ): - for i in range(3): - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": f"item-{i}", - "eval_item_name": f"Test Item {i}", - }, - ): - pass - - evaluation_spans = recorder.get_spans_by_type("evaluation") - assert len(evaluation_spans) == 3 - - for span in evaluation_spans: - assert span["parent"] == "Evaluation Set Run" - - -class TestEvaluatorSpanIntegration: - """Integration tests for Evaluator span.""" - - def test_evaluator_span_is_child_of_evaluation(self): - """Test that Evaluator span is a child of Evaluation.""" - recorder = SpanRecorder() - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, - ): - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": "item-1", - "eval_item_name": "Test Item", - }, - ): - with recorder.start_as_current_span( - "Evaluator: AccuracyEvaluator", - attributes={ - "span_type": "evaluator", - "evaluator_id": "accuracy-1", - "evaluator_name": "AccuracyEvaluator", - "eval_item_id": "item-1", - }, - ): - pass - - evaluator_span = recorder.spans[-1] - assert evaluator_span["name"] == "Evaluator: AccuracyEvaluator" - assert evaluator_span["parent"] == "Evaluation" - - def test_multiple_evaluator_spans_per_evaluation(self): - """Test that multiple Evaluator spans can be children of one Evaluation.""" - recorder = SpanRecorder() - evaluator_names = ["Accuracy", "Relevance", "Fluency"] - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, - ): - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": "item-1", - "eval_item_name": "Test Item", - }, - ): - for name in evaluator_names: - with recorder.start_as_current_span( - f"Evaluator: {name}", - attributes={ - "span_type": "evaluator", - "evaluator_id": f"{name.lower()}-1", - "evaluator_name": name, - "eval_item_id": "item-1", - }, - ): - pass - - evaluator_spans = recorder.get_spans_by_type("evaluator") - assert len(evaluator_spans) == 3 - - 
for span in evaluator_spans: - assert span["parent"] == "Evaluation" + span = self.capturing_tracer.get_spans_by_type("eval_set_run")[0] + assert span["attributes"]["eval_set_run_id"] == "custom-run-123" -class TestFullSpanHierarchy: - """Integration tests for the complete span hierarchy.""" +class TestEvaluationSpanCreation: + """Tests that verify Evaluation span is created correctly.""" + + @pytest.fixture + def capturing_tracer(self) -> SpanCapturingTracer: + return SpanCapturingTracer() + + @pytest.fixture + def mock_trace_manager(self, capturing_tracer: SpanCapturingTracer) -> MagicMock: + trace_manager = MagicMock() + trace_manager.tracer_provider.get_tracer.return_value = capturing_tracer + trace_manager.tracer_span_processors = [] + return trace_manager + + @pytest.fixture + def mock_factory(self) -> MagicMock: + factory = MagicMock() + mock_runtime = AsyncMock() + mock_runtime.get_schema = AsyncMock(return_value=MagicMock()) + factory.new_runtime = AsyncMock(return_value=mock_runtime) + return factory + + @pytest.fixture + def mock_event_bus(self) -> MagicMock: + event_bus = MagicMock() + event_bus.publish = AsyncMock() + return event_bus + + @pytest.fixture + def mock_eval_item(self) -> Any: + """Create a real EvaluationItem instance for testing.""" + from uipath._cli._evals._models._evaluation_set import EvaluationItem + + return EvaluationItem( + id="item-123", + name="Test Evaluation", + inputs={}, + evaluation_criterias={}, + ) - def test_complete_hierarchy_structure(self): - """Test the complete span hierarchy: EvalSetRun > Evaluation > Evaluator.""" - recorder = SpanRecorder() + @pytest.mark.asyncio + async def test_execute_eval_creates_evaluation_span( + self, + capturing_tracer: SpanCapturingTracer, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + mock_eval_item: Any, + ) -> None: + """Test that _execute_eval creates an Evaluation span with correct attributes.""" + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + ) - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run", "eval_set_run_id": "run-1"}, - ): - for i in range(2): - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": f"item-{i}", - "eval_item_name": f"Test Item {i}", - }, - ): - with recorder.start_as_current_span( - "Evaluator: TestEvaluator", - attributes={ - "span_type": "evaluator", - "evaluator_id": "test-eval", - "evaluator_name": "TestEvaluator", - "eval_item_id": f"item-{i}", - }, - ): - pass - - # Should have: 1 EvalSetRun + 2 Evaluation + 2 Evaluator = 5 spans - assert len(recorder.spans) == 5 - - eval_set_run_spans = recorder.get_spans_by_type("eval_set_run") - evaluation_spans = recorder.get_spans_by_type("evaluation") - evaluator_spans = recorder.get_spans_by_type("evaluator") - - assert len(eval_set_run_spans) == 1 - assert len(evaluation_spans) == 2 - assert len(evaluator_spans) == 2 - - def test_span_attributes_are_complete(self): - """Test that all spans have the required attributes.""" - recorder = SpanRecorder() - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run", "eval_set_run_id": "run-123"}, - ): - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": "exec-456", - "span_type": "evaluation", - "eval_item_id": "item-789", - "eval_item_name": "My Test", - }, - ): - with 
recorder.start_as_current_span( - "Evaluator: Accuracy", - attributes={ - "span_type": "evaluator", - "evaluator_id": "acc-1", - "evaluator_name": "Accuracy", - "eval_item_id": "item-789", - }, - ): - pass - - # Verify EvalSetRun span - eval_set_run = recorder.get_spans_by_type("eval_set_run")[0] - assert eval_set_run["attributes"]["eval_set_run_id"] == "run-123" - - # Verify Evaluation span - evaluation = recorder.get_spans_by_type("evaluation")[0] - assert evaluation["attributes"]["execution.id"] == "exec-456" - assert evaluation["attributes"]["eval_item_id"] == "item-789" - assert evaluation["attributes"]["eval_item_name"] == "My Test" - - # Verify Evaluator span - evaluator = recorder.get_spans_by_type("evaluator")[0] - assert evaluator["attributes"]["evaluator_id"] == "acc-1" - assert evaluator["attributes"]["evaluator_name"] == "Accuracy" - assert evaluator["attributes"]["eval_item_id"] == "item-789" - - -class TestSpanNaming: - """Tests for span naming conventions.""" - - def test_eval_set_run_span_name(self): - """Test that EvalSetRun span has correct name.""" - recorder = SpanRecorder() - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, - ): - pass + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) - assert recorder.spans[0]["name"] == "Evaluation Set Run" + # Mock execute_runtime to return a successful result + mock_execution_output = MagicMock() + mock_execution_output.result.output = {"result": 42} + mock_execution_output.result.status = "successful" + mock_execution_output.result.error = None + mock_execution_output.spans = [] + mock_execution_output.logs = [] - def test_evaluation_span_name(self): - """Test that Evaluation span has correct name.""" - recorder = SpanRecorder() + mock_runtime = AsyncMock() - with recorder.start_as_current_span( - "Evaluation", - attributes={"span_type": "evaluation"}, + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=mock_execution_output), ): - pass - - assert recorder.spans[0]["name"] == "Evaluation" + await runtime._execute_eval(mock_eval_item, [], mock_runtime) + + # Verify Evaluation span was created + evaluation_spans = capturing_tracer.get_spans_by_type("evaluation") + assert len(evaluation_spans) == 1 + + span = evaluation_spans[0] + assert span["name"] == "Evaluation" + assert span["attributes"]["span_type"] == "evaluation" + assert span["attributes"]["eval_item_id"] == "item-123" + assert span["attributes"]["eval_item_name"] == "Test Evaluation" + assert "execution.id" in span["attributes"] + + +class TestEvaluatorSpanCreation: + """Tests that verify Evaluator span is created correctly.""" + + @pytest.fixture + def capturing_tracer(self) -> SpanCapturingTracer: + return SpanCapturingTracer() + + @pytest.fixture + def mock_trace_manager(self, capturing_tracer: SpanCapturingTracer) -> MagicMock: + trace_manager = MagicMock() + trace_manager.tracer_provider.get_tracer.return_value = capturing_tracer + trace_manager.tracer_span_processors = [] + return trace_manager + + @pytest.fixture + def mock_factory(self) -> MagicMock: + factory = MagicMock() + return factory + + @pytest.fixture + def mock_event_bus(self) -> MagicMock: + event_bus = MagicMock() + event_bus.publish = AsyncMock() + return event_bus + + @pytest.fixture + def mock_evaluator(self) -> MagicMock: + evaluator = MagicMock(spec=BaseEvaluator) + evaluator.id = "accuracy-evaluator" + evaluator.name = 
"AccuracyEvaluator" + evaluator.validate_and_evaluate_criteria = AsyncMock( + return_value=NumericEvaluationResult(score=0.95, details="Good accuracy") + ) + return evaluator + + @pytest.fixture + def mock_eval_item(self) -> MagicMock: + eval_item = MagicMock() + eval_item.id = "eval-item-456" + eval_item.name = "Test Item" + eval_item.inputs = {"input": "test"} + eval_item.expected_agent_behavior = None + return eval_item + + @pytest.fixture + def mock_execution_output(self) -> MagicMock: + output = MagicMock() + output.result.output = {"result": 42} + output.spans = [] + return output + + @pytest.mark.asyncio + async def test_run_evaluator_creates_evaluator_span( + self, + capturing_tracer: SpanCapturingTracer, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + mock_evaluator: MagicMock, + mock_eval_item: MagicMock, + mock_execution_output: MagicMock, + ) -> None: + """Test that run_evaluator creates an Evaluator span with correct attributes.""" + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + ) - def test_evaluator_span_name_format(self): - """Test that Evaluator span name follows the pattern 'Evaluator: {name}'.""" - recorder = SpanRecorder() - evaluator_name = "MyCustomEvaluator" + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) - with recorder.start_as_current_span( - f"Evaluator: {evaluator_name}", - attributes={ - "span_type": "evaluator", - "evaluator_name": evaluator_name, - }, - ): - pass + await runtime.run_evaluator( + evaluator=mock_evaluator, + execution_output=mock_execution_output, + eval_item=mock_eval_item, + evaluation_criteria=None, + ) - span = recorder.spans[0] - assert span["name"] == "Evaluator: MyCustomEvaluator" - assert span["name"].startswith("Evaluator: ") + # Verify Evaluator span was created + evaluator_spans = capturing_tracer.get_spans_by_type("evaluator") + assert len(evaluator_spans) == 1 + + span = evaluator_spans[0] + assert span["name"] == "Evaluator: AccuracyEvaluator" + assert span["attributes"]["span_type"] == "evaluator" + assert span["attributes"]["evaluator_id"] == "accuracy-evaluator" + assert span["attributes"]["evaluator_name"] == "AccuracyEvaluator" + assert span["attributes"]["eval_item_id"] == "eval-item-456" + + @pytest.mark.asyncio + async def test_multiple_evaluators_create_multiple_spans( + self, + capturing_tracer: SpanCapturingTracer, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + mock_eval_item: MagicMock, + mock_execution_output: MagicMock, + ) -> None: + """Test that running multiple evaluators creates multiple spans.""" + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + ) + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) -class TestExecutionIdTracking: - """Tests for execution.id tracking in spans.""" + evaluator_names = ["Accuracy", "Relevance", "Fluency"] + for name in evaluator_names: + evaluator = MagicMock(spec=BaseEvaluator) + evaluator.id = f"{name.lower()}-id" + evaluator.name = name + evaluator.validate_and_evaluate_criteria = AsyncMock( + return_value=NumericEvaluationResult(score=0.9) + ) + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=mock_execution_output, + eval_item=mock_eval_item, + evaluation_criteria=None, + ) + + evaluator_spans = 
capturing_tracer.get_spans_by_type("evaluator") + assert len(evaluator_spans) == 3 - def test_each_evaluation_has_unique_execution_id(self): + span_names = [s["name"] for s in evaluator_spans] + assert "Evaluator: Accuracy" in span_names + assert "Evaluator: Relevance" in span_names + assert "Evaluator: Fluency" in span_names + + +class TestSpanAttributeValues: + """Tests for verifying specific span attribute values.""" + + @pytest.fixture + def capturing_tracer(self) -> SpanCapturingTracer: + return SpanCapturingTracer() + + @pytest.fixture + def mock_trace_manager(self, capturing_tracer: SpanCapturingTracer) -> MagicMock: + trace_manager = MagicMock() + trace_manager.tracer_provider.get_tracer.return_value = capturing_tracer + trace_manager.tracer_span_processors = [] + return trace_manager + + @pytest.fixture + def mock_factory(self) -> MagicMock: + factory = MagicMock() + return factory + + @pytest.fixture + def mock_event_bus(self) -> MagicMock: + event_bus = MagicMock() + event_bus.publish = AsyncMock() + return event_bus + + @pytest.mark.asyncio + async def test_evaluation_span_has_unique_execution_id( + self, + capturing_tracer: SpanCapturingTracer, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + ) -> None: """Test that each Evaluation span gets a unique execution.id.""" - recorder = SpanRecorder() - execution_ids = [] - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, - ): - for i in range(3): - exec_id = str(uuid.uuid4()) - execution_ids.append(exec_id) - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": exec_id, - "span_type": "evaluation", - "eval_item_id": f"item-{i}", - "eval_item_name": f"Item {i}", - }, - ): - pass - - # Verify all execution IDs are unique - assert len(set(execution_ids)) == 3 - - # Verify each evaluation span has its execution.id - evaluation_spans = recorder.get_spans_by_type("evaluation") - for i, span in enumerate(evaluation_spans): - assert span["attributes"]["execution.id"] == execution_ids[i] - - def test_eval_set_run_does_not_have_execution_id(self): - """Test that EvalSetRun span does NOT have execution.id. - - This is intentional to prevent ID propagation to child spans. 
- """ - recorder = SpanRecorder() - - with recorder.start_as_current_span( - "Evaluation Set Run", - attributes={"span_type": "eval_set_run"}, - ): - pass + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + ) - eval_set_run = recorder.spans[0] - assert "execution.id" not in eval_set_run["attributes"] + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) + mock_runtime = AsyncMock() + mock_execution_output = MagicMock() + mock_execution_output.result.output = {} + mock_execution_output.result.status = "successful" + mock_execution_output.result.error = None + mock_execution_output.spans = [] + mock_execution_output.logs = [] + + from uipath._cli._evals._models._evaluation_set import EvaluationItem + + for i in range(3): + eval_item = EvaluationItem( + id=f"item-{i}", + name=f"Test {i}", + inputs={}, + evaluation_criterias={}, + ) + + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=mock_execution_output), + ): + await runtime._execute_eval(eval_item, [], mock_runtime) -class TestEvaluatorSpanEvalItemId: - """Tests for eval_item_id in evaluator spans.""" + # Get execution IDs from spans + evaluation_spans = capturing_tracer.get_spans_by_type("evaluation") + execution_ids = [s["attributes"]["execution.id"] for s in evaluation_spans] - def test_evaluator_span_has_eval_item_id(self): - """Test that Evaluator span includes the eval_item_id.""" - recorder = SpanRecorder() - eval_item_id = "item-specific-123" + # All execution IDs should be unique + assert len(set(execution_ids)) == 3 - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": eval_item_id, - "eval_item_name": "Test", - }, - ): - with recorder.start_as_current_span( - "Evaluator: Test", - attributes={ - "span_type": "evaluator", - "evaluator_id": "test-1", - "evaluator_name": "Test", - "eval_item_id": eval_item_id, - }, - ): - pass + @pytest.mark.asyncio + async def test_evaluator_span_inherits_eval_item_id( + self, + capturing_tracer: SpanCapturingTracer, + mock_trace_manager: MagicMock, + mock_factory: MagicMock, + mock_event_bus: MagicMock, + ) -> None: + """Test that Evaluator span contains the same eval_item_id as its parent Evaluation.""" + context = create_eval_context( + eval_set="test.json", + entrypoint="main.py:main", + ) - evaluator_span = recorder.get_spans_by_type("evaluator")[0] - assert evaluator_span["attributes"]["eval_item_id"] == eval_item_id - - def test_evaluator_and_evaluation_share_eval_item_id(self): - """Test that Evaluator and Evaluation spans share the same eval_item_id.""" - recorder = SpanRecorder() - eval_item_id = "shared-item-456" - - with recorder.start_as_current_span( - "Evaluation", - attributes={ - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": eval_item_id, - "eval_item_name": "Test", - }, - ): - with recorder.start_as_current_span( - "Evaluator: Test", - attributes={ - "span_type": "evaluator", - "evaluator_id": "test-1", - "evaluator_name": "Test", - "eval_item_id": eval_item_id, - }, - ): - pass + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) - evaluation_span = recorder.get_spans_by_type("evaluation")[0] - evaluator_span = recorder.get_spans_by_type("evaluator")[0] + eval_item = MagicMock() + eval_item.id = 
"shared-item-id-789" + eval_item.name = "Test" + eval_item.inputs = {} + eval_item.expected_agent_behavior = None + + mock_execution_output = MagicMock() + mock_execution_output.result.output = {} + mock_execution_output.spans = [] + + evaluator = MagicMock(spec=BaseEvaluator) + evaluator.id = "test-evaluator" + evaluator.name = "TestEvaluator" + evaluator.validate_and_evaluate_criteria = AsyncMock( + return_value=NumericEvaluationResult(score=1.0) + ) - assert ( - evaluation_span["attributes"]["eval_item_id"] - == evaluator_span["attributes"]["eval_item_id"] + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=mock_execution_output, + eval_item=eval_item, + evaluation_criteria=None, ) + + evaluator_span = capturing_tracer.get_spans_by_type("evaluator")[0] + assert evaluator_span["attributes"]["eval_item_id"] == "shared-item-id-789"