From d03dd412b426469725b55e8f340395f8f5a1cc6b Mon Sep 17 00:00:00 2001 From: Akshaya Shanbhogue Date: Mon, 29 Dec 2025 15:01:57 -0800 Subject: [PATCH] refactor(EvalRuntimeInstance): evaluate specific runtime instance Doing this will avoid creation of temporary runtimes everywhere. --- pyproject.toml | 2 +- src/uipath/_cli/_evals/_runtime.py | 187 +++++++++---------- tests/cli/eval/test_eval_runtime_metadata.py | 154 +++++---------- uv.lock | 2 +- 4 files changed, 131 insertions(+), 214 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8ba033b00..063de211b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.3.2" +version = "2.3.3" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index 508230a7f..e3b104735 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -26,6 +26,7 @@ from uipath.runtime import ( UiPathExecutionRuntime, UiPathRuntimeFactoryProtocol, + UiPathRuntimeProtocol, UiPathRuntimeResult, UiPathRuntimeStatus, ) @@ -209,9 +210,6 @@ def __init__( self.logs_exporter: ExecutionLogsExporter = ExecutionLogsExporter() self.execution_id = str(uuid.uuid4()) - self.schema: UiPathRuntimeSchema | None = None - self._agent_model: str | None = None - self._metadata_loaded: bool = False self.coverage = coverage.Coverage(branch=True) async def __aenter__(self) -> "UiPathEvalRuntime": @@ -224,34 +222,11 @@ async def __aexit__(self, *args: Any) -> None: self.coverage.stop() self.coverage.report(include=["./*"], show_missing=True) - async def _ensure_metadata_loaded(self) -> None: - """Load metadata (schema, agent model) from a single temporary runtime. - - This method creates one temporary runtime to fetch both schema and agent - model, avoiding the overhead of creating multiple runtimes for metadata - queries. Results are cached for subsequent access. - """ - if self._metadata_loaded: - return - - temp_runtime = await self.factory.new_runtime( - entrypoint=self.context.entrypoint or "", - runtime_id="metadata-query", - ) - try: - self.schema = await temp_runtime.get_schema() - self._agent_model = self._find_agent_model_in_runtime(temp_runtime) - if self._agent_model: - logger.debug(f"Got agent model from runtime: {self._agent_model}") - self._metadata_loaded = True - finally: - await temp_runtime.dispose() - - async def get_schema(self) -> UiPathRuntimeSchema: - await self._ensure_metadata_loaded() - if self.schema is None: + async def get_schema(self, runtime: UiPathRuntimeProtocol) -> UiPathRuntimeSchema: + schema = await runtime.get_schema() + if schema is None: raise ValueError("Schema could not be loaded") - return self.schema + return schema @contextmanager def _mocker_cache(self) -> Iterator[None]: @@ -271,6 +246,7 @@ def _mocker_cache(self) -> Iterator[None]: async def initiate_evaluation( self, + runtime: UiPathRuntimeProtocol, ) -> Tuple[ EvaluationSet, list[BaseEvaluator[Any, Any, Any]], @@ -283,7 +259,7 @@ async def initiate_evaluation( evaluation_set, _ = EvalHelpers.load_eval_set( self.context.eval_set, self.context.eval_ids ) - evaluators = await self._load_evaluators(evaluation_set) + evaluators = await self._load_evaluators(evaluation_set, runtime) await self.event_bus.publish( EvaluationEvents.CREATE_EVAL_SET_RUN, @@ -301,74 +277,84 @@ async def initiate_evaluation( evaluation_set, evaluators, ( - self._execute_eval(eval_item, evaluators) + self._execute_eval(eval_item, evaluators, runtime) for eval_item in evaluation_set.evaluations ), ) async def execute(self) -> UiPathRuntimeResult: - with self._mocker_cache(): - ( - evaluation_set, - evaluators, - evaluation_iterable, - ) = await self.initiate_evaluation() - workers = self.context.workers or 1 - assert workers >= 1 - eval_run_result_list = await execute_parallel(evaluation_iterable, workers) - results = UiPathEvalOutput( - evaluation_set_name=evaluation_set.name, - evaluation_set_results=eval_run_result_list, - ) + runtime = await self.factory.new_runtime( + entrypoint=self.context.entrypoint or "", + runtime_id=self.execution_id, + ) + try: + with self._mocker_cache(): + ( + evaluation_set, + evaluators, + evaluation_iterable, + ) = await self.initiate_evaluation(runtime) + workers = self.context.workers or 1 + assert workers >= 1 + eval_run_result_list = await execute_parallel( + evaluation_iterable, workers + ) + results = UiPathEvalOutput( + evaluation_set_name=evaluation_set.name, + evaluation_set_results=eval_run_result_list, + ) - # Computing evaluator averages - evaluator_averages: dict[str, float] = defaultdict(float) - evaluator_count: dict[str, int] = defaultdict(int) - - # Check if any eval runs failed - any_failed = False - for eval_run_result in results.evaluation_set_results: - # Check if the agent execution had an error - if ( - eval_run_result.agent_execution_output - and eval_run_result.agent_execution_output.result.error - ): - any_failed = True - - for result_dto in eval_run_result.evaluation_run_results: - evaluator_averages[result_dto.evaluator_id] += ( - result_dto.result.score - ) - evaluator_count[result_dto.evaluator_id] += 1 + # Computing evaluator averages + evaluator_averages: dict[str, float] = defaultdict(float) + evaluator_count: dict[str, int] = defaultdict(int) + + # Check if any eval runs failed + any_failed = False + for eval_run_result in results.evaluation_set_results: + # Check if the agent execution had an error + if ( + eval_run_result.agent_execution_output + and eval_run_result.agent_execution_output.result.error + ): + any_failed = True + + for result_dto in eval_run_result.evaluation_run_results: + evaluator_averages[result_dto.evaluator_id] += ( + result_dto.result.score + ) + evaluator_count[result_dto.evaluator_id] += 1 - for eval_id in evaluator_averages: - evaluator_averages[eval_id] = ( - evaluator_averages[eval_id] / evaluator_count[eval_id] + for eval_id in evaluator_averages: + evaluator_averages[eval_id] = ( + evaluator_averages[eval_id] / evaluator_count[eval_id] + ) + await self.event_bus.publish( + EvaluationEvents.UPDATE_EVAL_SET_RUN, + EvalSetRunUpdatedEvent( + execution_id=self.execution_id, + evaluator_scores=evaluator_averages, + success=not any_failed, + ), + wait_for_completion=False, ) - await self.event_bus.publish( - EvaluationEvents.UPDATE_EVAL_SET_RUN, - EvalSetRunUpdatedEvent( - execution_id=self.execution_id, - evaluator_scores=evaluator_averages, - success=not any_failed, - ), - wait_for_completion=False, - ) - result = UiPathRuntimeResult( - output={**results.model_dump(by_alias=True)}, - status=UiPathRuntimeStatus.SUCCESSFUL, - ) - return result + result = UiPathRuntimeResult( + output={**results.model_dump(by_alias=True)}, + status=UiPathRuntimeStatus.SUCCESSFUL, + ) + return result + finally: + await runtime.dispose() async def _execute_eval( self, eval_item: EvaluationItem, evaluators: list[BaseEvaluator[Any, Any, Any]], + runtime: UiPathRuntimeProtocol, ) -> EvaluationRunResult: # Generate LLM-based input if input_mocking_strategy is defined if eval_item.input_mocking_strategy: - eval_item = await self._generate_input_for_eval(eval_item) + eval_item = await self._generate_input_for_eval(eval_item, runtime) execution_id = str(uuid.uuid4()) @@ -389,7 +375,7 @@ async def _execute_eval( try: try: agent_execution_output = await self.execute_runtime( - eval_item, execution_id + eval_item, execution_id, runtime ) except Exception as e: if self.context.verbose: @@ -543,11 +529,11 @@ async def _execute_eval( return evaluation_run_results async def _generate_input_for_eval( - self, eval_item: EvaluationItem + self, eval_item: EvaluationItem, runtime: UiPathRuntimeProtocol ) -> EvaluationItem: """Use LLM to generate a mock input for an evaluation item.""" generated_input = await generate_llm_input( - eval_item, (await self.get_schema()).input + eval_item, (await self.get_schema(runtime)).input ) updated_eval_item = eval_item.model_copy(update={"inputs": generated_input}) return updated_eval_item @@ -565,12 +551,11 @@ def _get_and_clear_execution_data( return spans, logs async def execute_runtime( - self, eval_item: EvaluationItem, execution_id: str + self, + eval_item: EvaluationItem, + execution_id: str, + runtime: UiPathRuntimeProtocol, ) -> UiPathEvalRunExecutionOutput: - runtime = await self.factory.new_runtime( - entrypoint=self.context.entrypoint or "", - runtime_id=execution_id, - ) log_handler = self._setup_execution_logging(execution_id) attributes = { "evalId": eval_item.id, @@ -600,9 +585,6 @@ async def execute_runtime( execution_time=end_time - start_time, ) from e - finally: - await runtime.dispose() - end_time = time() spans, logs = self._get_and_clear_execution_data(execution_id) @@ -652,22 +634,23 @@ async def run_evaluator( return result - async def _get_agent_model(self) -> str | None: + async def _get_agent_model(self, runtime: UiPathRuntimeProtocol) -> str | None: """Get agent model from the runtime. - Uses the cached metadata from _ensure_metadata_loaded(), which creates - a single temporary runtime to fetch both schema and agent model. - Returns: The model name from agent settings, or None if not found. """ try: - await self._ensure_metadata_loaded() - return self._agent_model + model = self._find_agent_model_in_runtime(runtime) + if model: + logger.debug(f"Got agent model from runtime: {model}") + return model except Exception: return None - def _find_agent_model_in_runtime(self, runtime: Any) -> str | None: + def _find_agent_model_in_runtime( + self, runtime: UiPathRuntimeProtocol + ) -> str | None: """Recursively search for get_agent_model in runtime and its delegates. Runtimes may be wrapped (e.g., ResumableRuntime wraps TelemetryWrapper @@ -694,7 +677,7 @@ def _find_agent_model_in_runtime(self, runtime: Any) -> str | None: return None async def _load_evaluators( - self, evaluation_set: EvaluationSet + self, evaluation_set: EvaluationSet, runtime: UiPathRuntimeProtocol ) -> list[BaseEvaluator[Any, Any, Any]]: """Load evaluators referenced by the evaluation set.""" evaluators = [] @@ -704,7 +687,7 @@ async def _load_evaluators( evaluators_dir = Path(eval_set).parent.parent / "evaluators" # Load agent model for 'same-as-agent' resolution in legacy evaluators - agent_model = await self._get_agent_model() + agent_model = await self._get_agent_model(runtime) # If evaluatorConfigs is specified, use that (new field with weights) # Otherwise, fall back to evaluatorRefs (old field without weights) diff --git a/tests/cli/eval/test_eval_runtime_metadata.py b/tests/cli/eval/test_eval_runtime_metadata.py index c11bf122d..ce705c625 100644 --- a/tests/cli/eval/test_eval_runtime_metadata.py +++ b/tests/cli/eval/test_eval_runtime_metadata.py @@ -207,84 +207,6 @@ def test_returns_none_for_none_model(self, eval_runtime): assert result is None -class TestEnsureMetadataLoaded: - """Tests for _ensure_metadata_loaded caching behavior.""" - - @pytest.fixture - def context(self): - """Create eval context.""" - context = UiPathEvalContext() - context.eval_set = str( - Path(__file__).parent / "evals" / "eval-sets" / "default.json" - ) - return context - - async def test_loads_both_schema_and_model(self, context): - """Test that _ensure_metadata_loaded fetches both schema and agent model.""" - - async def create_runtime(): - return AgentModelRuntime("gpt-4o-mini") - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime(context, factory, trace_manager, event_bus) - - # Initially not loaded - assert eval_runtime._metadata_loaded is False - assert eval_runtime.schema is None - assert eval_runtime._agent_model is None - - await eval_runtime._ensure_metadata_loaded() - - # Both should now be loaded - assert eval_runtime._metadata_loaded is True - assert eval_runtime.schema is not None - assert eval_runtime._agent_model == "gpt-4o-mini" - - async def test_creates_only_one_runtime(self, context): - """Test that only one temporary runtime is created for metadata.""" - - async def create_runtime(): - return AgentModelRuntime("test-model") - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime(context, factory, trace_manager, event_bus) - - # Call multiple times - await eval_runtime._ensure_metadata_loaded() - await eval_runtime._ensure_metadata_loaded() - await eval_runtime._ensure_metadata_loaded() - - # Factory should only have been called once - assert factory.new_runtime_call_count == 1 - - async def test_caches_results(self, context): - """Test that results are cached after first load.""" - call_count = 0 - - async def create_runtime(): - nonlocal call_count - call_count += 1 - return AgentModelRuntime(f"model-{call_count}") - - factory = MockFactory(create_runtime) - event_bus = EventBus() - trace_manager = UiPathTraceManager() - eval_runtime = UiPathEvalRuntime(context, factory, trace_manager, event_bus) - - await eval_runtime._ensure_metadata_loaded() - first_model = eval_runtime._agent_model - - await eval_runtime._ensure_metadata_loaded() - second_model = eval_runtime._agent_model - - # Should be the same cached value - assert first_model == second_model == "model-1" - - class TestGetAgentModel: """Tests for _get_agent_model method.""" @@ -308,7 +230,8 @@ async def create_runtime(): trace_manager = UiPathTraceManager() eval_runtime = UiPathEvalRuntime(context, factory, trace_manager, event_bus) - model = await eval_runtime._get_agent_model() + runtime = await create_runtime() + model = await eval_runtime._get_agent_model(runtime) assert model == "gpt-4o-2024-11-20" async def test_returns_none_when_no_model(self, context): @@ -322,40 +245,47 @@ async def create_runtime(): trace_manager = UiPathTraceManager() eval_runtime = UiPathEvalRuntime(context, factory, trace_manager, event_bus) - model = await eval_runtime._get_agent_model() + runtime = await create_runtime() + model = await eval_runtime._get_agent_model(runtime) assert model is None - async def test_returns_cached_model(self, context): - """Test that _get_agent_model uses cached value.""" + async def test_returns_model_consistently(self, context): + """Test that _get_agent_model returns consistent results.""" async def create_runtime(): - return AgentModelRuntime("cached-model") + return AgentModelRuntime("consistent-model") factory = MockFactory(create_runtime) event_bus = EventBus() trace_manager = UiPathTraceManager() eval_runtime = UiPathEvalRuntime(context, factory, trace_manager, event_bus) - # First call loads metadata - model1 = await eval_runtime._get_agent_model() - # Second call should use cache - model2 = await eval_runtime._get_agent_model() + runtime = await create_runtime() + + # Multiple calls should return the same value + model1 = await eval_runtime._get_agent_model(runtime) + model2 = await eval_runtime._get_agent_model(runtime) - assert model1 == model2 == "cached-model" - assert factory.new_runtime_call_count == 1 + assert model1 == model2 == "consistent-model" async def test_handles_exception_gracefully(self, context): """Test that _get_agent_model returns None on exception.""" - async def create_runtime(): - raise RuntimeError("Factory error") + async def create_good_runtime(): + return AgentModelRuntime("model") - factory = MockFactory(create_runtime) + factory = MockFactory(create_good_runtime) event_bus = EventBus() trace_manager = UiPathTraceManager() eval_runtime = UiPathEvalRuntime(context, factory, trace_manager, event_bus) - model = await eval_runtime._get_agent_model() + # Create a bad runtime that raises during get_agent_model + class BadRuntime(BaseTestRuntime): + def get_agent_model(self): + raise RuntimeError("Get model error") + + bad_runtime = BadRuntime() + model = await eval_runtime._get_agent_model(bad_runtime) assert model is None @@ -382,12 +312,13 @@ async def create_runtime(): trace_manager = UiPathTraceManager() eval_runtime = UiPathEvalRuntime(context, factory, trace_manager, event_bus) - schema = await eval_runtime.get_schema() + runtime = await create_runtime() + schema = await eval_runtime.get_schema(runtime) assert schema is not None assert schema.file_path == "test.py" - async def test_returns_cached_schema(self, context): - """Test that get_schema uses cached value.""" + async def test_returns_schema_consistently(self, context): + """Test that get_schema returns consistent results.""" async def create_runtime(): return BaseTestRuntime() @@ -397,16 +328,17 @@ async def create_runtime(): trace_manager = UiPathTraceManager() eval_runtime = UiPathEvalRuntime(context, factory, trace_manager, event_bus) - # First call loads metadata - schema1 = await eval_runtime.get_schema() - # Second call should use cache - schema2 = await eval_runtime.get_schema() + runtime = await create_runtime() - assert schema1 is schema2 - assert factory.new_runtime_call_count == 1 + # Multiple calls should return equivalent values + schema1 = await eval_runtime.get_schema(runtime) + schema2 = await eval_runtime.get_schema(runtime) - async def test_schema_and_model_share_runtime(self, context): - """Test that get_schema and _get_agent_model share the same runtime creation.""" + # Should have the same properties + assert schema1.file_path == schema2.file_path == "test.py" + + async def test_schema_and_model_work_with_same_runtime(self, context): + """Test that get_schema and _get_agent_model work with the same runtime.""" async def create_runtime(): return AgentModelRuntime("shared-model") @@ -416,13 +348,15 @@ async def create_runtime(): trace_manager = UiPathTraceManager() eval_runtime = UiPathEvalRuntime(context, factory, trace_manager, event_bus) - # Call both methods - schema = await eval_runtime.get_schema() - model = await eval_runtime._get_agent_model() + runtime = await create_runtime() + + # Call both methods with the same runtime + schema = await eval_runtime.get_schema(runtime) + model = await eval_runtime._get_agent_model(runtime) - # Should only create one runtime - assert factory.new_runtime_call_count == 1 + # Both should work correctly assert schema is not None + assert schema.file_path == "test.py" assert model == "shared-model" @@ -461,5 +395,5 @@ async def create_runtime(): trace_manager = UiPathTraceManager() eval_runtime = UiPathEvalRuntime(context, factory, trace_manager, event_bus) - model = await eval_runtime._get_agent_model() + model = await eval_runtime._get_agent_model(resumable_runtime) assert model == "gpt-4o-from-agent-json" diff --git a/uv.lock b/uv.lock index e4c400155..05978e05a 100644 --- a/uv.lock +++ b/uv.lock @@ -2477,7 +2477,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.3.2" +version = "2.3.3" source = { editable = "." } dependencies = [ { name = "click" },