Skip to content

Commit 5966112

Browse files
committed
Add evals
1 parent 7d2aeda commit 5966112

21 files changed

+1895
-2679
lines changed
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Procurement Agent Evals
2+
3+
Integration tests for the procurement agent that verify tool calls and database state.
4+
5+
## Prerequisites
6+
7+
1. AgentEx backend running (run `make dev` from the `scale-agentex` repository)
8+
2. Procurement agent running:
9+
```bash
10+
cd examples/demos/procurement_agent
11+
export ENVIRONMENT=development
12+
uv run agentex agents run --manifest manifest.yaml
13+
```
14+
15+
## Running Tests
16+
17+
From the `procurement_agent` directory:
18+
19+
```bash
20+
# Run all tests
21+
cd evals && uv run pytest
22+
23+
# Run specific test file
24+
cd evals && uv run pytest tasks/test_shipment_departed.py -v
25+
26+
# Run single test
27+
cd evals && uv run pytest tasks/test_shipment_departed.py::test_departed_01_no_flag_5_days_early -v
28+
```
29+
30+
## Test Structure
31+
32+
| File | Event Type | Focus |
33+
|------|------------|-------|
34+
| `test_submittal_approved.py` | Submittal_Approved | PO issued, DB entry |
35+
| `test_shipment_departed.py` | Shipment_Departed | **False positive detection** |
36+
| `test_shipment_arrived.py` | Shipment_Arrived | Team notification, inspection |
37+
| `test_inspection_failed.py` | Inspection_Failed | Human-in-the-loop |
38+
| `test_inspection_passed.py` | Inspection_Passed | Status update |
39+
40+
## Test Cases Summary
41+
42+
| Event | Tests | Key Assertions |
43+
|-------|-------|----------------|
44+
| Submittal_Approved | 2 | `issue_purchase_order` called, DB item created |
45+
| Shipment_Departed | 6 | Forbidden: `flag_potential_issue` when ETA < required_by |
46+
| Shipment_Arrived | 2 | `notify_team`, `schedule_inspection` called |
47+
| Inspection_Failed | 3 | Human-in-the-loop: approve, approve+extra, reject+delete |
48+
| Inspection_Passed | 2 | Forbidden: `wait_for_human`, `flag_potential_issue` |
49+
50+
## Graders
51+
52+
- **tool_calls.py**: Verifies required and forbidden tool calls in transcripts
53+
- **database.py**: Verifies database state changes
54+
55+
## False Positive Detection
56+
57+
The `test_shipment_departed.py` tests are specifically designed to catch false positives: cases where the agent flags a schedule conflict even though the shipment's ETA is still before the required-by date.
58+
59+
**Conflict logic:**
60+
- **Flag if** ETA >= required_by (zero/negative buffer)
61+
- **Don't flag if** ETA < required_by (has buffer remaining)
62+
63+
The tests use `assert_forbidden_tools(["flag_potential_issue"])` to catch cases where the agent incorrectly escalates.

examples/demos/procurement_agent/evals/__init__.py

Whitespace-only changes.
Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
"""
2+
Pytest fixtures for procurement agent evals.
3+
4+
Provides workflow setup, transcript extraction, and human input simulation.
5+
"""
6+
from __future__ import annotations
7+
8+
import os
9+
import uuid
10+
import asyncio
11+
from typing import Any, AsyncGenerator
12+
from datetime import datetime as dt
13+
14+
import pytest
15+
import pytest_asyncio
16+
from temporalio.client import Client, WorkflowHandle
17+
18+
from agentex.types.task import Task
19+
from agentex.types.agent import Agent
20+
from agentex.lib.types.acp import CreateTaskParams
21+
22+
# Defaults for running the evals against a local development stack.
# setdefault() leaves any value already present in the environment untouched.
_LOCAL_DEV_DEFAULTS = (
    ("AGENT_NAME", "procurement-agent"),
    ("ACP_URL", "http://localhost:8000"),
    ("WORKFLOW_NAME", "procurement-agent"),
    ("WORKFLOW_TASK_QUEUE", "procurement_agent_queue"),
    ("TEMPORAL_ADDRESS", "localhost:7233"),
)
for _env_name, _env_value in _LOCAL_DEV_DEFAULTS:
    os.environ.setdefault(_env_name, _env_value)
28+
29+
30+
@pytest.fixture(scope="session")
def event_loop():
    """Provide one asyncio event loop for the whole test session.

    The loop is created from the active event-loop policy, yielded to the
    tests, and closed when the fixture is torn down.
    """
    policy = asyncio.get_event_loop_policy()
    session_loop = policy.new_event_loop()
    yield session_loop
    session_loop.close()
36+
37+
38+
@pytest_asyncio.fixture(scope="session")
async def temporal_client() -> AsyncGenerator[Client, None]:
    """Connect to Temporal once and share the client across the session.

    The server address is read from ``TEMPORAL_ADDRESS`` and falls back to
    ``localhost:7233`` when that variable is not set.
    """
    server_address = os.environ.get("TEMPORAL_ADDRESS", "localhost:7233")
    connected_client = await Client.connect(server_address)
    yield connected_client
    # No teardown step: the client is not explicitly closed.
46+
47+
48+
@pytest_asyncio.fixture
async def workflow_handle(temporal_client: Client) -> AsyncGenerator[WorkflowHandle, None]:
    """
    Start a dedicated procurement-agent workflow for a single test.

    A unique ``eval-<uuid>`` workflow ID is generated, the workflow is
    started on the configured task queue, and its handle is yielded so the
    test can signal and inspect it. After the test the workflow is
    terminated on a best-effort basis.
    """
    eval_workflow_id = f"eval-{uuid.uuid4()}"
    queue_name = os.environ.get("WORKFLOW_TASK_QUEUE", "procurement_agent_queue")
    workflow_type = os.environ.get("WORKFLOW_NAME", "procurement-agent")

    # Build the task-creation payload that is passed as the workflow argument.
    timestamp = dt.now()
    start_params = CreateTaskParams(
        agent=Agent(
            id="procurement-agent",
            name="procurement-agent",
            acp_type="agentic",
            description="Procurement agent for construction delivery management",
            created_at=timestamp,
            updated_at=timestamp,
        ),
        task=Task(id=eval_workflow_id),
        params=None,
    )

    started = await temporal_client.start_workflow(
        workflow_type,
        start_params,
        id=eval_workflow_id,
        task_queue=queue_name,
    )

    # Fixed pause so the workflow has time to initialize before the test
    # starts sending it signals.
    await asyncio.sleep(2)

    yield started

    # Teardown: terminating is best-effort because the workflow may already
    # have completed by the time the test finishes.
    try:
        await started.terminate("Test completed")
    except Exception:
        pass
91+
92+
93+
async def send_event(handle: WorkflowHandle, event: Any) -> None:
    """
    Deliver *event* to the workflow through its ``send_event`` signal.

    Args:
        handle: Handle of the workflow that should receive the event.
        event: Object exposing ``model_dump_json()`` (for example a Pydantic
            model); its JSON string form is what gets sent.
    """
    serialized = event.model_dump_json()
    await handle.signal("send_event", serialized)
103+
104+
105+
async def send_human_response(handle: WorkflowHandle, response: str) -> None:
    """
    Deliver a simulated human reply to the workflow.

    The text is wrapped in a user-authored event addressed to the workflow's
    task and sent through the ``receive_event`` signal, standing in for a
    user answering a wait_for_human escalation in the UI.

    Args:
        handle: Handle of the workflow awaiting the reply.
        response: The reply text attributed to the user.
    """
    # Function-scope imports, kept local to avoid circular imports.
    from agentex.types.task import Task
    from agentex.types.agent import Agent
    from agentex.types.event import Event
    from agentex.lib.types.acp import SendEventParams
    from agentex.types.text_content import TextContent

    timestamp = dt.now()
    responding_agent = Agent(
        id="procurement-agent",
        name="procurement-agent",
        acp_type="agentic",
        description="Procurement agent for construction delivery management",
        created_at=timestamp,
        updated_at=timestamp,
    )
    target_task = Task(id=handle.id)
    human_event = Event(
        id=str(uuid.uuid4()),
        agent_id="procurement-agent",
        task_id=handle.id,
        sequence_id=1,
        content=TextContent(author="user", content=response),
    )
    signal_payload = SendEventParams(
        agent=responding_agent,
        task=target_task,
        event=human_event,
    )

    await handle.signal("receive_event", signal_payload)
142+
143+
144+
async def wait_for_processing(_handle: WorkflowHandle, timeout_seconds: float = 60) -> None:
    """
    Give the workflow time to finish processing an event.

    This does not inspect the workflow in any way: it unconditionally sleeps
    for ``timeout_seconds`` and then returns. The value is therefore the
    full wait of every call, not an upper bound that can be cut short.

    Args:
        _handle: The workflow handle. Currently unused; kept in the
            signature so callers need not change if polling is added later.
        timeout_seconds: Number of seconds to sleep (default 60).
    """
    # Fixed delay rather than polling workflow state.
    await asyncio.sleep(timeout_seconds)
157+
158+
159+
def _event_attrs(event: Any, field: str) -> Any:
    """Return ``event.<field>`` only when that attribute group is actually set, else None."""
    has_field = getattr(event, "HasField", None)
    if callable(has_field):
        # Protobuf messages expose every declared field as an attribute and
        # return an empty default message for unset ones, so plain hasattr()
        # and truthiness cannot tell which kind of event this is.
        try:
            return getattr(event, field) if has_field(field) else None
        except ValueError:
            # HasField rejects names that are not fields of the message.
            return None
    # Duck-typed events without HasField: fall back to attribute presence.
    return getattr(event, field, None) or None


async def get_workflow_transcript(handle: WorkflowHandle) -> list[dict[str, Any]]:
    """
    Build a transcript of activity events from the workflow's history.

    Iterates ``handle.fetch_history_events()`` and records, in history order:

    - ``{"type": "activity_completed", "result": <str or None>}`` for each
      event whose activity-task-completed attributes are set;
    - ``{"type": "function_call", "name": <activity type name or None>}``
      for each event whose activity-task-scheduled attributes are set.

    Events of any other kind contribute nothing to the transcript.

    Args:
        handle: The workflow handle whose history is read.

    Returns:
        List of transcript entry dicts as described above.
    """
    transcript: list[dict[str, Any]] = []

    async for event in handle.fetch_history_events():
        # Activity completions carry the activity's result.
        completed = _event_attrs(event, "activity_task_completed_event_attributes")
        if completed is not None and hasattr(completed, "result"):
            transcript.append({
                "type": "activity_completed",
                "result": str(completed.result) if completed.result else None,
            })

        # Activity scheduling carries the activity type name (the tool name).
        scheduled = _event_attrs(event, "activity_task_scheduled_event_attributes")
        if scheduled is not None and hasattr(scheduled, "activity_type"):
            activity_type = scheduled.activity_type
            transcript.append({
                "type": "function_call",
                "name": activity_type.name if activity_type else None,
            })

    return transcript
200+
201+
202+
async def get_transcript_event_count(handle: WorkflowHandle) -> int:
    """Return how many entries the workflow's transcript currently holds."""
    return len(await get_workflow_transcript(handle))
206+
207+
208+
def get_new_tool_calls(
    full_transcript: list[dict[str, Any]],
    previous_count: int
) -> list[dict[str, Any]]:
    """
    Return the transcript entries added after a checkpoint.

    Args:
        full_transcript: The complete transcript from get_workflow_transcript
        previous_count: Length of the transcript when the checkpoint was taken

    Returns:
        Every entry of ``full_transcript`` from index ``previous_count``
        onward (empty when nothing was added)
    """
    since_checkpoint = slice(previous_count, None)
    return full_transcript[since_checkpoint]
223+
224+
225+
def get_workflow_id(handle: WorkflowHandle) -> str:
    """Return the ``id`` attribute of the given workflow handle."""
    workflow_id = handle.id
    return workflow_id

examples/demos/procurement_agent/evals/fixtures/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)