Skip to content

Commit 8bd7e2e

Browse files
committed
Add evals
1 parent 7d2aeda commit 8bd7e2e

20 files changed

+1885
-1
lines changed
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Procurement Agent Evals
2+
3+
Integration tests for the procurement agent that verify tool calls and database state.
4+
5+
## Prerequisites
6+
7+
1. AgentEx backend running (`make dev` from scale-agentex)
8+
2. Procurement agent running:
9+
```bash
10+
cd examples/demos/procurement_agent
11+
export ENVIRONMENT=development
12+
uv run agentex agents run --manifest manifest.yaml
13+
```
14+
15+
## Running Tests
16+
17+
From the `procurement_agent` directory:
18+
19+
```bash
20+
# Run all tests
21+
cd evals && uv run pytest
22+
23+
# Run specific test file
24+
cd evals && uv run pytest tasks/test_shipment_departed.py -v
25+
26+
# Run single test
27+
cd evals && uv run pytest tasks/test_shipment_departed.py::test_departed_01_no_flag_5_days_early -v
28+
```
29+
30+
## Test Structure
31+
32+
| File | Event Type | Focus |
33+
|------|------------|-------|
34+
| `test_submittal_approved.py` | Submittal_Approved | PO issued, DB entry |
35+
| `test_shipment_departed.py` | Shipment_Departed | **False positive detection** |
36+
| `test_shipment_arrived.py` | Shipment_Arrived | Team notification, inspection |
37+
| `test_inspection_failed.py` | Inspection_Failed | Human-in-the-loop |
38+
| `test_inspection_passed.py` | Inspection_Passed | Status update |
39+
40+
## Test Cases Summary
41+
42+
| Event | Tests | Key Assertions |
43+
|-------|-------|----------------|
44+
| Submittal_Approved | 2 | `issue_purchase_order` called, DB item created |
45+
| Shipment_Departed | 6 | Forbidden: `flag_potential_issue` when ETA < required_by |
46+
| Shipment_Arrived | 2 | `notify_team`, `schedule_inspection` called |
47+
| Inspection_Failed | 3 | Human-in-loop: approve, approve+extra, reject+delete |
48+
| Inspection_Passed | 2 | Forbidden: `wait_for_human`, `flag_potential_issue` |
49+
50+
## Graders
51+
52+
- **tool_calls.py**: Verifies required and forbidden tool calls in transcripts
53+
- **database.py**: Verifies database state changes
54+
55+
## False Positive Detection
56+
57+
The `test_shipment_departed.py` tests are designed to catch false positives: cases where the agent calls `flag_potential_issue` even though the shipment's ETA is still earlier than the item's `required_by` date.
58+
59+
**Conflict logic:**
60+
- **Flag if** ETA >= required_by (zero/negative buffer)
61+
- **Don't flag if** ETA < required_by (has buffer remaining)
62+
63+
The tests use `assert_forbidden_tools(["flag_potential_issue"])` to catch cases where the agent incorrectly escalates.

examples/demos/procurement_agent/evals/__init__.py

Whitespace-only changes.
Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
"""
2+
Pytest fixtures for procurement agent evals.
3+
4+
Provides workflow setup, transcript extraction, and human input simulation.
5+
"""
6+
import os
7+
import uuid
8+
import asyncio
9+
from typing import Any, AsyncGenerator
10+
11+
import pytest
12+
import pytest_asyncio
13+
from temporalio.client import Client, WorkflowHandle
14+
15+
from datetime import datetime as dt
16+
from agentex.lib.types.acp import CreateTaskParams
17+
from agentex.types.task import Task
18+
from agentex.types.agent import Agent
19+
20+
21+
# Local-development defaults. setdefault() leaves any value that is already
# present in the environment untouched.
for _var, _fallback in (
    ("AGENT_NAME", "procurement-agent"),
    ("ACP_URL", "http://localhost:8000"),
    ("WORKFLOW_NAME", "procurement-agent"),
    ("WORKFLOW_TASK_QUEUE", "procurement_agent_queue"),
    ("TEMPORAL_ADDRESS", "localhost:7233"),
):
    os.environ.setdefault(_var, _fallback)
del _var, _fallback  # keep the module namespace free of loop temporaries
27+
28+
29+
@pytest.fixture(scope="session")
def event_loop():
    """Provide a single asyncio event loop for the whole test session.

    The loop is created from the active event loop policy and closed once
    the fixture is torn down.
    """
    policy = asyncio.get_event_loop_policy()
    session_loop = policy.new_event_loop()
    yield session_loop
    session_loop.close()
35+
36+
37+
@pytest_asyncio.fixture(scope="session")
async def temporal_client() -> AsyncGenerator[Client, None]:
    """Connect to Temporal once and share the client for the test session.

    The server address is read from ``TEMPORAL_ADDRESS`` and falls back to
    ``localhost:7233``.
    """
    address = os.environ.get("TEMPORAL_ADDRESS", "localhost:7233")
    connected_client = await Client.connect(address)
    # Nothing follows the yield: the client is not explicitly closed.
    yield connected_client
45+
46+
47+
@pytest_asyncio.fixture
async def workflow_handle(temporal_client: Client) -> AsyncGenerator[WorkflowHandle, None]:
    """
    Start a fresh workflow for each test.

    Generates a unique ``eval-<uuid>`` workflow ID, starts the workflow named
    by ``WORKFLOW_NAME`` on the ``WORKFLOW_TASK_QUEUE`` task queue, and yields
    the handle for sending signals and reading history. The workflow is
    terminated on teardown.

    Args:
        temporal_client: Session-scoped Temporal client.

    Yields:
        The handle of the newly started workflow.
    """
    import logging  # local import so this fixture does not rely on a new module-level import

    workflow_id = f"eval-{uuid.uuid4()}"
    task_queue = os.environ.get("WORKFLOW_TASK_QUEUE", "procurement_agent_queue")
    workflow_name = os.environ.get("WORKFLOW_NAME", "procurement-agent")

    # Create agent and task params
    now = dt.now()
    agent = Agent(
        id="procurement-agent",
        name="procurement-agent",
        acp_type="agentic",
        description="Procurement agent for construction delivery management",
        created_at=now,
        updated_at=now,
    )
    task = Task(id=workflow_id)
    params = CreateTaskParams(agent=agent, task=task)

    # Start the workflow
    handle = await temporal_client.start_workflow(
        workflow_name,
        params,
        id=workflow_id,
        task_queue=task_queue,
    )

    # Give workflow time to initialize
    await asyncio.sleep(2)

    yield handle

    # Cleanup: terminate the workflow after the test. This stays best-effort
    # (the workflow may have already completed), but the failure is logged
    # instead of being silently discarded so real teardown problems are visible.
    try:
        await handle.terminate("Test completed")
    except Exception as exc:
        logging.getLogger(__name__).debug(
            "Could not terminate workflow %s during cleanup: %s", workflow_id, exc
        )
90+
91+
92+
async def send_event(handle: WorkflowHandle, event: Any) -> None:
    """
    Deliver an event to the workflow through its ``send_event`` signal.

    Args:
        handle: The workflow handle
        event: An object exposing ``model_dump_json()`` (a Pydantic event
            model); its JSON string is what gets sent as the signal argument
    """
    await handle.signal("send_event", event.model_dump_json())
102+
103+
104+
async def send_human_response(handle: WorkflowHandle, response: str) -> None:
    """
    Send a human response to the workflow.

    This simulates a user responding in the UI to a wait_for_human escalation.
    The text is wrapped in a user-authored ``TextContent`` event and delivered
    through the workflow's ``receive_event`` signal.

    Args:
        handle: The workflow handle
        response: The human's text response
    """
    # Function-local imports for the names used only here. Agent and Task are
    # already imported at module level, so they are not re-imported.
    from agentex.lib.types.acp import SendEventParams
    from agentex.types.event import Event
    from agentex.types.text_content import TextContent

    now = dt.now()
    agent = Agent(
        id="procurement-agent",
        name="procurement-agent",
        acp_type="agentic",
        description="Procurement agent for construction delivery management",
        created_at=now,
        updated_at=now,
    )
    task = Task(id=handle.id)
    event = Event(
        id=str(uuid.uuid4()),
        agent_id="procurement-agent",
        task_id=handle.id,
        # NOTE(review): sequence_id is always 1, even when several responses
        # are sent to the same workflow - confirm the agent does not rely on it.
        sequence_id=1,
        content=TextContent(author="user", content=response),
    )
    params = SendEventParams(agent=agent, task=task, event=event)

    await handle.signal("receive_event", params)
141+
142+
143+
async def wait_for_processing(handle: WorkflowHandle, timeout_seconds: float = 60) -> None:
    """
    Pause to give the workflow time to finish processing an event.

    This does NOT poll or inspect the workflow: it always sleeps for the full
    ``timeout_seconds``, whether or not the agent finished earlier.

    Args:
        handle: The workflow handle (currently unused; kept so a polling
            implementation can replace the sleep without changing callers)
        timeout_seconds: Number of seconds to sleep (not an upper bound)
    """
    # Simple approach: wait a fixed time for agent to process.
    # In production, you'd poll workflow state more intelligently.
    await asyncio.sleep(timeout_seconds)
156+
157+
158+
async def get_workflow_transcript(handle: WorkflowHandle) -> list[dict[str, Any]]:
    """
    Extract a tool-call transcript from the workflow's history events.

    Each history event is inspected for activity attributes:

    - an activity-completed event adds
      ``{"type": "activity_completed", "result": <str or None>}``
    - an activity-scheduled event adds
      ``{"type": "function_call", "name": <activity type name or None>}``

    Args:
        handle: The workflow handle

    Returns:
        List of transcript entries in history order
    """

    def _populated(event: Any, field: str) -> Any:
        """Return ``event.<field>`` only when it is actually set, else None."""
        has_field = getattr(event, "HasField", None)
        if callable(has_field):
            # Protobuf messages expose every field as an attribute and hand
            # back a default instance for unset sub-messages, so hasattr() and
            # truthiness cannot tell which attributes an event carries.
            return getattr(event, field) if has_field(field) else None
        # Plain objects: fall back to attribute presence and truthiness.
        return getattr(event, field, None) or None

    transcript: list[dict[str, Any]] = []

    async for event in handle.fetch_history_events():
        # Activity completed: the result carries the tool execution output.
        completed = _populated(event, "activity_task_completed_event_attributes")
        if completed is not None and hasattr(completed, "result"):
            transcript.append({
                "type": "activity_completed",
                "result": str(completed.result) if completed.result else None,
            })

        # Activity scheduled: the activity type name is the tool name.
        scheduled = _populated(event, "activity_task_scheduled_event_attributes")
        if scheduled is not None and hasattr(scheduled, "activity_type"):
            activity_type = scheduled.activity_type
            transcript.append({
                "type": "function_call",
                "name": activity_type.name if activity_type else None,
            })

    return transcript
199+
200+
201+
async def get_transcript_event_count(handle: WorkflowHandle) -> int:
    """Return how many entries the workflow transcript currently holds."""
    return len(await get_workflow_transcript(handle))
205+
206+
207+
def get_new_tool_calls(
    full_transcript: list[dict[str, Any]],
    previous_count: int,
) -> list[dict[str, Any]]:
    """
    Return the transcript entries recorded after a checkpoint.

    Args:
        full_transcript: The complete transcript from get_workflow_transcript
        previous_count: The transcript length captured before the event was sent

    Returns:
        A new list holding every entry from index ``previous_count`` onward
    """
    entries_since_checkpoint = full_transcript[previous_count:]
    return entries_since_checkpoint
222+
223+
224+
def get_workflow_id(handle: WorkflowHandle) -> str:
    """Return the ``id`` attribute of the given workflow handle."""
    workflow_id = handle.id
    return workflow_id

examples/demos/procurement_agent/evals/fixtures/__init__.py

Whitespace-only changes.
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
"""
2+
Event fixtures for eval test cases.
3+
4+
Provides factory functions to create events with configurable parameters.
5+
"""
6+
from datetime import datetime, timedelta
7+
from typing import Optional
8+
9+
from project.models.events import (
10+
EventType,
11+
SubmitalApprovalEvent,
12+
ShipmentDepartedFactoryEvent,
13+
ShipmentArrivedSiteEvent,
14+
InspectionFailedEvent,
15+
InspectionPassedEvent,
16+
)
17+
18+
19+
def create_submittal_approved(item: str) -> SubmitalApprovalEvent:
    """Build a Submittal_Approved event for ``item``.

    The document name is ``"<item> Submittal.pdf"``; the document URL uses the
    lower-cased item name with spaces replaced by underscores.
    """
    slug = item.lower().replace(' ', '_')
    return SubmitalApprovalEvent(
        event_type=EventType.SUBMITTAL_APPROVED,
        item=item,
        document_name=f"{item} Submittal.pdf",
        document_url=f"/submittals/{slug}.pdf",
    )
27+
28+
29+
def create_shipment_departed(
    item: str,
    eta: datetime,
    date_departed: Optional[datetime] = None,
) -> ShipmentDepartedFactoryEvent:
    """
    Build a Shipment_Departed_Factory event.

    Args:
        item: The item name
        eta: Estimated time of arrival (this is what gets compared to required_by)
        date_departed: When the shipment left the factory; when omitted it is
            set to 7 days before ``eta``
    """
    departed_on = eta - timedelta(days=7) if date_departed is None else date_departed

    return ShipmentDepartedFactoryEvent(
        event_type=EventType.SHIPMENT_DEPARTED_FACTORY,
        item=item,
        eta=eta,
        date_departed=departed_on,
        location_address="218 W 18th St, New York, NY 10011",
    )
52+
53+
54+
def create_shipment_arrived(
    item: str,
    date_arrived: datetime,
) -> ShipmentArrivedSiteEvent:
    """Build a Shipment_Arrived_Site event for ``item`` with a fixed site address."""
    arrival_event = ShipmentArrivedSiteEvent(
        event_type=EventType.SHIPMENT_ARRIVED_SITE,
        item=item,
        date_arrived=date_arrived,
        location_address="650 Townsend St, San Francisco, CA 94103",
    )
    return arrival_event
65+
66+
67+
def create_inspection_failed(
    item: str,
    inspection_date: Optional[datetime] = None,
) -> InspectionFailedEvent:
    """Build an Inspection_Failed event for ``item``.

    Args:
        item: The item name
        inspection_date: When the inspection took place; when omitted,
            ``datetime.now()`` is used
    """
    inspected_on = datetime.now() if inspection_date is None else inspection_date
    slug = item.lower().replace(' ', '_')

    return InspectionFailedEvent(
        event_type=EventType.INSPECTION_FAILED,
        item=item,
        inspection_date=inspected_on,
        document_name=f"{item} Inspection Report.pdf",
        document_url=f"/inspections/{slug}_failed.pdf",
    )
82+
83+
84+
def create_inspection_passed(
    item: str,
    inspection_date: Optional[datetime] = None,
) -> InspectionPassedEvent:
    """Build an Inspection_Passed event for ``item``.

    Args:
        item: The item name
        inspection_date: When the inspection took place; when omitted,
            ``datetime.now()`` is used
    """
    inspected_on = datetime.now() if inspection_date is None else inspection_date
    slug = item.lower().replace(' ', '_')

    return InspectionPassedEvent(
        event_type=EventType.INSPECTION_PASSED,
        item=item,
        inspection_date=inspected_on,
        document_name=f"{item} Inspection Report.pdf",
        document_url=f"/inspections/{slug}_passed.pdf",
    )
99+
100+
101+
# Default schedule reference, keyed by item name. Intended to mirror
# DEFAULT_SCHEDULE in database.py (keep the two in sync).
# Each row is (item, required_by as an ISO date string, buffer_days).
SCHEDULE_REFERENCE = {
    item: {"required_by": required_by, "buffer_days": buffer_days}
    for item, required_by, buffer_days in (
        ("Steel Beams", "2026-02-15", 5),
        ("HVAC Units", "2026-03-01", 7),
        ("Windows", "2026-03-15", 10),
        ("Flooring Materials", "2026-04-01", 3),
        ("Electrical Panels", "2026-04-15", 5),
    )
}

examples/demos/procurement_agent/evals/graders/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)