scaleapi
diff --git a/‎examples/demos/procurement_agent/evals/README.md‎
Lines changed: 63 additions & 0 deletions b/‎examples/demos/procurement_agent/evals/README.md‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎examples/demos/procurement_agent/evals/__init__.py‎ b/‎examples/demos/procurement_agent/evals/__init__.py‎
diff --git a/‎examples/demos/procurement_agent/evals/conftest.py‎
Lines changed: 226 additions & 0 deletions b/‎examples/demos/procurement_agent/evals/conftest.py‎
Lines changed: 226 additions & 0 deletions
diff --git a/‎examples/demos/procurement_agent/evals/fixtures/__init__.py‎ b/‎examples/demos/procurement_agent/evals/fixtures/__init__.py‎
diff --git a/‎examples/demos/procurement_agent/evals/fixtures/events.py‎
Lines changed: 108 additions & 0 deletions b/‎examples/demos/procurement_agent/evals/fixtures/events.py‎
Lines changed: 108 additions & 0 deletions
diff --git a/‎examples/demos/procurement_agent/evals/graders/__init__.py‎ b/‎examples/demos/procurement_agent/evals/graders/__init__.py‎
@@ -0,0 +1,63 @@
+# Procurement Agent Evals
+
+Integration tests for the procurement agent that verify tool calls and database state.
+
+## Prerequisites
+
+1. AgentEx backend running (`make dev` from scale-agentex)
+2. Procurement agent running:
+   ```bash
+   cd examples/demos/procurement_agent
+   export ENVIRONMENT=development
+   uv run agentex agents run --manifest manifest.yaml
+   ```
+
+## Running Tests
+
+From the `procurement_agent` directory:
+
+```bash
+# Run all tests
+cd evals && uv run pytest
+
+# Run specific test file
+cd evals && uv run pytest tasks/test_shipment_departed.py -v
+
+# Run single test
+cd evals && uv run pytest tasks/test_shipment_departed.py::test_departed_01_no_flag_5_days_early -v
+```
+
+## Test Structure
+
+| File | Event Type | Focus |
+|------|------------|-------|
+| `test_submittal_approved.py` | Submittal_Approved | PO issued, DB entry |
+| `test_shipment_departed.py` | Shipment_Departed | **False positive detection** |
+| `test_shipment_arrived.py` | Shipment_Arrived | Team notification, inspection |
+| `test_inspection_failed.py` | Inspection_Failed | Human-in-the-loop |
+| `test_inspection_passed.py` | Inspection_Passed | Status update |
+
+## Test Cases Summary
+
+| Event | Tests | Key Assertions |
+|-------|-------|----------------|
+| Submittal_Approved | 2 | `issue_purchase_order` called, DB item created |
+| Shipment_Departed | 6 | Forbidden: `flag_potential_issue` when ETA < required_by |
+| Shipment_Arrived | 2 | `notify_team`, `schedule_inspection` called |
+| Inspection_Failed | 3 | Human-in-loop: approve, approve+extra, reject+delete |
+| Inspection_Passed | 2 | Forbidden: `wait_for_human`, `flag_potential_issue` |
+
+## Graders
+
+- **tool_calls.py**: Verifies required and forbidden tool calls in transcripts
+- **database.py**: Verifies database state changes
+
+## False Positive Detection
+
+The `test_shipment_departed.py` tests are specifically designed to catch the false positive issue where the agent incorrectly flags conflicts.
+
+**Conflict logic:**
+- **Flag if** ETA >= required_by (zero/negative buffer)
+- **Don't flag if** ETA < required_by (has buffer remaining)
+
+The tests use `assert_forbidden_tools(["flag_potential_issue"])` to catch cases where the agent incorrectly escalates.
@@ -0,0 +1,226 @@
+"""
+Pytest fixtures for procurement agent evals.
+
+Provides workflow setup, transcript extraction, and human input simulation.
+"""
+import os
+import uuid
+import asyncio
+from typing import Any, AsyncGenerator
+
+import pytest
+import pytest_asyncio
+from temporalio.client import Client, WorkflowHandle
+
+from datetime import datetime as dt
+from agentex.lib.types.acp import CreateTaskParams
+from agentex.types.task import Task
+from agentex.types.agent import Agent
+
+
+# Set environment variables for local development
+os.environ.setdefault("AGENT_NAME", "procurement-agent")
+os.environ.setdefault("ACP_URL", "http://localhost:8000")
+os.environ.setdefault("WORKFLOW_NAME", "procurement-agent")
+os.environ.setdefault("WORKFLOW_TASK_QUEUE", "procurement_agent_queue")
+os.environ.setdefault("TEMPORAL_ADDRESS", "localhost:7233")
+
+
+@pytest.fixture(scope="session")
+def event_loop():
+    """Create an event loop for the test session."""
+    loop = asyncio.get_event_loop_policy().new_event_loop()
+    yield loop
+    loop.close()
+
+
+@pytest_asyncio.fixture(scope="session")
+async def temporal_client() -> AsyncGenerator[Client, None]:
+    """Create a Temporal client for the test session."""
+    client = await Client.connect(
+        os.environ.get("TEMPORAL_ADDRESS", "localhost:7233")
+    )
+    yield client
+    # Client doesn't need explicit close
+
+
+@pytest_asyncio.fixture
+async def workflow_handle(temporal_client: Client) -> AsyncGenerator[WorkflowHandle, None]:
+    """
+    Start a fresh workflow for each test.
+
+    Creates a unique workflow ID and starts the procurement agent workflow.
+    Yields the handle for sending signals and querying state.
+    """
+    workflow_id = f"eval-{uuid.uuid4()}"
+    task_queue = os.environ.get("WORKFLOW_TASK_QUEUE", "procurement_agent_queue")
+    workflow_name = os.environ.get("WORKFLOW_NAME", "procurement-agent")
+
+    # Create agent and task params
+    now = dt.now()
+    agent = Agent(
+        id="procurement-agent",
+        name="procurement-agent",
+        acp_type="agentic",
+        description="Procurement agent for construction delivery management",
+        created_at=now,
+        updated_at=now,
+    )
+    task = Task(id=workflow_id)
+    params = CreateTaskParams(agent=agent, task=task)
+
+    # Start the workflow
+    handle = await temporal_client.start_workflow(
+        workflow_name,
+        params,
+        id=workflow_id,
+        task_queue=task_queue,
+    )
+
+    # Give workflow time to initialize
+    await asyncio.sleep(2)
+
+    yield handle
+
+    # Cleanup: terminate the workflow after test
+    try:
+        await handle.terminate("Test completed")
+    except Exception:
+        pass  # Workflow may have already completed
+
+
+async def send_event(handle: WorkflowHandle, event: Any) -> None:
+    """
+    Send an event to the workflow via signal.
+
+    Args:
+        handle: The workflow handle
+        event: A Pydantic event model (will be serialized to JSON)
+    """
+    event_json = event.model_dump_json()
+    await handle.signal("send_event", event_json)
+
+
+async def send_human_response(handle: WorkflowHandle, response: str) -> None:
+    """
+    Send a human response to the workflow.
+
+    This simulates a user responding in the UI to a wait_for_human escalation.
+
+    Args:
+        handle: The workflow handle
+        response: The human's text response
+    """
+    # Import here to avoid circular imports
+    from agentex.lib.types.acp import SendEventParams
+    from agentex.types.task import Task
+    from agentex.types.agent import Agent
+    from agentex.types.event import Event
+    from agentex.types.text_content import TextContent
+
+    now = dt.now()
+    agent = Agent(
+        id="procurement-agent",
+        name="procurement-agent",
+        acp_type="agentic",
+        description="Procurement agent for construction delivery management",
+        created_at=now,
+        updated_at=now,
+    )
+    task = Task(id=handle.id)
+    event = Event(
+        id=str(uuid.uuid4()),
+        agent_id="procurement-agent",
+        task_id=handle.id,
+        sequence_id=1,
+        content=TextContent(author="user", content=response),
+    )
+    params = SendEventParams(agent=agent, task=task, event=event)
+
+    await handle.signal("receive_event", params)
+
+
+async def wait_for_processing(handle: WorkflowHandle, timeout_seconds: float = 60) -> None:
+    """
+    Wait for the workflow to finish processing an event.
+
+    Polls the workflow until no more activities are running.
+
+    Args:
+        handle: The workflow handle
+        timeout_seconds: Maximum time to wait
+    """
+    # Simple approach: wait a fixed time for agent to process
+    # In production, you'd poll workflow state more intelligently
+    await asyncio.sleep(timeout_seconds)
+
+
+async def get_workflow_transcript(handle: WorkflowHandle) -> list[dict[str, Any]]:
+    """
+    Extract the conversation transcript from workflow history.
+
+    Queries the workflow to get the internal state containing tool calls.
+
+    Args:
+        handle: The workflow handle
+
+    Returns:
+        List of message dicts containing tool calls and responses
+    """
+    # Query workflow state to get the input_list (conversation history)
+    # This requires the workflow to expose a query handler
+
+    # For now, we'll extract from workflow history events
+    # The tool calls appear in activity completions
+    transcript = []
+
+    async for event in handle.fetch_history_events():
+        # Look for activity completed events
+        if hasattr(event, 'activity_task_completed_event_attributes'):
+            attrs = event.activity_task_completed_event_attributes
+            if attrs and hasattr(attrs, 'result'):
+                # Activity results contain tool execution info
+                transcript.append({
+                    "type": "activity_completed",
+                    "result": str(attrs.result) if attrs.result else None,
+                })
+
+        # Look for activity scheduled events (contains tool name)
+        if hasattr(event, 'activity_task_scheduled_event_attributes'):
+            attrs = event.activity_task_scheduled_event_attributes
+            if attrs and hasattr(attrs, 'activity_type'):
+                activity_name = attrs.activity_type.name if attrs.activity_type else None
+                transcript.append({
+                    "type": "function_call",
+                    "name": activity_name,
+                })
+
+    return transcript
+
+
+async def get_transcript_event_count(handle: WorkflowHandle) -> int:
+    """Get the current number of events in the transcript."""
+    transcript = await get_workflow_transcript(handle)
+    return len(transcript)
+
+
+def get_new_tool_calls(
+    full_transcript: list[dict[str, Any]],
+    previous_count: int
+) -> list[dict[str, Any]]:
+    """
+    Get only the new tool calls since the previous checkpoint.
+
+    Args:
+        full_transcript: The complete transcript from get_workflow_transcript
+        previous_count: The transcript length before the event was sent
+
+    Returns:
+        List of new tool call entries
+    """
+    return full_transcript[previous_count:]
+
+
+def get_workflow_id(handle: WorkflowHandle) -> str:
+    """Get the workflow ID from a handle."""
+    return handle.id
@@ -0,0 +1,108 @@
+"""
+Event fixtures for eval test cases.
+
+Provides factory functions to create events with configurable parameters.
+"""
+from datetime import datetime, timedelta
+from typing import Optional
+
+from project.models.events import (
+    EventType,
+    SubmitalApprovalEvent,
+    ShipmentDepartedFactoryEvent,
+    ShipmentArrivedSiteEvent,
+    InspectionFailedEvent,
+    InspectionPassedEvent,
+)
+
+
+def create_submittal_approved(item: str) -> SubmitalApprovalEvent:
+    """Create a Submittal_Approved event."""
+    return SubmitalApprovalEvent(
+        event_type=EventType.SUBMITTAL_APPROVED,
+        item=item,
+        document_name=f"{item} Submittal.pdf",
+        document_url=f"/submittals/{item.lower().replace(' ', '_')}.pdf",
+    )
+
+
+def create_shipment_departed(
+    item: str,
+    eta: datetime,
+    date_departed: Optional[datetime] = None,
+) -> ShipmentDepartedFactoryEvent:
+    """
+    Create a Shipment_Departed_Factory event.
+
+    Args:
+        item: The item name
+        eta: Estimated time of arrival (this is what gets compared to required_by)
+        date_departed: When shipment left factory (defaults to 7 days before ETA)
+    """
+    if date_departed is None:
+        date_departed = eta - timedelta(days=7)
+
+    return ShipmentDepartedFactoryEvent(
+        event_type=EventType.SHIPMENT_DEPARTED_FACTORY,
+        item=item,
+        eta=eta,
+        date_departed=date_departed,
+        location_address="218 W 18th St, New York, NY 10011",
+    )
+
+
+def create_shipment_arrived(
+    item: str,
+    date_arrived: datetime,
+) -> ShipmentArrivedSiteEvent:
+    """Create a Shipment_Arrived_Site event."""
+    return ShipmentArrivedSiteEvent(
+        event_type=EventType.SHIPMENT_ARRIVED_SITE,
+        item=item,
+        date_arrived=date_arrived,
+        location_address="650 Townsend St, San Francisco, CA 94103",
+    )
+
+
+def create_inspection_failed(
+    item: str,
+    inspection_date: Optional[datetime] = None,
+) -> InspectionFailedEvent:
+    """Create an Inspection_Failed event."""
+    if inspection_date is None:
+        inspection_date = datetime.now()
+
+    return InspectionFailedEvent(
+        event_type=EventType.INSPECTION_FAILED,
+        item=item,
+        inspection_date=inspection_date,
+        document_name=f"{item} Inspection Report.pdf",
+        document_url=f"/inspections/{item.lower().replace(' ', '_')}_failed.pdf",
+    )
+
+
+def create_inspection_passed(
+    item: str,
+    inspection_date: Optional[datetime] = None,
+) -> InspectionPassedEvent:
+    """Create an Inspection_Passed event."""
+    if inspection_date is None:
+        inspection_date = datetime.now()
+
+    return InspectionPassedEvent(
+        event_type=EventType.INSPECTION_PASSED,
+        item=item,
+        inspection_date=inspection_date,
+        document_name=f"{item} Inspection Report.pdf",
+        document_url=f"/inspections/{item.lower().replace(' ', '_')}_passed.pdf",
+    )
+
+
+# Default schedule reference (matches database.py DEFAULT_SCHEDULE)
+SCHEDULE_REFERENCE = {
+    "Steel Beams": {"required_by": "2026-02-15", "buffer_days": 5},
+    "HVAC Units": {"required_by": "2026-03-01", "buffer_days": 7},
+    "Windows": {"required_by": "2026-03-15", "buffer_days": 10},
+    "Flooring Materials": {"required_by": "2026-04-01", "buffer_days": 3},
+    "Electrical Panels": {"required_by": "2026-04-15", "buffer_days": 5},
+}