Skip to content

Commit 97d7ea0

Browse files
authored
Merge pull request #17 from aws-samples/feat/offline-eval
Feat/offline eval
2 parents 56f9f27 + 5f7f119 commit 97d7ea0

File tree

9 files changed

+663
-8
lines changed

9 files changed

+663
-8
lines changed

README.md

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,3 +121,85 @@ uv run streamlit run src/app.py --server.port 8501 --server.address 127.0.0.1
121121

122122
Refer to the **agentcore_runtime_deployment.ipynb** notebook to deploy your agent using [Bedrock AgentCore Runtime](https://docs.aws.amazon.com/bedrock-agentcore/latest/devguide/agents-tools-runtime.html).
123123

124+
## Evaluation
125+
126+
The platform includes comprehensive evaluation capabilities to assess agent performance across multiple dimensions.
127+
128+
### How Evaluation Works
129+
130+
The evaluation system runs test queries against your agent, collects execution traces, and measures performance:
131+
132+
1. **Load test queries** from `groundtruth.json` with expected tool usage
133+
2. **Send queries to agent** endpoint and capture responses with trace IDs
134+
3. **Wait for traces** to be available in Langfuse observability platform
135+
4. **Extract metrics** from traces including tool calls, retrieval scores, and latencies
136+
5. **Evaluate response quality** using Bedrock LLM to score faithfulness, correctness, and helpfulness
137+
6. **Calculate performance metrics** and save comprehensive results to CSV files
138+
139+
### Evaluation Setup
140+
141+
The evaluation system consists of:
142+
- **offline_evaluation.py**: Main evaluation script that runs test queries and calculates metrics
143+
- **response_quality_evaluator.py**: Uses Bedrock LLM to evaluate response quality
144+
- **groundtruth.json**: Test queries with expected tool usage (create this file with your test cases)
145+
146+
### Prerequisites
147+
148+
1. **Environment Variables**: Export Langfuse and AWS credentials:
149+
```bash
150+
export LANGFUSE_SECRET_KEY="your-key"
151+
export LANGFUSE_PUBLIC_KEY="your-key"
152+
export LANGFUSE_HOST="your-langfuse-host"
153+
export AWS_ACCESS_KEY_ID="your-key"
154+
export AWS_SECRET_ACCESS_KEY="your-key"
155+
```
156+
2. **Agent Endpoint**: Have your agent running locally (`http://localhost:8080`) or deployed on Bedrock AgentCore
157+
3. **Test Data**: Create `groundtruth.json` with test queries:
158+
159+
```json
160+
[
161+
{
162+
"query": "How do I reset my router hub?",
163+
"expected_tools": ["retrieve_context"]
164+
}
165+
]
166+
```
167+
168+
### Running Evaluation
169+
170+
```bash
171+
# Run offline evaluation
172+
python offline_evaluation.py
173+
174+
# Or evaluate existing trace data
175+
python response_quality_evaluator.py
176+
```
177+
178+
### Metrics Collected
179+
180+
- **Success Rate**: Percentage of successful agent responses
181+
- **Tool Accuracy**: How well the agent selects expected tools
182+
- **Retrieval Quality**: Relevance scores from knowledge base retrieval
183+
- **Response Quality**: AI-evaluated metrics using Bedrock LLM:
184+
- **Faithfulness** (0.0-1.0): How well the response sticks to the provided context without hallucination
185+
- **Correctness** (0.0-1.0): How factually accurate and technically correct the response is
186+
- **Helpfulness** (0.0-1.0): How useful and relevant the response is in answering the user's query
187+
- **Latency Metrics**: Total and per-tool response times
188+
189+
### Output Files
190+
191+
- **comprehensive_results.csv**: Complete evaluation results with all metrics
192+
- **trace_metrics.csv**: Raw trace data from Langfuse
193+
- **response_quality_scores.csv**: Detailed response quality evaluations
194+
195+
### Configuration
196+
197+
Set the agent endpoint (local or AgentCore):
198+
```bash
199+
# For local agent
200+
export AGENT_ARN="http://localhost:8080"
201+
202+
# For Bedrock AgentCore deployment
203+
export AGENT_ARN="your-agentcore-endpoint"
204+
```
205+

cx-agent-backend/cx_agent_backend/domain/services/agent_service.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Domain service interface for agent operations."""
22

33
from abc import ABC, abstractmethod
4-
from dataclasses import dataclass
4+
from dataclasses import dataclass, field
55
from enum import Enum
66

77
from cx_agent_backend.domain.entities.conversation import Message
@@ -25,6 +25,7 @@ class AgentRequest:
2525
model: str
2626
session_id: str | None = None
2727
trace_id: str | None = None
28+
langfuse_tags: list[str] = field(default_factory=list)
2829

2930

3031
@dataclass(frozen=True)
@@ -34,6 +35,7 @@ class AgentResponse:
3435
agent_type: AgentType
3536
tools_used: list[str]
3637
metadata: dict[str, str]
38+
trace_id: str | None = None
3739

3840

3941
class AgentService(ABC):

cx-agent-backend/cx_agent_backend/domain/services/conversation_service.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ async def start_conversation(self, user_id: str) -> Conversation:
3535
return conversation
3636

3737
async def send_message(
38-
self, conversation_id: UUID, user_id: str, content: str, model: str
38+
self, conversation_id: UUID, user_id: str, content: str, model: str, langfuse_tags: list[str] = None
3939
) -> tuple[Message, list[str]]:
4040
"""Send a message and get AI response."""
4141
# Get or create conversation
@@ -74,6 +74,7 @@ async def send_message(
7474
model=model,
7575
session_id=str(conversation.id),
7676
trace_id=None, # Can be set from FastAPI layer
77+
langfuse_tags=langfuse_tags or [],
7778
)
7879
agent_response = await self._agent_service.process_request(agent_request)
7980

@@ -90,6 +91,8 @@ async def send_message(
9091
ai_metadata["citations"] = json.dumps(agent_response.metadata["citations"]) if isinstance(agent_response.metadata["citations"], list) else agent_response.metadata["citations"]
9192
if "knowledge_base_id" in agent_response.metadata:
9293
ai_metadata["knowledge_base_id"] = agent_response.metadata["knowledge_base_id"]
94+
if agent_response.trace_id:
95+
ai_metadata["trace_id"] = agent_response.trace_id
9396

9497
logger.info("AI metadata: %s", ai_metadata)
9598

cx-agent-backend/cx_agent_backend/infrastructure/adapters/langgraph_agent_service.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ async def process_request(self, request: AgentRequest) -> AgentResponse:
138138
input_result.blocked_categories
139139
)
140140
},
141+
trace_id=predefined_trace_id,
141142
)
142143

143144
# Get memory parameters from environment or request
@@ -176,10 +177,20 @@ async def process_request(self, request: AgentRequest) -> AgentResponse:
176177
name="langchain-request",
177178
trace_context={"trace_id": predefined_trace_id}
178179
) as span:
179-
span.update_trace(
180-
user_id=request.user_id,
181-
input={"messages": [msg.content for msg in request.messages]}
182-
)
180+
trace_update_params = {
181+
"user_id": request.user_id,
182+
"input": {"messages": [msg.content for msg in request.messages]}
183+
}
184+
# Add default tag and any additional tags
185+
tags = ["langgraph-cx-agent"]
186+
if request.langfuse_tags:
187+
logger.info(f"Adding langfuse_tags: {request.langfuse_tags}")
188+
tags.extend(request.langfuse_tags)
189+
else:
190+
logger.info("No langfuse_tags provided")
191+
logger.info(f"Final tags for trace: {tags}")
192+
trace_update_params["tags"] = tags
193+
span.update_trace(**trace_update_params)
183194

184195
config = RunnableConfig(
185196
configurable={
@@ -283,13 +294,13 @@ async def process_request(self, request: AgentRequest) -> AgentResponse:
283294
metadata={
284295
"blocked_categories": ",".join(output_result.blocked_categories)
285296
},
297+
trace_id=trace_id,
286298
)
287299

288300
# Add trace metadata
289301
metadata = {
290302
"model": request.model,
291303
"agent_type": request.agent_type.value,
292-
"trace_id": trace_id,
293304
}
294305

295306
# Add citations to metadata if available
@@ -302,6 +313,7 @@ async def process_request(self, request: AgentRequest) -> AgentResponse:
302313
agent_type=request.agent_type,
303314
tools_used=tools_used,
304315
metadata=metadata,
316+
trace_id=trace_id
305317
)
306318

307319
async def stream_response(self, request: AgentRequest):

cx-agent-backend/cx_agent_backend/presentation/api/conversation_router.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ async def send_message(
132132
user_id=user_id,
133133
content=request.prompt,
134134
model=request.model,
135+
langfuse_tags=request.langfuse_tags,
135136
)
136137

137138
return SendMessageResponse(

cx-agent-backend/cx_agent_backend/presentation/schemas/conversation_schemas.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ class SendMessageRequest(BaseModel):
6969
model: str = Field(default="gpt-4o-mini", min_length=1)
7070
user_id: str | None = Field(None, min_length=1, max_length=100)
7171
feedback: FeedbackRequest | None = None
72+
langfuse_tags: list[str] = Field(default_factory=list, description="Tags to add to Langfuse trace")
7273

7374

7475
class SendMessageResponse(BaseModel):

cx-agent-backend/cx_agent_backend/server.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ async def invocations(request: dict, http_request: Request):
7575
feedback = input_data.get("feedback")
7676
conversation_id_str = input_data.get("conversation_id")
7777
user_id = input_data.get("user_id")
78+
langfuse_tags = input_data.get("langfuse_tags", [])
7879

7980
# Convert conversation_id to UUID
8081
from uuid import UUID
@@ -104,6 +105,7 @@ async def invocations(request: dict, http_request: Request):
104105
user_id=user_id,
105106
content=prompt,
106107
model=settings.default_model,
108+
langfuse_tags=langfuse_tags,
107109
)
108110

109111
# Return agent contract format with metadata
@@ -116,7 +118,16 @@ async def invocations(request: dict, http_request: Request):
116118
# Add metadata if available
117119
if hasattr(message, 'metadata') and message.metadata:
118120
output["metadata"] = message.metadata
119-
121+
# Extract trace_id from metadata if available
122+
if "trace_id" in message.metadata:
123+
output["trace_id"] = message.metadata["trace_id"]
124+
print(f"DEBUG: Added trace_id to output: {message.metadata['trace_id']}")
125+
else:
126+
print(f"DEBUG: No trace_id in metadata. Available keys: {list(message.metadata.keys())}")
127+
else:
128+
print("DEBUG: No metadata available in message")
129+
130+
print(f"DEBUG: Final output keys: {list(output.keys())}")
120131
return {"output": output}
121132

122133
except ValueError as e:

0 commit comments

Comments
 (0)