Skip to content

Commit 97d7ea0

Browse files
authored
Merge pull request #17 from aws-samples/feat/offline-eval
Feat/offline eval
2 parents 56f9f27 + 5f7f119 commit 97d7ea0

File tree

9 files changed

+663
-8
lines changed

9 files changed

+663
-8
lines changed

README.md

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,3 +121,85 @@ uv run streamlit run src/app.py --server.port 8501 --server.address 127.0.0.1
121121

122122
Refer to the **agentcore_runtime_deployment.ipynb** notebook to deploy your agent using [Bedrock AgentCore Runtime](https://docs.aws.amazon.com/bedrock-agentcore/latest/devguide/agents-tools-runtime.html).
123123

124+
## Evaluation
125+
126+
The platform includes comprehensive evaluation capabilities to assess agent performance across multiple dimensions.
127+
128+
### How Evaluation Works
129+
130+
The evaluation system runs test queries against your agent, collects execution traces, and measures performance:
131+
132+
1. **Load test queries** from `groundtruth.json` with expected tool usage
133+
2. **Send queries to agent** endpoint and capture responses with trace IDs
134+
3. **Wait for traces** to be available in Langfuse observability platform
135+
4. **Extract metrics** from traces including tool calls, retrieval scores, and latencies
136+
5. **Evaluate response quality** using Bedrock LLM to score faithfulness, correctness, and helpfulness
137+
6. **Calculate performance metrics** and save comprehensive results to CSV files
138+
139+
### Evaluation Setup
140+
141+
The evaluation system consists of:
142+
- **offline_evaluation.py**: Main evaluation script that runs test queries and calculates metrics
143+
- **response_quality_evaluator.py**: Uses Bedrock LLM to evaluate response quality
144+
- **groundtruth.json**: Test queries with expected tool usage (create this file with your test cases)
145+
146+
### Prerequisites
147+
148+
1. **Environment Variables**: Export Langfuse and AWS credentials:
149+
```bash
150+
export LANGFUSE_SECRET_KEY="your-key"
151+
export LANGFUSE_PUBLIC_KEY="your-key"
152+
export LANGFUSE_HOST="your-langfuse-host"
153+
export AWS_ACCESS_KEY_ID="your-key"
154+
export AWS_SECRET_ACCESS_KEY="your-key"
155+
```
156+
2. **Agent Endpoint**: Have your agent running locally (`http://localhost:8080`) or deployed on Bedrock AgentCore
157+
3. **Test Data**: Create `groundtruth.json` with test queries:
158+
159+
```json
160+
[
161+
{
162+
"query": "How do I reset my router hub?",
163+
"expected_tools": ["retrieve_context"]
164+
}
165+
]
166+
```
167+
168+
### Running Evaluation
169+
170+
```bash
171+
# Run offline evaluation
172+
python offline_evaluation.py
173+
174+
# Or evaluate existing trace data
175+
python response_quality_evaluator.py
176+
```
177+
178+
### Metrics Collected
179+
180+
- **Success Rate**: Percentage of successful agent responses
181+
- **Tool Accuracy**: How well the agent selects expected tools
182+
- **Retrieval Quality**: Relevance scores from knowledge base retrieval
183+
- **Response Quality**: AI-evaluated metrics using Bedrock LLM:
184+
- **Faithfulness** (0.0-1.0): How well the response sticks to the provided context without hallucination
185+
- **Correctness** (0.0-1.0): How factually accurate and technically correct the response is
186+
- **Helpfulness** (0.0-1.0): How useful and relevant the response is in answering the user's query
187+
- **Latency Metrics**: Total and per-tool response times
188+
189+
### Output Files
190+
191+
- **comprehensive_results.csv**: Complete evaluation results with all metrics
192+
- **trace_metrics.csv**: Raw trace data from Langfuse
193+
- **response_quality_scores.csv**: Detailed response quality evaluations
194+
195+
### Configuration
196+
197+
Set the agent endpoint (local or AgentCore):
198+
```bash
199+
# For local agent
200+
export AGENT_ARN="http://localhost:8080"
201+
202+
# For Bedrock AgentCore deployment
203+
export AGENT_ARN="your-agentcore-endpoint"
204+
```
205+

cx-agent-backend/cx_agent_backend/domain/services/agent_service.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Domain service interface for agent operations."""
22

33
from abc import ABC, abstractmethod
4-
from dataclasses import dataclass
4+
from dataclasses import dataclass, field
55
from enum import Enum
66

77
from cx_agent_backend.domain.entities.conversation import Message
@@ -25,6 +25,7 @@ class AgentRequest:
2525
model: str
2626
session_id: str | None = None
2727
trace_id: str | None = None
28+
langfuse_tags: list[str] = field(default_factory=list)
2829

2930

3031
@dataclass(frozen=True)
@@ -34,6 +35,7 @@ class AgentResponse:
3435
agent_type: AgentType
3536
tools_used: list[str]
3637
metadata: dict[str, str]
38+
trace_id: str | None = None
3739

3840

3941
class AgentService(ABC):

cx-agent-backend/cx_agent_backend/domain/services/conversation_service.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ async def start_conversation(self, user_id: str) -> Conversation:
3535
return conversation
3636

3737
async def send_message(
38-
self, conversation_id: UUID, user_id: str, content: str, model: str
38+
self, conversation_id: UUID, user_id: str, content: str, model: str, langfuse_tags: list[str] = None
3939
) -> tuple[Message, list[str]]:
4040
"""Send a message and get AI response."""
4141
# Get or create conversation
@@ -74,6 +74,7 @@ async def send_message(
7474
model=model,
7575
session_id=str(conversation.id),
7676
trace_id=None, # Can be set from FastAPI layer
77+
langfuse_tags=langfuse_tags or [],
7778
)
7879
agent_response = await self._agent_service.process_request(agent_request)
7980

@@ -90,6 +91,8 @@ async def send_message(
9091
ai_metadata["citations"] = json.dumps(agent_response.metadata["citations"]) if isinstance(agent_response.metadata["citations"], list) else agent_response.metadata["citations"]
9192
if "knowledge_base_id" in agent_response.metadata:
9293
ai_metadata["knowledge_base_id"] = agent_response.metadata["knowledge_base_id"]
94+
if agent_response.trace_id:
95+
ai_metadata["trace_id"] = agent_response.trace_id
9396

9497
logger.info("AI metadata: %s", ai_metadata)
9598

cx-agent-backend/cx_agent_backend/infrastructure/adapters/langgraph_agent_service.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ async def process_request(self, request: AgentRequest) -> AgentResponse:
138138
input_result.blocked_categories
139139
)
140140
},
141+
trace_id=predefined_trace_id,
141142
)
142143

143144
# Get memory parameters from environment or request
@@ -176,10 +177,20 @@ async def process_request(self, request: AgentRequest) -> AgentResponse:
176177
name="langchain-request",
177178
trace_context={"trace_id": predefined_trace_id}
178179
) as span:
179-
span.update_trace(
180-
user_id=request.user_id,
181-
input={"messages": [msg.content for msg in request.messages]}
182-
)
180+
trace_update_params = {
181+
"user_id": request.user_id,
182+
"input": {"messages": [msg.content for msg in request.messages]}
183+
}
184+
# Add default tag and any additional tags
185+
tags = ["langgraph-cx-agent"]
186+
if request.langfuse_tags:
187+
logger.info(f"Adding langfuse_tags: {request.langfuse_tags}")
188+
tags.extend(request.langfuse_tags)
189+
else:
190+
logger.info("No langfuse_tags provided")
191+
logger.info(f"Final tags for trace: {tags}")
192+
trace_update_params["tags"] = tags
193+
span.update_trace(**trace_update_params)
183194

184195
config = RunnableConfig(
185196
configurable={
@@ -283,13 +294,13 @@ async def process_request(self, request: AgentRequest) -> AgentResponse:
283294
metadata={
284295
"blocked_categories": ",".join(output_result.blocked_categories)
285296
},
297+
trace_id=trace_id,
286298
)
287299

288300
# Add trace metadata
289301
metadata = {
290302
"model": request.model,
291303
"agent_type": request.agent_type.value,
292-
"trace_id": trace_id,
293304
}
294305

295306
# Add citations to metadata if available
@@ -302,6 +313,7 @@ async def process_request(self, request: AgentRequest) -> AgentResponse:
302313
agent_type=request.agent_type,
303314
tools_used=tools_used,
304315
metadata=metadata,
316+
trace_id=trace_id
305317
)
306318

307319
async def stream_response(self, request: AgentRequest):

cx-agent-backend/cx_agent_backend/presentation/api/conversation_router.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ async def send_message(
132132
user_id=user_id,
133133
content=request.prompt,
134134
model=request.model,
135+
langfuse_tags=request.langfuse_tags,
135136
)
136137

137138
return SendMessageResponse(

cx-agent-backend/cx_agent_backend/presentation/schemas/conversation_schemas.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ class SendMessageRequest(BaseModel):
6969
model: str = Field(default="gpt-4o-mini", min_length=1)
7070
user_id: str | None = Field(None, min_length=1, max_length=100)
7171
feedback: FeedbackRequest | None = None
72+
langfuse_tags: list[str] = Field(default_factory=list, description="Tags to add to Langfuse trace")
7273

7374

7475
class SendMessageResponse(BaseModel):

cx-agent-backend/cx_agent_backend/server.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ async def invocations(request: dict, http_request: Request):
7575
feedback = input_data.get("feedback")
7676
conversation_id_str = input_data.get("conversation_id")
7777
user_id = input_data.get("user_id")
78+
langfuse_tags = input_data.get("langfuse_tags", [])
7879

7980
# Convert conversation_id to UUID
8081
from uuid import UUID
@@ -104,6 +105,7 @@ async def invocations(request: dict, http_request: Request):
104105
user_id=user_id,
105106
content=prompt,
106107
model=settings.default_model,
108+
langfuse_tags=langfuse_tags,
107109
)
108110

109111
# Return agent contract format with metadata
@@ -116,7 +118,16 @@ async def invocations(request: dict, http_request: Request):
116118
# Add metadata if available
117119
if hasattr(message, 'metadata') and message.metadata:
118120
output["metadata"] = message.metadata
119-
121+
# Extract trace_id from metadata if available
122+
if "trace_id" in message.metadata:
123+
output["trace_id"] = message.metadata["trace_id"]
124+
print(f"DEBUG: Added trace_id to output: {message.metadata['trace_id']}")
125+
else:
126+
print(f"DEBUG: No trace_id in metadata. Available keys: {list(message.metadata.keys())}")
127+
else:
128+
print("DEBUG: No metadata available in message")
129+
130+
print(f"DEBUG: Final output keys: {list(output.keys())}")
120131
return {"output": output}
121132

122133
except ValueError as e:

0 commit comments

Comments
 (0)