Commit 18c0947

Author: Bob Strahan (committed)
Merge branch 'feature/folder-prefix-match-test' of ssh.gitlab.aws.dev:genaiic-reusable-assets/engagement-artifacts/genaiic-idp-accelerator into feature/folder-prefix-match-test
2 parents 952763e + 5d181ba, commit 18c0947

File tree: 5 files changed (+308, -116 lines)


src/lambda/test_results_resolver/index.py

Lines changed: 13 additions & 6 deletions
@@ -326,14 +326,14 @@ def get_test_run_status(test_run_id):
 
     item = response['Item']
     files = item.get('Files', [])
-    files_count = len(files)
-
-    logger.info(f"Test run {test_run_id}: Found {files_count} files: {files}")
+    files_count = item.get('FilesCount', 0)
+    logger.info(f"Test run {test_run_id}: Found {files_count} files")
 
     # Always check actual document status from tracking table
     completed_files = 0
     processing_failed_files = 0  # Only count processing failures found during scan
     evaluating_files = 0
+    queued_files = 0
 
     for file_key in files:
         logger.info(f"Checking file: {file_key} for test run: {test_run_id}")
@@ -372,16 +372,21 @@ def get_test_run_status(test_run_id):
             elif doc_status == 'FAILED':
                 processing_failed_files += 1
                 logger.info(f"File {file_key}: counted as failed")
+            elif doc_status == 'QUEUED':
+                queued_files += 1
+                logger.info(f"File {file_key}: counted as queued")
             else:
                 logger.info(f"File {file_key}: still processing (status: {doc_status})")
         else:
             logger.warning(f"Document not found: doc#{test_run_id}/{file_key}")
+            # Count missing documents as queued (not yet created)
+            queued_files += 1
 
     # Calculate total failed files
     baseline_failed_files = item.get('BaselineFailedFiles', 0)  # Set by copier, never updated
     total_failed_files = baseline_failed_files + processing_failed_files  # Recalculated each call
 
-    logger.info(f"Test run {test_run_id} counts: completed={completed_files}, processing_failed={processing_failed_files}, baseline_failed={baseline_failed_files}, total_failed={total_failed_files}, evaluating={evaluating_files}, total={files_count}")
+    logger.info(f"Test run {test_run_id} counts: completed={completed_files}, processing_failed={processing_failed_files}, baseline_failed={baseline_failed_files}, total_failed={total_failed_files}, evaluating={evaluating_files}, queued={queued_files}, total={files_count}")
 
     # Determine overall test run status based on document and evaluation states
     if completed_files == files_count and files_count > 0 and total_failed_files == 0:
@@ -390,8 +395,10 @@ def get_test_run_status(test_run_id):
         overall_status = 'PARTIAL_COMPLETE'
     elif evaluating_files > 0:
         overall_status = 'EVALUATING'
-    elif completed_files + total_failed_files + evaluating_files < files_count:
-        overall_status = 'RUNNING'
+    elif queued_files == files_count:
+        overall_status = 'QUEUED'  # All files are still queued
+    elif completed_files + total_failed_files + evaluating_files + queued_files < files_count:
+        overall_status = 'RUNNING'  # Some files are actively processing
     else:
         overall_status = item.get('Status', 'RUNNING')
 
src/ui/src/components/test-studio/TestComparison.jsx

Lines changed: 161 additions & 35 deletions
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 import React, { useState, useEffect } from 'react';
 import PropTypes from 'prop-types';
-import { Container, Header, SpaceBetween, Table, Box, Button, ButtonDropdown } from '@cloudscape-design/components';
+import { Container, Header, SpaceBetween, Table, Box, Button, ButtonDropdown, ProgressBar } from '@cloudscape-design/components';
 import { generateClient } from 'aws-amplify/api';
 import COMPARE_TEST_RUNS from '../../graphql/queries/compareTestRuns';
 import TestStudioHeader from './TestStudioHeader';
@@ -12,6 +12,7 @@ const client = generateClient();
 const TestComparison = ({ preSelectedTestRunIds = [] }) => {
   const [comparisonData, setComparisonData] = useState(null);
   const [comparing, setComparing] = useState(false);
+  const [currentAttempt, setCurrentAttempt] = useState(1);
 
   useEffect(() => {
     const fetchComparison = async () => {
@@ -22,10 +23,60 @@ const TestComparison = ({ preSelectedTestRunIds = [] }) => {
 
       try {
         console.log('Making GraphQL request...');
-        const result = await client.graphql({
-          query: COMPARE_TEST_RUNS,
-          variables: { testRunIds: preSelectedTestRunIds },
-        });
+        let result;
+        let attempt = 1;
+        const maxRetries = 5;
+
+        while (attempt <= maxRetries) {
+          try {
+            setCurrentAttempt(attempt);
+            result = await client.graphql({
+              query: COMPARE_TEST_RUNS,
+              variables: { testRunIds: preSelectedTestRunIds },
+            });
+            setCurrentAttempt(5); // Set to 100% before completing
+            await new Promise((resolve) => setTimeout(resolve, 500)); // Brief pause to show 100%
+            break;
+          } catch (error) {
+            const isTimeout =
+              error.message?.toLowerCase().includes('timeout') ||
+              error.code === 'TIMEOUT' ||
+              error.message?.includes('Request failed with status code 504') ||
+              error.name === 'TimeoutError' ||
+              error.code === 'NetworkError' ||
+              error.errors?.some(err =>
+                err.errorType === 'Lambda:ExecutionTimeoutException' ||
+                err.message?.toLowerCase().includes('timeout')
+              );
+            if (isTimeout && attempt < maxRetries) {
+              console.log(`COMPARE_TEST_RUNS attempt ${attempt} failed, retrying...`, error.message);
+              attempt++;
+
+              // Animate progress during 5-second wait
+              const waitTime = 5000;
+              const intervalTime = 100;
+              const steps = waitTime / intervalTime;
+              const startProgress = (attempt - 1) * 20;
+              const endProgress = attempt * 20;
+              const progressStep = (endProgress - startProgress) / steps;
+
+              let currentProgress = startProgress;
+              const progressInterval = setInterval(() => {
+                currentProgress += progressStep;
+                setCurrentAttempt(Math.min(currentProgress / 20, 5));
+              }, intervalTime);
+
+              await new Promise((resolve) => setTimeout(() => {
+                clearInterval(progressInterval);
+                setCurrentAttempt(attempt);
+                resolve();
+              }, waitTime));
+
+              continue;
+            }
+            throw error;
+          }
+        }
 
       const compareData = result.data.compareTestRuns;
 
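The retry loop added above is tied to the AppSync client, but the pattern itself (classify the failure as a timeout, retry up to a fixed cap, and advance a progress indicator by 20% per attempt while waiting) is independent of the UI. A rough Python sketch of the same flow, where run_query and on_progress are hypothetical stand-ins for the GraphQL call and the progress bar:

import time

MAX_RETRIES = 5
RETRY_WAIT_SECONDS = 5

def looks_like_timeout(error):
    # Rough analogue of the isTimeout checks in the JSX above.
    text = str(error).lower()
    return 'timeout' in text or '504' in text

def compare_with_retry(run_query, on_progress):
    for attempt in range(1, MAX_RETRIES + 1):
        on_progress(attempt * 20)  # each attempt maps to another 20% on the bar
        try:
            return run_query()
        except Exception as error:
            if looks_like_timeout(error) and attempt < MAX_RETRIES:
                time.sleep(RETRY_WAIT_SECONDS)  # the JSX animates progress during this wait
                continue
            raise  # non-timeout errors and the final attempt propagate to the caller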
@@ -90,17 +141,27 @@ const TestComparison = ({ preSelectedTestRunIds = [] }) => {
         ),
       ],
       [
-        'Overall Accuracy',
+        'Average Accuracy',
         ...Object.values(completeTestRuns).map((run) =>
-          run.overallAccuracy !== null && run.overallAccuracy !== undefined ? `${(run.overallAccuracy * 100).toFixed(1)}%` : 'N/A',
+          run.overallAccuracy !== null && run.overallAccuracy !== undefined ? run.overallAccuracy.toFixed(3) : 'N/A',
         ),
       ],
       [
-        'Overall Confidence',
+        'Average Confidence',
         ...Object.values(completeTestRuns).map((run) =>
           run.averageConfidence !== null && run.averageConfidence !== undefined ? `${(run.averageConfidence * 100).toFixed(1)}%` : 'N/A',
         ),
       ],
+      [
+        'Average Weighted Overall Score',
+        ...Object.values(completeTestRuns).map((run) => {
+          if (run.weightedOverallScores && run.weightedOverallScores.length > 0) {
+            const avg = run.weightedOverallScores.reduce((sum, score) => sum + score, 0) / run.weightedOverallScores.length;
+            return avg.toFixed(3);
+          }
+          return 'N/A';
+        }),
+      ],
       [
         'Duration',
         ...Object.values(completeTestRuns).map((run) => {
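The new 'Average Weighted Overall Score' row uses the reduce-then-toFixed(3) idiom that recurs throughout this change: average the per-document weightedOverallScores, format to three decimals, and fall back to 'N/A' when the list is empty. In Python terms (a sketch with an illustrative name, not project code):

def average_weighted_score(weighted_overall_scores):
    # Mean of the per-document scores, three decimals, 'N/A' when there are none.
    if not weighted_overall_scores:
        return 'N/A'
    avg = sum(weighted_overall_scores) / len(weighted_overall_scores)
    return f'{avg:.3f}'

average_weighted_score([0.91, 0.87, 0.95]) returns '0.910', matching what the comparison table and exports would display for that run.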
@@ -186,6 +247,44 @@ const TestComparison = ({ preSelectedTestRunIds = [] }) => {
       usageRows.push(row);
     });
 
+    // Add accuracy breakdown rows
+    const accuracyRows = [];
+    const allAccuracyMetrics = new Set();
+    Object.values(completeTestRuns).forEach((testRun) => {
+      if (testRun.accuracyBreakdown) {
+        Object.keys(testRun.accuracyBreakdown).forEach((metric) => {
+          allAccuracyMetrics.add(metric);
+        });
+      }
+    });
+
+    // Add accuracy breakdown header
+    accuracyRows.push(['Accuracy Metric', ...Object.keys(completeTestRuns)]);
+
+    // Add accuracy breakdown metrics
+    Array.from(allAccuracyMetrics).forEach((metricKey) => {
+      const row = [metricKey.replace(/_/g, ' ').replace(/\b\w/g, (l) => l.toUpperCase())];
+      Object.entries(completeTestRuns).forEach(([testRunId, testRun]) => {
+        const accuracyBreakdown = testRun.accuracyBreakdown || {};
+        const value = accuracyBreakdown[metricKey];
+        const displayValue = value !== null && value !== undefined ? value.toFixed(3) : '0.000';
+        row.push(displayValue);
+      });
+      accuracyRows.push(row);
+    });
+
+    // Add weighted overall score to accuracy breakdown
+    const weightedRow = ['Weighted Overall Score'];
+    Object.entries(completeTestRuns).forEach(([testRunId, testRun]) => {
+      if (testRun.weightedOverallScores && testRun.weightedOverallScores.length > 0) {
+        const avg = testRun.weightedOverallScores.reduce((sum, score) => sum + score, 0) / testRun.weightedOverallScores.length;
+        weightedRow.push(avg.toFixed(3));
+      } else {
+        weightedRow.push('N/A');
+      }
+    });
+    accuracyRows.push(weightedRow);
+
     // Add config comparison rows
     const configRows = [];
     if (comparisonData.configs && comparisonData.configs.length > 0) {
@@ -202,14 +301,17 @@ const TestComparison = ({ preSelectedTestRunIds = [] }) => {
       ['=== PERFORMANCE METRICS ==='],
       ...performanceRows,
       [''],
+      ['=== CONFIGURATION COMPARISON ==='],
+      ...configRows,
+      [''],
+      ['=== AVERAGE ACCURACY BREAKDOWN ==='],
+      ...accuracyRows,
+      [''],
       ['=== COST BREAKDOWN ==='],
       ...costRows,
       [''],
       ['=== USAGE BREAKDOWN ==='],
       ...usageRows,
-      [''],
-      ['=== CONFIGURATION DIFFERENCES ==='],
-      ...configRows,
     ];
 
     const csvContent = csvData.map((row) => row.map((field) => `"${String(field).replace(/"/g, '""')}"`).join(',')).join('\n');
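The csvContent expression above applies the standard CSV escaping rule: wrap every field in double quotes and double any embedded quotes. A tiny Python equivalent of that rule (a sketch, not the project's code):

def to_csv(rows):
    # Quote every field and double embedded quotes, as the csvContent line does.
    def quote(field):
        return '"' + str(field).replace('"', '""') + '"'
    return '\n'.join(','.join(quote(field) for field in row) for row in rows)

to_csv([['=== PERFORMANCE METRICS ==='], ['Average Accuracy', '0.910']]) produces two quoted CSV rows in the same format the export writes.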
@@ -255,8 +357,11 @@ const TestComparison = ({ preSelectedTestRunIds = [] }) => {
             completedFiles: testRun.completedFiles,
             failedFiles: testRun.failedFiles,
             totalCost: testRun.totalCost,
-            overallAccuracy: testRun.overallAccuracy,
+            averageAccuracy: testRun.overallAccuracy,
             averageConfidence: testRun.averageConfidence,
+            averageWeightedOverallScore: testRun.weightedOverallScores && testRun.weightedOverallScores.length > 0
+              ? testRun.weightedOverallScores.reduce((sum, score) => sum + score, 0) / testRun.weightedOverallScores.length
+              : null,
             duration:
               testRun.createdAt && testRun.completedAt
                 ? (() => {
@@ -269,13 +374,20 @@ const TestComparison = ({ preSelectedTestRunIds = [] }) => {
           },
         ]),
       ),
+      configurationDifferences: comparisonData.configs || [],
+      accuracyBreakdown: Object.fromEntries(
+        Object.entries(completeTestRuns).map(([testRunId, testRun]) => {
+          const breakdown = { ...(testRun.accuracyBreakdown || {}) };
+          // Add weighted overall score to accuracy breakdown
+          if (testRun.weightedOverallScores && testRun.weightedOverallScores.length > 0) {
+            breakdown.weightedOverallScore = testRun.weightedOverallScores.reduce((sum, score) => sum + score, 0) / testRun.weightedOverallScores.length;
+          }
+          return [testRunId, breakdown];
+        }),
+      ),
       costBreakdown: Object.fromEntries(
         Object.entries(completeTestRuns).map(([testRunId, testRun]) => [testRunId, testRun.costBreakdown || {}]),
       ),
-      accuracyBreakdown: Object.fromEntries(
-        Object.entries(completeTestRuns).map(([testRunId, testRun]) => [testRunId, testRun.accuracyBreakdown || {}]),
-      ),
-      configurationDifferences: comparisonData.configs || [],
     };
 
     const jsonData = JSON.stringify(filteredData, null, 2);
@@ -299,7 +411,7 @@ const TestComparison = ({ preSelectedTestRunIds = [] }) => {
   };
 
   if (comparing) {
-    return <Box>Loading comparison...</Box>;
+    return <ProgressBar status="in-progress" label="Loading comparison..." value={currentAttempt * 20} />;
   }
 
   if (!comparisonData) {
@@ -415,31 +527,31 @@ const TestComparison = ({ preSelectedTestRunIds = [] }) => {
             ),
           },
           {
-            metric: 'Overall Accuracy',
+            metric: 'Average Accuracy',
             ...Object.fromEntries(
               Object.entries(completeTestRuns).map(([testRunId, testRun]) => [
                 testRunId,
                 testRun.overallAccuracy !== null && testRun.overallAccuracy !== undefined
-                  ? `${(testRun.overallAccuracy * 100).toFixed(1)}%`
+                  ? testRun.overallAccuracy.toFixed(3)
                   : 'N/A',
               ]),
             ),
           },
           {
-            metric: 'Weighted Overall Score',
+            metric: 'Average Weighted Overall Score',
             ...Object.fromEntries(
               Object.entries(completeTestRuns).map(([testRunId, testRun]) => {
                 if (testRun.weightedOverallScores && testRun.weightedOverallScores.length > 0) {
                   const avg =
                     testRun.weightedOverallScores.reduce((sum, score) => sum + score, 0) / testRun.weightedOverallScores.length;
-                  return [testRunId, `${(avg * 100).toFixed(1)}%`];
+                  return [testRunId, avg.toFixed(3)];
                 }
                 return [testRunId, 'N/A'];
               }),
             ),
           },
           {
-            metric: 'Overall Confidence',
+            metric: 'Average Confidence',
             ...Object.fromEntries(
               Object.entries(completeTestRuns).map(([testRunId, testRun]) => [
                 testRunId,
@@ -513,8 +625,8 @@ const TestComparison = ({ preSelectedTestRunIds = [] }) => {
         })()}
       </Container>
 
-      {/* Accuracy Comparison */}
-      <Container header={<Header variant="h3">Accuracy Comparison</Header>}>
+      {/* Average Accuracy Comparison */}
+      <Container header={<Header variant="h3">Average Accuracy Comparison</Header>}>
         {(() => {
           const hasAccuracyData = Object.values(completeTestRuns).some((testRun) => testRun.accuracyBreakdown);
 
@@ -533,17 +645,31 @@ const TestComparison = ({ preSelectedTestRunIds = [] }) => {
 
           return (
             <Table
-              items={Array.from(allAccuracyMetrics).map((metricKey) => ({
-                metric: metricKey.replace(/_/g, ' ').replace(/\b\w/g, (l) => l.toUpperCase()),
-                ...Object.fromEntries(
-                  Object.entries(completeTestRuns).map(([testRunId, testRun]) => {
-                    const accuracyBreakdown = testRun.accuracyBreakdown || {};
-                    const value = accuracyBreakdown[metricKey];
-                    const displayValue = value !== null && value !== undefined ? `${(value * 100).toFixed(1)}%` : '0.0%';
-                    return [testRunId, displayValue];
-                  }),
-                ),
-              }))}
+              items={[
+                ...Array.from(allAccuracyMetrics).map((metricKey) => ({
+                  metric: metricKey.replace(/_/g, ' ').replace(/\b\w/g, (l) => l.toUpperCase()),
+                  ...Object.fromEntries(
+                    Object.entries(completeTestRuns).map(([testRunId, testRun]) => {
+                      const accuracyBreakdown = testRun.accuracyBreakdown || {};
+                      const value = accuracyBreakdown[metricKey];
+                      const displayValue = value !== null && value !== undefined ? value.toFixed(3) : '0.000';
+                      return [testRunId, displayValue];
+                    }),
+                  ),
+                })),
+                {
+                  metric: 'Weighted Overall Score',
+                  ...Object.fromEntries(
+                    Object.entries(completeTestRuns).map(([testRunId, testRun]) => {
+                      if (testRun.weightedOverallScores && testRun.weightedOverallScores.length > 0) {
+                        const avg = testRun.weightedOverallScores.reduce((sum, score) => sum + score, 0) / testRun.weightedOverallScores.length;
+                        return [testRunId, avg.toFixed(3)];
+                      }
+                      return [testRunId, 'N/A'];
+                    }),
+                  ),
+                }
+              ]}
               columnDefinitions={[
                 { id: 'metric', header: 'Accuracy Metric', cell: (item) => item.metric },
                 ...Object.keys(completeTestRuns).map((testRunId) => ({
