Skip to content

Commit 4eb6657

Browse files
Add parallel and stress test scripts for Windows stdio flakiness
Based on deep analysis, the flakiness likely occurs due to:
1. Job Object handle race conditions when tests run in parallel
2. Windows handle inheritance between test processes
3. Timing sensitivity in the minimal test that just opens/closes immediately
4. Python 3.11/3.12 specific subprocess handling changes

These scripts help reproduce the CI environment conditions:
- test-stdio-parallel-flakiness.ps1: Runs with xdist parallel workers
- test-stdio-stress-race.ps1: Rapidly creates processes to expose races
1 parent b0edf56 commit 4eb6657

File tree

2 files changed

+219
-0
lines changed

2 files changed

+219
-0
lines changed
Lines changed: 94 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,94 @@
1+
#!/usr/bin/env pwsh
2+
# Script to test for flakiness when running tests in parallel (like CI does)
3+
# This simulates the xdist environment where the issue occurs
4+
# Usage: .\test-stdio-parallel-flakiness.ps1
5+
#
6+
# Prerequisites: Run . .\setup-environment.ps1 first to ensure tee is available
7+
8+
# Announce what this script is about to do.
Write-Host "Testing stdio with parallel execution to simulate CI environment..." -ForegroundColor Cyan
Write-Host ""

# Prerequisite check: ask Python whether `tee` can be resolved on PATH.
# shutil.which() prints the literal text "None" when nothing is found, and
# the captured value is empty when the python call itself prints nothing.
$teeCheck = python -c "import shutil; print(shutil.which('tee'))"
$teeMissing = (-not $teeCheck) -or ($teeCheck -eq "None")
if ($teeMissing) {
    Write-Host "ERROR: tee command not found!" -ForegroundColor Red
    foreach ($hint in @(
            "Please run: . .\setup-environment.ps1",
            "(Note the dot at the beginning to source the script)"
        )) {
        Write-Host $hint -ForegroundColor Yellow
    }
    exit 1
}

Write-Host "Running tests with different parallel configurations..." -ForegroundColor Yellow
Write-Host ""
22+
23+
# Runs one pytest target repeatedly under pytest-xdist and counts failing runs.
#
# Parameters:
#   Target  - pytest node id or file path handed to pytest.
#   Workers - value passed to pytest's -n option.
#   Runs    - how many times the invocation is repeated.
# Returns: the number of runs whose pytest exit code was non-zero.
#
# Progress is written with Write-Host, so the function's only pipeline
# output is the failure count.
function Invoke-StdioTestLoop {
    param(
        [Parameter(Mandatory = $true)][string]$Target,
        [Parameter(Mandatory = $true)][int]$Workers,
        [Parameter(Mandatory = $true)][int]$Runs
    )

    $failureCount = 0
    for ($run = 1; $run -le $Runs; $run++) {
        Write-Host " Run $run..." -NoNewline
        # Capture stdout and stderr so pytest output is only inspected on failure.
        $output = uv run --frozen pytest $Target -v -n $Workers 2>&1
        if ($LASTEXITCODE -ne 0) {
            $failureCount++
            Write-Host " FAILED" -ForegroundColor Red
            # Show which test failed (first matching pytest FAILED line, if any).
            $failedTest = $output | Select-String "FAILED tests/client/test_stdio.py::" | Select-Object -First 1
            if ($failedTest) {
                Write-Host " Failed: $failedTest" -ForegroundColor Red
            }
        } else {
            Write-Host " PASSED" -ForegroundColor Green
        }
    }

    $resultColor = if ($failureCount -eq 0) { "Green" } else { "Red" }
    Write-Host " Result: $failureCount failures out of $Runs runs" -ForegroundColor $resultColor
    Write-Host ""
    return $failureCount
}

$singleTest = "tests/client/test_stdio.py::test_stdio_context_manager_exiting"
$allStdioTests = "tests/client/test_stdio.py"

# Test 1: Run with 4 workers (default CI behavior)
Write-Host "Test 1: Running with 4 parallel workers (CI default)..." -ForegroundColor Cyan
$failures1 = Invoke-StdioTestLoop -Target $singleTest -Workers 4 -Runs 20

# Test 2: Run with 2 workers
Write-Host "Test 2: Running with 2 parallel workers..." -ForegroundColor Cyan
$failures2 = Invoke-StdioTestLoop -Target $singleTest -Workers 2 -Runs 20

# Test 3: Run all stdio tests in parallel (simulates real CI)
Write-Host "Test 3: Running ALL stdio tests with 4 workers (full CI simulation)..." -ForegroundColor Cyan
$failures3 = Invoke-StdioTestLoop -Target $allStdioTests -Workers 4 -Runs 10
75+
76+
# Summary: print the per-configuration failure counts, then a verdict.
Write-Host "========== SUMMARY ==========" -ForegroundColor Cyan
Write-Host "4 workers (single test): $failures1/20 failures"
Write-Host "2 workers (single test): $failures2/20 failures"
Write-Host "4 workers (all tests): $failures3/10 failures"
Write-Host ""

# Any single failing run in any configuration counts as flakiness.
$anyFailure = ($failures1 -gt 0) -or ($failures2 -gt 0) -or ($failures3 -gt 0)
if (-not $anyFailure) {
    Write-Host "No flakiness detected in this run." -ForegroundColor Green
    Write-Host "The issue might require specific timing conditions to reproduce." -ForegroundColor Yellow
} else {
    Write-Host "FLAKINESS DETECTED with parallel execution!" -ForegroundColor Red
    Write-Host ""
    Write-Host "This confirms the issue is related to parallel test execution." -ForegroundColor Yellow
    Write-Host "The race condition likely involves:" -ForegroundColor Yellow
    $suspects = @(
        " - Windows Job Object handle management",
        " - Process cleanup timing with multiple workers",
        " - Handle inheritance between test processes"
    )
    foreach ($suspect in $suspects) {
        Write-Host $suspect -ForegroundColor Gray
    }
}
Lines changed: 125 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,125 @@
1+
#!/usr/bin/env pwsh
2+
# Script to stress test the specific race condition in stdio cleanup
3+
# This creates many processes rapidly to expose handle/job object races
4+
# Usage: .\test-stdio-stress-race.ps1
5+
#
6+
# Prerequisites: Run . .\setup-environment.ps1 first to ensure tee is available
7+
8+
# Announce what this script is about to do.
Write-Host "Stress testing stdio cleanup race conditions..." -ForegroundColor Cyan
Write-Host "This test creates many processes rapidly to expose timing issues." -ForegroundColor Yellow
Write-Host ""

# Prerequisite check: ask Python whether `tee` can be resolved on PATH.
# shutil.which() prints the literal text "None" when nothing is found, and
# the captured value is empty when the python call itself prints nothing.
$teeCheck = python -c "import shutil; print(shutil.which('tee'))"
$teeMissing = (-not $teeCheck) -or ($teeCheck -eq "None")
if ($teeMissing) {
    Write-Host "ERROR: tee command not found!" -ForegroundColor Red
    foreach ($hint in @(
            "Please run: . .\setup-environment.ps1",
            "(Note the dot at the beginning to source the script)"
        )) {
        Write-Host $hint -ForegroundColor Yellow
    }
    exit 1
}
20+
21+
# Create a Python script that runs the test many times in quick succession
22+
$stressScript = @'
23+
import asyncio
24+
import sys
25+
import time
26+
from pathlib import Path
27+
28+
# Add parent directories to path
29+
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
30+
31+
from mcp.client.stdio import stdio_client, StdioServerParameters
32+
33+
async def rapid_test(test_id: int):
    """Open a stdio client session against ``tee`` and close it immediately.

    Args:
        test_id: Index of this iteration. It is not used by the body.

    Returns:
        ``(True, None)`` when the context manager is entered and exited
        without raising; otherwise ``(False, message)`` where ``message`` is
        ``"<ExceptionType>: <text>"``.
    """
    try:
        async with stdio_client(StdioServerParameters(command="tee")) as (_, _):
            pass
        return True, None
    except Exception as e:
        # str(e) is empty for exceptions raised without arguments (for
        # example a bare TimeoutError). Prefixing the type name keeps the
        # message non-empty, so the failure is still reported.
        return False, f"{type(e).__name__}: {e}"
41+
42+
async def stress_test(iterations: int, concurrent: int):
    """Run ``rapid_test`` ``iterations`` times, ``concurrent`` at a time.

    Iterations are grouped into consecutive batches; each batch is awaited
    with ``asyncio.gather`` before the next one starts.

    Args:
        iterations: Total number of ``rapid_test`` calls to make.
        concurrent: Maximum number of calls gathered in one batch.

    Returns:
        ``(failures, errors, duration)``: the count of unsuccessful calls,
        the distinct non-empty error messages in first-seen order, and the
        elapsed wall-clock time in seconds.
    """
    print(f"Running {iterations} tests with {concurrent} concurrent...")

    failures = 0
    errors = []
    started = time.time()

    # Run in batches
    for batch_start in range(0, iterations, concurrent):
        # The final batch may be shorter than `concurrent`.
        batch_end = min(batch_start + concurrent, iterations)
        outcomes = await asyncio.gather(
            *(rapid_test(test_id) for test_id in range(batch_start, batch_end))
        )

        for ok, message in outcomes:
            if ok:
                continue
            failures += 1
            if message and message not in errors:
                errors.append(message)

        # Progress indicator: only printed when the running total is a
        # multiple of 100.
        if batch_end % 100 == 0:
            print(f" Completed {batch_end}/{iterations} tests...")

    duration = time.time() - started
    return failures, errors, duration
68+
69+
async def main():
    """Run the stress test at each configured concurrency level and report.

    For every ``(iterations, concurrent)`` pair this prints the duration, the
    failure count, up to three distinct error messages, and a detection
    banner when at least one iteration failed.
    """
    # Test different concurrency levels
    configs = (
        (100, 1),  # Sequential
        (100, 2),  # Low concurrency
        (100, 5),  # Medium concurrency
        (100, 10),  # High concurrency
    )

    for iterations, concurrent in configs:
        print(f"\nTest: {iterations} iterations, {concurrent} concurrent")
        failures, errors, duration = await stress_test(iterations, concurrent)

        print(f" Duration: {duration:.2f}s")
        print(f" Failures: {failures}/{iterations}")
        if errors:
            print(f" Unique errors: {len(errors)}")
            shown = errors[:3]  # Show first 3 errors
            for error in shown:
                print(f" - {error}")

        if failures > 0:
            # A failure with concurrency of 1 cannot be blamed on a race
            # between concurrent iterations, so it gets a different banner.
            if concurrent > 1:
                print(" RACE CONDITION DETECTED!")
            else:
                print(" FAILURE DETECTED!")


if __name__ == "__main__":
    asyncio.run(main())
94+
'@
95+
96+
# Write the embedded Python program next to this script, run it through uv,
# and always remove the generated file afterwards.
$scriptPath = Join-Path $PSScriptRoot "stress_test.py"

# Start from a failing exit code: if uv cannot be launched, $LASTEXITCODE is
# never updated and a stale 0 from an earlier command must not read as success.
$exitCode = 1
try {
    # Save the stress test script
    $stressScript | Out-File -FilePath $scriptPath -Encoding UTF8

    Write-Host "Running stress tests..." -ForegroundColor Cyan
    Write-Host ""

    # Run the stress test
    uv run python $scriptPath
    $exitCode = $LASTEXITCODE
} catch {
    # Reached when the command itself cannot be started (e.g. uv is not on PATH).
    Write-Host "ERROR: could not run the stress test: $_" -ForegroundColor Red
} finally {
    # Clean up, even when the run was interrupted or failed to start.
    Remove-Item $scriptPath -ErrorAction SilentlyContinue
}
110+
111+
# Final report. A non-zero exit code from the Python run means the stress
# program did not finish; otherwise print guidance for reading its output.
Write-Host ""
Write-Host "========== ANALYSIS ==========" -ForegroundColor Cyan

$completed = ($exitCode -eq 0)
if ($completed) {
    Write-Host "Stress test completed." -ForegroundColor Green
    Write-Host ""
    Write-Host "If failures increased with concurrency, it indicates:" -ForegroundColor Yellow
    $indications = @(
        " - Race condition in process cleanup",
        " - Job Object handle conflicts",
        " - Windows handle inheritance issues"
    )
    foreach ($indication in $indications) {
        Write-Host $indication -ForegroundColor Gray
    }
    Write-Host ""
    Write-Host "This matches the CI flakiness pattern where parallel tests fail." -ForegroundColor Yellow
} else {
    Write-Host "Stress test failed to complete." -ForegroundColor Red
}

0 commit comments

Comments
 (0)