Add parallel and stress test scripts for Windows stdio flakiness

felixweinberger · felixweinberger · commit 4eb6657f96ad · 2025-07-16T16:32:45.000+01:00
Based on deep analysis, the flakiness likely occurs due to:
1. Job Object handle race conditions when tests run in parallel
2. Windows handle inheritance between test processes
3. Timing sensitivity in the minimal test that just opens/closes immediately
4. Python 3.11/3.12 specific subprocess handling changes

These scripts help reproduce the CI environment conditions:
- test-stdio-parallel-flakiness.ps1: Runs with xdist parallel workers
- test-stdio-stress-race.ps1: Rapidly creates processes to expose races
diff --git a/scripts/windows-debug/test-stdio-parallel-flakiness.ps1 b/scripts/windows-debug/test-stdio-parallel-flakiness.ps1
@@ -0,0 +1,94 @@
+#!/usr/bin/env pwsh
+# Script to test for flakiness when running tests in parallel (like CI does)
+# This simulates the xdist environment where the issue occurs
+# Usage: .\test-stdio-parallel-flakiness.ps1
+#
+# Prerequisites: Run . .\setup-environment.ps1 first to ensure tee is available
+
+Write-Host "Testing stdio with parallel execution to simulate CI environment..." -ForegroundColor Cyan
+Write-Host ""
+# Ask Python to resolve 'tee' on PATH; print() emits the text "None" when shutil.which finds nothing
+$teePath = python -c "import shutil; print(shutil.which('tee'))"
+$teeMissing = (-not $teePath) -or ($teePath -eq "None")
+if ($teeMissing) {
+    Write-Host "ERROR: tee command not found!" -ForegroundColor Red
+    Write-Host "Please run: . .\setup-environment.ps1" -ForegroundColor Yellow
+    Write-Host "(Note the dot at the beginning to source the script)" -ForegroundColor Yellow
+    exit 1
+}
+
+Write-Host "Running tests with different parallel configurations..." -ForegroundColor Yellow
+Write-Host ""
+
+# Repeats one pytest invocation and returns the number of failed runs; shared by
+# the three scenarios below, which differ only in these parameters:
+#   Label      - heading printed before the runs start
+#   PytestArgs - arguments passed to 'uv run --frozen pytest'
+#   Runs       - how many times the invocation is repeated
+function Invoke-RepeatedPytest {
+    param(
+        [string]$Label,
+        [string[]]$PytestArgs,
+        [int]$Runs
+    )
+
+    Write-Host $Label -ForegroundColor Cyan
+    $failures = 0
+    for ($i = 1; $i -le $Runs; $i++) {
+        Write-Host "  Run $i..." -NoNewline
+        # Capture stdout and stderr so a failing run can be inspected below
+        $output = uv run --frozen pytest @PytestArgs 2>&1
+        if ($LASTEXITCODE -ne 0) {
+            $failures++
+            Write-Host " FAILED" -ForegroundColor Red
+            # Show which test failed
+            $failedTest = $output | Select-String "FAILED tests/client/test_stdio.py::" | Select-Object -First 1
+            if ($failedTest) {
+                Write-Host "    Failed: $failedTest" -ForegroundColor Red
+            }
+        } else {
+            Write-Host " PASSED" -ForegroundColor Green
+        }
+    }
+    Write-Host "  Result: $failures failures out of $Runs runs" -ForegroundColor $(if ($failures -eq 0) { "Green" } else { "Red" })
+    Write-Host ""
+    # Write-Host does not write to the output stream, so only the count is returned
+    return $failures
+}
+
+$singleTest = "tests/client/test_stdio.py::test_stdio_context_manager_exiting"
+
+# Test 1: Run with 4 workers (default CI behavior)
+$failures1 = Invoke-RepeatedPytest -Runs 20 `
+    -Label "Test 1: Running with 4 parallel workers (CI default)..." `
+    -PytestArgs @($singleTest, "-v", "-n", "4")
+
+# Test 2: Run with 2 workers
+$failures2 = Invoke-RepeatedPytest -Runs 20 `
+    -Label "Test 2: Running with 2 parallel workers..." `
+    -PytestArgs @($singleTest, "-v", "-n", "2")
+
+# Test 3: Run all stdio tests in parallel (simulates real CI)
+$failures3 = Invoke-RepeatedPytest -Runs 10 `
+    -Label "Test 3: Running ALL stdio tests with 4 workers (full CI simulation)..." `
+    -PytestArgs @("tests/client/test_stdio.py", "-v", "-n", "4")
+
+# Summary
+Write-Host "========== SUMMARY ==========" -ForegroundColor Cyan
+Write-Host "4 workers (single test): $failures1/20 failures"
+Write-Host "2 workers (single test): $failures2/20 failures"
+Write-Host "4 workers (all tests):   $failures3/10 failures"
+Write-Host ""
+
+if ($failures1 -gt 0 -or $failures2 -gt 0 -or $failures3 -gt 0) {
+    Write-Host "FLAKINESS DETECTED with parallel execution!" -ForegroundColor Red
+    Write-Host ""
+    Write-Host "This confirms the issue is related to parallel test execution." -ForegroundColor Yellow
+    Write-Host "The race condition likely involves:" -ForegroundColor Yellow
+    Write-Host "  - Windows Job Object handle management" -ForegroundColor Gray
+    Write-Host "  - Process cleanup timing with multiple workers" -ForegroundColor Gray
+    Write-Host "  - Handle inheritance between test processes" -ForegroundColor Gray
+} else {
+    Write-Host "No flakiness detected in this run." -ForegroundColor Green
+    Write-Host "The issue might require specific timing conditions to reproduce." -ForegroundColor Yellow
+}
diff --git a/scripts/windows-debug/test-stdio-stress-race.ps1 b/scripts/windows-debug/test-stdio-stress-race.ps1
@@ -0,0 +1,125 @@
+#!/usr/bin/env pwsh
+# Script to stress test the specific race condition in stdio cleanup
+# This creates many processes rapidly to expose handle/job object races
+# Usage: .\test-stdio-stress-race.ps1
+#
+# Prerequisites: Run . .\setup-environment.ps1 first to ensure tee is available
+
+Write-Host "Stress testing stdio cleanup race conditions..." -ForegroundColor Cyan
+Write-Host "This test creates many processes rapidly to expose timing issues." -ForegroundColor Yellow
+Write-Host ""
+# Ask Python to resolve 'tee' on PATH; print() emits the text "None" when shutil.which finds nothing
+$teePath = python -c "import shutil; print(shutil.which('tee'))"
+$teeMissing = (-not $teePath) -or ($teePath -eq "None")
+if ($teeMissing) {
+    Write-Host "ERROR: tee command not found!" -ForegroundColor Red
+    Write-Host "Please run: . .\setup-environment.ps1" -ForegroundColor Yellow
+    Write-Host "(Note the dot at the beginning to source the script)" -ForegroundColor Yellow
+    exit 1
+}
+
+# Create a Python script that runs the test many times in quick succession
+$stressScript = @'
+import asyncio
+import sys
+import time
+from pathlib import Path
+
+# Add parent directories to path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from mcp.client.stdio import stdio_client, StdioServerParameters
+
+async def rapid_test(test_id: int):
+    """Run a single test iteration"""
+    try:
+        async with stdio_client(StdioServerParameters(command="tee")) as (_, _):
+            pass
+        return True, None
+    except Exception as e:
+        return False, str(e)
+
+async def stress_test(iterations: int, concurrent: int):
+    """Run many tests concurrently"""
+    print(f"Running {iterations} tests with {concurrent} concurrent...")
+    
+    failures = 0
+    errors = []
+    start_time = time.time()
+    
+    # Run in batches
+    for batch in range(0, iterations, concurrent):
+        batch_size = min(concurrent, iterations - batch)
+        tasks = [rapid_test(batch + i) for i in range(batch_size)]
+        results = await asyncio.gather(*tasks)
+        
+        for success, error in results:
+            if not success:
+                failures += 1
+                if error and error not in errors:
+                    errors.append(error)
+        
+        # Progress indicator
+        if (batch + batch_size) % 100 == 0:
+            print(f"  Completed {batch + batch_size}/{iterations} tests...")
+    
+    duration = time.time() - start_time
+    return failures, errors, duration
+
+async def main():
+    # Test different concurrency levels
+    configs = [
+        (100, 1),    # Sequential
+        (100, 2),    # Low concurrency
+        (100, 5),    # Medium concurrency
+        (100, 10),   # High concurrency
+    ]
+    
+    for iterations, concurrent in configs:
+        print(f"\nTest: {iterations} iterations, {concurrent} concurrent")
+        failures, errors, duration = await stress_test(iterations, concurrent)
+        
+        print(f"  Duration: {duration:.2f}s")
+        print(f"  Failures: {failures}/{iterations}")
+        if errors:
+            print(f"  Unique errors: {len(errors)}")
+            for error in errors[:3]:  # Show first 3 errors
+                print(f"    - {error}")
+        
+        if failures > 0:
+            print("  RACE CONDITION DETECTED!" if concurrent > 1 else "  FAILURE DETECTED!")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+'@
+
+# Write the embedded Python source next to this script, run it, and always remove it
+$scriptPath = Join-Path $PSScriptRoot "stress_test.py"
+$exitCode = 1  # assume failure until uv reports an exit code
+try {
+    $stressScript | Out-File -FilePath $scriptPath -Encoding UTF8
+    Write-Host "Running stress tests..." -ForegroundColor Cyan
+    Write-Host ""
+
+    uv run python $scriptPath
+    $exitCode = $LASTEXITCODE
+} finally {
+    # finally also runs on Ctrl+C, so an interrupted run does not leave the file behind
+    Remove-Item $scriptPath -ErrorAction SilentlyContinue
+}
+
+Write-Host ""
+Write-Host "========== ANALYSIS ==========" -ForegroundColor Cyan
+# A zero exit code means the Python driver ran to the end; it does not mean every iteration passed
+if ($exitCode -eq 0) {
+    Write-Host "Stress test completed." -ForegroundColor Green
+    Write-Host ""
+    Write-Host "If failures increased with concurrency, it indicates:" -ForegroundColor Yellow
+    foreach ($cause in "  - Race condition in process cleanup", "  - Job Object handle conflicts", "  - Windows handle inheritance issues") {
+        Write-Host $cause -ForegroundColor Gray
+    }
+    Write-Host ""
+    Write-Host "This matches the CI flakiness pattern where parallel tests fail." -ForegroundColor Yellow
+} else {
+    Write-Host "Stress test failed to complete." -ForegroundColor Red
+}