Commit 68fd02a

Copilot and Mte90 authored
Fix embedding pipeline hanging on unresponsive API calls (#8)
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
1 parent f2582f0 commit 68fd02a

File tree: ai/analyzer.py, ai/openai.py

2 files changed: +31 -3 lines


ai/analyzer.py

Lines changed: 12 additions & 2 deletions
@@ -52,6 +52,8 @@
 # Increase batch size for parallel processing
 EMBEDDING_BATCH_SIZE = 16  # Process embeddings in batches for better throughput
 PROGRESS_LOG_INTERVAL = 10  # Log progress every N completed files
+EMBEDDING_TIMEOUT = 30  # Timeout in seconds for each embedding API call
+FILE_PROCESSING_TIMEOUT = 300  # Timeout in seconds for processing a single file (5 minutes)
 _THREADPOOL_WORKERS = max(16, EMBEDDING_CONCURRENCY + 8)
 _EXECUTOR = concurrent.futures.ThreadPoolExecutor(max_workers=_THREADPOOL_WORKERS)

@@ -216,12 +218,16 @@ def _process_file_sync(
             if elapsed_before_result > 3.0:
                 logger.warning(f"Embedding API request taking too long for {rel_path} chunk {idx}: {elapsed_before_result:.2f}s elapsed, still waiting for response...")

-            emb = future.result()  # This will re-raise any exception from the worker
+            emb = future.result(timeout=EMBEDDING_TIMEOUT)  # Add timeout to prevent hanging indefinitely
             embedding_duration = time.time() - embedding_start_time

             # Log slow embedding generation (> 5 seconds)
             if embedding_duration > 5.0:
                 logger.warning(f"Slow embedding API response for {rel_path} chunk {idx}: {embedding_duration:.2f}s total")
+        except concurrent.futures.TimeoutError:
+            logger.error(f"Embedding API timeout ({EMBEDDING_TIMEOUT}s) for {rel_path} chunk {idx}")
+            emb = None
+            failed_count += 1
         except Exception as e:
             logger.exception("Embedding retrieval failed for %s chunk %d: %s", rel_path, idx, e)
             emb = None
@@ -355,7 +361,7 @@ def analyze_local_path_sync(

     for fut in concurrent.futures.as_completed(futures):
         try:
-            r = fut.result()
+            r = fut.result(timeout=FILE_PROCESSING_TIMEOUT)

             # Increment completed counter and check for periodic logging
             with counters[2]:
@@ -374,6 +380,10 @@ def analyze_local_path_sync(
             # Log periodic progress updates (every 10 files)
             if should_log:
                 logger.info(f"Progress: {completed_count}/{total_files} files processed ({file_count} stored, {emb_count} with embeddings, {skipped_count} skipped)")
+        except concurrent.futures.TimeoutError:
+            logger.error(f"File processing timeout ({FILE_PROCESSING_TIMEOUT}s exceeded)")
+            with counters[2]:
+                counters[1] += 1
         except Exception:
             logger.exception("A per-file task failed")
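
Read together, the analyzer changes bound every blocking future.result() call so a single hung embedding request can no longer stall the whole run. Below is a minimal, runnable sketch of the same pattern, not project code: slow_embedding_call is a hypothetical stand-in for the real embedding API wrapper, and the 2-second timeout replaces the commit's 30-second EMBEDDING_TIMEOUT for demonstration.

# Sketch only: demonstrates future.result(timeout=...) raising TimeoutError
# instead of blocking forever on an unresponsive call.
import concurrent.futures
import time

EMBEDDING_TIMEOUT = 2  # seconds; the commit uses 30 for real API calls

def slow_embedding_call(text):
    time.sleep(10)  # simulate an unresponsive embedding API
    return [0.0] * 8

executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
future = executor.submit(slow_embedding_call, "some chunk of text")
try:
    emb = future.result(timeout=EMBEDDING_TIMEOUT)
except concurrent.futures.TimeoutError:
    # Without the timeout argument, result() would block here indefinitely.
    emb = None
    print(f"embedding call exceeded {EMBEDDING_TIMEOUT}s; chunk skipped")
executor.shutdown(wait=False)

Note that the timeout only unblocks the caller; the worker thread servicing the hung request keeps running until the underlying call returns. That is consistent with how the commit handles a timeout: the chunk is recorded as failed (emb = None, failed_count += 1) rather than waited on.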

ai/openai.py

Lines changed: 19 additions & 1 deletion
@@ -63,10 +63,17 @@ def _record_failure():
     _circuit_state["open_until"] = time.time() + _CIRCUIT_BREAKER_TIMEOUT

 def _retry_with_backoff(func, *args, **kwargs):
-    """Retry function with exponential backoff"""
+    """Retry function with exponential backoff on transient errors"""
     max_retries = 3
     base_delay = 1.0

+    # Transient error indicators that should be retried
+    transient_error_keywords = [
+        'timeout', 'timed out', 'connection', 'network',
+        'temporary', 'unavailable', 'rate limit', '429',
+        '500', '502', '503', '504', 'overload'
+    ]
+
     for attempt in range(max_retries):
         try:
             _check_circuit_breaker()
@@ -75,9 +82,20 @@ def _retry_with_backoff(func, *args, **kwargs):
             _record_success()
             return result
         except Exception as e:
+            error_str = str(e).lower()
+            is_transient = any(keyword in error_str for keyword in transient_error_keywords)
+
+            # Always record failure for circuit breaker
             _record_failure()
+
+            # Only retry on transient errors or if it's not the last attempt
             if attempt == max_retries - 1:
                 raise
+
+            # If it's clearly not a transient error, don't retry
+            if not is_transient and attempt > 0:
+                raise
+
             delay = base_delay * (2 ** attempt)
             time.sleep(delay)
83101
