Commit 68fd02a

Copilot and Mte90 authored
Fix embedding pipeline hanging on unresponsive API calls (#8)
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
1 parent f2582f0 commit 68fd02a

File tree: ai/analyzer.py, ai/openai.py

2 files changed: +31 -3 lines


ai/analyzer.py

Lines changed: 12 additions & 2 deletions
@@ -52,6 +52,8 @@
 # Increase batch size for parallel processing
 EMBEDDING_BATCH_SIZE = 16  # Process embeddings in batches for better throughput
 PROGRESS_LOG_INTERVAL = 10  # Log progress every N completed files
+EMBEDDING_TIMEOUT = 30  # Timeout in seconds for each embedding API call
+FILE_PROCESSING_TIMEOUT = 300  # Timeout in seconds for processing a single file (5 minutes)
 _THREADPOOL_WORKERS = max(16, EMBEDDING_CONCURRENCY + 8)
 _EXECUTOR = concurrent.futures.ThreadPoolExecutor(max_workers=_THREADPOOL_WORKERS)

@@ -216,12 +218,16 @@ def _process_file_sync(
             if elapsed_before_result > 3.0:
                 logger.warning(f"Embedding API request taking too long for {rel_path} chunk {idx}: {elapsed_before_result:.2f}s elapsed, still waiting for response...")

-            emb = future.result()  # This will re-raise any exception from the worker
+            emb = future.result(timeout=EMBEDDING_TIMEOUT)  # Add timeout to prevent hanging indefinitely
             embedding_duration = time.time() - embedding_start_time

             # Log slow embedding generation (> 5 seconds)
             if embedding_duration > 5.0:
                 logger.warning(f"Slow embedding API response for {rel_path} chunk {idx}: {embedding_duration:.2f}s total")
+        except concurrent.futures.TimeoutError:
+            logger.error(f"Embedding API timeout ({EMBEDDING_TIMEOUT}s) for {rel_path} chunk {idx}")
+            emb = None
+            failed_count += 1
         except Exception as e:
             logger.exception("Embedding retrieval failed for %s chunk %d: %s", rel_path, idx, e)
             emb = None
@@ -355,7 +361,7 @@ def analyze_local_path_sync(

     for fut in concurrent.futures.as_completed(futures):
         try:
-            r = fut.result()
+            r = fut.result(timeout=FILE_PROCESSING_TIMEOUT)

             # Increment completed counter and check for periodic logging
             with counters[2]:
@@ -374,6 +380,10 @@ def analyze_local_path_sync(
             # Log periodic progress updates (every 10 files)
             if should_log:
                 logger.info(f"Progress: {completed_count}/{total_files} files processed ({file_count} stored, {emb_count} with embeddings, {skipped_count} skipped)")
+        except concurrent.futures.TimeoutError:
+            logger.error(f"File processing timeout ({FILE_PROCESSING_TIMEOUT}s exceeded)")
+            with counters[2]:
+                counters[1] += 1
         except Exception:
             logger.exception("A per-file task failed")
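
Read together, the analyzer changes bound every blocking future.result() call so a single hung embedding request can no longer stall the whole run. Below is a minimal, runnable sketch of the same pattern, not project code: slow_embedding_call is a hypothetical stand-in for the real embedding API wrapper, and the 2-second timeout replaces the commit's 30-second EMBEDDING_TIMEOUT for demonstration.

# Sketch only: demonstrates future.result(timeout=...) raising TimeoutError
# instead of blocking forever on an unresponsive call.
import concurrent.futures
import time

EMBEDDING_TIMEOUT = 2  # seconds; the commit uses 30 for real API calls

def slow_embedding_call(text):
    time.sleep(10)  # simulate an unresponsive embedding API
    return [0.0] * 8

executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
future = executor.submit(slow_embedding_call, "some chunk of text")
try:
    emb = future.result(timeout=EMBEDDING_TIMEOUT)
except concurrent.futures.TimeoutError:
    # Without the timeout argument, result() would block here indefinitely.
    emb = None
    print(f"embedding call exceeded {EMBEDDING_TIMEOUT}s; chunk skipped")
executor.shutdown(wait=False)

Note that the timeout only unblocks the caller; the worker thread servicing the hung request keeps running until the underlying call returns. That is consistent with how the commit handles a timeout: the chunk is recorded as failed (emb = None, failed_count += 1) rather than waited on.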

ai/openai.py

Lines changed: 19 additions & 1 deletion
@@ -63,10 +63,17 @@ def _record_failure():
     _circuit_state["open_until"] = time.time() + _CIRCUIT_BREAKER_TIMEOUT

 def _retry_with_backoff(func, *args, **kwargs):
-    """Retry function with exponential backoff"""
+    """Retry function with exponential backoff on transient errors"""
     max_retries = 3
     base_delay = 1.0

+    # Transient error indicators that should be retried
+    transient_error_keywords = [
+        'timeout', 'timed out', 'connection', 'network',
+        'temporary', 'unavailable', 'rate limit', '429',
+        '500', '502', '503', '504', 'overload'
+    ]
+
     for attempt in range(max_retries):
         try:
             _check_circuit_breaker()
@@ -75,9 +82,20 @@ def _retry_with_backoff(func, *args, **kwargs):
             _record_success()
             return result
         except Exception as e:
+            error_str = str(e).lower()
+            is_transient = any(keyword in error_str for keyword in transient_error_keywords)
+
+            # Always record failure for circuit breaker
             _record_failure()
+
+            # Only retry on transient errors or if it's not the last attempt
             if attempt == max_retries - 1:
                 raise
+
+            # If it's clearly not a transient error, don't retry
+            if not is_transient and attempt > 0:
+                raise
+
             delay = base_delay * (2 ** attempt)
             time.sleep(delay)
83101
