77import concurrent .futures
88import sqlite3
99import importlib .resources
10+ import logging
1011from pathlib import Path
1112from typing import Optional , Dict , Any , List
1213
# Retry policy for sqlite "database is locked" errors.
DB_LOCK_RETRY_COUNT = 6
DB_LOCK_RETRY_BASE_DELAY = 0.05  # seconds, exponential backoff multiplier

# NOTE(review): basicConfig() configures the *root* logger at import time,
# which is an anti-pattern for library modules — the importing application
# should own logging configuration. Kept as-is so existing log visibility is
# unchanged; consider moving this to the application entry point.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
5358
5459def detect_language (path : str ):
5560 if "LICENSE.md" in path :
@@ -73,6 +78,54 @@ async def async_get_embedding(text: str, model: Optional[str] = None):
7378 return await _run_in_executor (get_embedding_for_text , text , model )
7479
7580
81+ # helper: write error files to disk (instead of storing them in the DB)
82+ def _write_error_file_sync (database_path : str , analysis_id : Optional [int ], rel_path : str , content : str , language : Optional [str ] = None ) -> str :
83+ """
84+ Synchronously write an error file to disk under:
85+ <dir_of_database>/analysis_errors/<analysis_id>/<rel_path>
86+
87+ rel_path is sanitized to avoid path traversal.
88+ Returns the full written path.
89+ """
90+ try :
91+ # base directory: directory that contains the database file; fallback to cwd
92+ base = os .path .dirname (os .path .abspath (database_path )) if database_path else os .getcwd ()
93+ except Exception :
94+ base = os .getcwd ()
95+
96+ base_dir = os .path .join (base , "analysis_errors" , str (analysis_id or "unknown" ))
97+
98+ # Sanitize rel_path: normalize, remove leading slashes, replace .. with __
99+ p = os .path .normpath (rel_path )
100+ # Normalize separators and strip any leading drive/sep
101+ p = p .replace ("\\ " , "/" )
102+ while p .startswith ("../" ) or p .startswith ("/" ):
103+ if p .startswith ("../" ):
104+ p = p [len ("../" ) :]
105+ elif p .startswith ("/" ):
106+ p = p .lstrip ("/" )
107+ p = p .replace (".." , "__" )
108+ safe_rel = p
109+
110+ full_path = os .path .join (base_dir , safe_rel )
111+ # Ensure directory exists
112+ os .makedirs (os .path .dirname (full_path ), exist_ok = True )
113+
114+ # Write file (overwrite if exists)
115+ try :
116+ with open (full_path , "w" , encoding = "utf-8" ) as fh :
117+ fh .write (content or "" )
118+ except Exception as e :
119+ # As a last resort, log to logger
120+ logger .exception ("Failed to write error file to disk: %s (dest=%s)" , e , full_path )
121+
122+ return full_path
123+
124+
async def _write_error_file(database_path: str, analysis_id: Optional[int], rel_path: str, content: str, language: Optional[str] = None):
    """Asynchronously record an error file on disk.

    Delegates to _write_error_file_sync via the shared executor so the
    blocking filesystem work never runs on the event loop.
    """
    args = (database_path, analysis_id, rel_path, content, language)
    return await _run_in_executor(_write_error_file_sync, *args)
127+
128+
76129# Simple chunker (character-based). Tunable CHUNK_SIZE, CHUNK_OVERLAP.
77130def chunk_text (text : str , chunk_size : int = CHUNK_SIZE , overlap : int = CHUNK_OVERLAP ) -> List [str ]:
78131 if chunk_size <= 0 :
@@ -346,46 +399,31 @@ def _insert_task(dbp, fid_local, pth, idx_local, vector_local):
346399 await _run_in_executor (_insert_task , database_path , fid , rel_path , idx , emb )
347400 embedded_any = True
348401 except Exception as e :
349- # record an error row in files table for diagnostics but continue processing other chunks
402+ # record an error to disk (previously was stored in DB)
350403 try :
351- await _run_in_executor (
352- store_file ,
353- database_path ,
354- analysis_id ,
355- f"errors/{ rel_path } .chunk{ idx } .error.txt" ,
356- f"Failed to insert chunk vector: { e } " ,
357- "error" ,
358- )
404+ err_content = f"Failed to insert chunk vector: { e } \n \n Traceback:\n { traceback .format_exc ()} "
405+ await _write_error_file (database_path , analysis_id , f"errors/{ rel_path } .chunk{ idx } .error.txt" , err_content , "error" )
359406 except Exception :
360- pass
407+ logger . exception ( "Failed to write chunk-insert error to disk for %s chunk %d" , rel_path , idx )
361408 else :
362409 try :
363- await _run_in_executor (
364- store_file ,
365- database_path ,
366- analysis_id ,
367- f"errors/{ rel_path } .chunk{ idx } .error.txt" ,
368- "Embedding API returned no vector for chunk." ,
369- "error" ,
370- )
410+ err_content = "Embedding API returned no vector for chunk."
411+ await _write_error_file (database_path , analysis_id , f"errors/{ rel_path } .chunk{ idx } .error.txt" , err_content , "error" )
371412 except Exception :
372- pass
413+ logger . exception ( "Failed to write empty-embedding error to disk for %s chunk %d" , rel_path , idx )
373414
374415 return {"stored" : True , "embedded" : embedded_any }
375416 except Exception as e :
376417 tb = traceback .format_exc ()
377418 try :
378419 error_payload = {"file" : rel_path , "error" : str (e ), "traceback" : tb [:2000 ]}
379- await _run_in_executor (
380- store_file ,
381- database_path ,
382- analysis_id ,
383- f"errors/{ rel_path } .error.txt" ,
384- json .dumps (error_payload , indent = 2 ),
385- "error" ,
386- )
420+ # write the error payload to disk instead of DB
421+ try :
422+ await _write_error_file (database_path , analysis_id , f"errors/{ rel_path } .error.txt" , json .dumps (error_payload , indent = 2 ), "error" )
423+ except Exception :
424+ logger .exception ("Failed to write exception error to disk for file %s" , rel_path )
387425 except Exception :
388- pass
426+ logger . exception ( "Failed while handling exception for file %s" , rel_path )
389427 return {"stored" : False , "embedded" : False }
390428
391429
@@ -439,20 +477,16 @@ async def analyze_local_path(
439477 await _run_in_executor (update_analysis_counts , database_path , aid , file_count , emb_count )
440478 except Exception :
441479 try :
442- await _run_in_executor (
443- store_file ,
444- database_path ,
445- aid ,
446- f"errors/progress_update_{ chunk_start } .error.txt" ,
447- f"Failed to update progress at chunk_start={ chunk_start } " ,
448- "error" ,
449- )
480+ # Previously this error was recorded to DB; now write to disk
481+ err_content = f"Failed to update progress at chunk_start={ chunk_start } "
482+ await _write_error_file (database_path , aid , f"errors/progress_update_{ chunk_start } .error.txt" , err_content , "error" )
450483 except Exception :
451- pass
484+ logger . exception ( "Failed to write progress-update error to disk for analysis %s" , aid )
452485
453486 # detect uv usage and deps (run in executor because it may use subprocess / file IO)
454487 uv_info = await _run_in_executor (lambda p , v : (None if p is None else p ), local_path , venv_path )
455488 try :
489+ # uv_detected.json is meta information — we keep storing meta in DB as before
456490 await _run_in_executor (
457491 store_file ,
458492 database_path ,
@@ -462,7 +496,11 @@ async def analyze_local_path(
462496 "meta" ,
463497 )
464498 except Exception :
465- pass
499+ # if storing meta fails, log to disk
500+ try :
501+ await _write_error_file (database_path , aid , "meta/uv_detected_write_failed.error.txt" , "Failed to store uv_detected.json in DB" , "error" )
502+ except Exception :
503+ logger .exception ("Failed to write uv_detected meta error to disk for analysis %s" , aid )
466504
467505 # final counts & status
468506 await _run_in_executor (update_analysis_counts , database_path , aid , file_count , emb_count )
0 commit comments