Commit 29b1a70

fix(stuff): copilot bad boy
1 parent 42682df commit 29b1a70

File tree

5 files changed: +80 −142 lines


analyzer.py

Lines changed: 8 additions & 58 deletions
@@ -11,9 +11,11 @@
 from pathlib import Path
 from typing import Optional, Dict, Any, List

-from db import create_analysis, store_file, store_embedding, update_analysis_status, update_analysis_counts
+from db import create_analysis, store_file, update_analysis_status, update_analysis_counts
 from external_api import get_embedding_for_text, call_coding_api
 from llama_index.core import Document
+import logging
+logging.getLogger("httpx").setLevel(logging.WARNING)

 # language detection by extension
 EXT_LANG = {
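
The two added lines quiet httpx's per-request INFO logging, which otherwise prints a line for every embedding call. A minimal sketch of the same pattern in isolation (the basicConfig call is only for the demo; the commit changes nothing but the httpx logger level):

import logging

# With a root level of INFO, httpx normally emits one "HTTP Request: ..." line per call.
logging.basicConfig(level=logging.INFO)

# Raising only the "httpx" logger to WARNING keeps routine requests quiet
# while warnings and errors still propagate.
logging.getLogger("httpx").setLevel(logging.WARNING)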
@@ -77,55 +79,6 @@ async def async_get_embedding(text: str, model: Optional[str] = None):
     # Wrap the (possibly blocking) get_embedding_for_text in a threadpool so the event loop isn't blocked.
     return await _run_in_executor(get_embedding_for_text, text, model)

-
-# helper: write error files to disk (instead of storing them in the DB)
-def _write_error_file_sync(database_path: str, analysis_id: Optional[int], rel_path: str, content: str, language: Optional[str] = None) -> str:
-    """
-    Synchronously write an error file to disk under:
-        <dir_of_database>/analysis_errors/<analysis_id>/<rel_path>
-
-    rel_path is sanitized to avoid path traversal.
-    Returns the full written path.
-    """
-    try:
-        # base directory: directory that contains the database file; fallback to cwd
-        base = os.path.dirname(os.path.abspath(database_path)) if database_path else os.getcwd()
-    except Exception:
-        base = os.getcwd()
-
-    base_dir = os.path.join(base, "analysis_errors", str(analysis_id or "unknown"))
-
-    # Sanitize rel_path: normalize, remove leading slashes, replace .. with __
-    p = os.path.normpath(rel_path)
-    # Normalize separators and strip any leading drive/sep
-    p = p.replace("\\", "/")
-    while p.startswith("../") or p.startswith("/"):
-        if p.startswith("../"):
-            p = p[len("../") :]
-        elif p.startswith("/"):
-            p = p.lstrip("/")
-    p = p.replace("..", "__")
-    safe_rel = p
-
-    full_path = os.path.join(base_dir, safe_rel)
-    # Ensure directory exists
-    os.makedirs(os.path.dirname(full_path), exist_ok=True)
-
-    # Write file (overwrite if exists)
-    try:
-        with open(full_path, "w", encoding="utf-8") as fh:
-            fh.write(content or "")
-    except Exception as e:
-        # As a last resort, log to logger
-        logger.exception("Failed to write error file to disk: %s (dest=%s)", e, full_path)
-
-    return full_path
-
-
-async def _write_error_file(database_path: str, analysis_id: Optional[int], rel_path: str, content: str, language: Optional[str] = None):
-    return await _run_in_executor(_write_error_file_sync, database_path, analysis_id, rel_path, content, language)
-
-
 # Simple chunker (character-based). Tunable CHUNK_SIZE, CHUNK_OVERLAP.
 def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
     if chunk_size <= 0:
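
The body of chunk_text sits outside this hunk; only its signature and the `chunk_size <= 0` guard are visible here. For context, a character-based chunker with overlap matching that signature would look roughly like the sketch below (an assumption, not the repository's implementation):

from typing import List

def chunk_text_sketch(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    # Mirror the guard visible in the diff: a non-positive chunk size means "no chunking".
    if chunk_size <= 0:
        return [text]
    step = max(chunk_size - overlap, 1)  # consecutive chunks share `overlap` characters
    chunks: List[str] = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
    return chunks

# chunk_text_sketch("a" * 2000, chunk_size=800, overlap=100) -> 3 chunks of 800/800/600 chars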
@@ -383,9 +336,6 @@ async def _process_file(
            semaphore.release()

        if emb:
-            # store audit embedding (non-blocking to vector store)
-            await _run_in_executor(store_embedding, database_path, fid, emb)
-
            # insert chunk vector into sqlite-vector-backed chunks.embedding with retry
            def _insert_task(dbp, fid_local, pth, idx_local, vector_local):
                conn2 = _connect_db(dbp)
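
Both the retained embedding call in async_get_embedding and the audit write removed here are funneled through `_run_in_executor`, which is not part of this diff. A helper of that name usually reduces to the following sketch (an assumed shape, shown only to make the threadpool comment above concrete):

import asyncio
from functools import partial
from typing import Any, Callable

async def _run_in_executor(func: Callable[..., Any], *args: Any) -> Any:
    # Hand a blocking callable to the default thread pool so the event loop stays responsive.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, partial(func, *args))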
@@ -402,13 +352,13 @@ def _insert_task(dbp, fid_local, pth, idx_local, vector_local):
                # record an error to disk (previously was stored in DB)
                try:
                    err_content = f"Failed to insert chunk vector: {e}\n\nTraceback:\n{traceback.format_exc()}"
-                    await _write_error_file(database_path, analysis_id, f"errors/{rel_path}.chunk{idx}.error.txt", err_content, "error")
+                    print(err_content)
                except Exception:
                    logger.exception("Failed to write chunk-insert error to disk for %s chunk %d", rel_path, idx)
        else:
            try:
                err_content = "Embedding API returned no vector for chunk."
-                await _write_error_file(database_path, analysis_id, f"errors/{rel_path}.chunk{idx}.error.txt", err_content, "error")
+                print(err_content)
            except Exception:
                logger.exception("Failed to write empty-embedding error to disk for %s chunk %d", rel_path, idx)
@@ -419,7 +369,7 @@ def _insert_task(dbp, fid_local, pth, idx_local, vector_local):
        error_payload = {"file": rel_path, "error": str(e), "traceback": tb[:2000]}
        # write the error payload to disk instead of DB
        try:
-            await _write_error_file(database_path, analysis_id, f"errors/{rel_path}.error.txt", json.dumps(error_payload, indent=2), "error")
+            print(error_payload)
        except Exception:
            logger.exception("Failed to write exception error to disk for file %s", rel_path)
    except Exception:
@@ -479,7 +429,7 @@ async def analyze_local_path(
            try:
                # Previously this error was recorded to DB; now write to disk
                err_content = f"Failed to update progress at chunk_start={chunk_start}"
-                await _write_error_file(database_path, aid, f"errors/progress_update_{chunk_start}.error.txt", err_content, "error")
+                print(err_content)
            except Exception:
                logger.exception("Failed to write progress-update error to disk for analysis %s", aid)
@@ -498,7 +448,7 @@ async def analyze_local_path(
        except Exception:
            # if storing meta fails, log to disk
            try:
-                await _write_error_file(database_path, aid, "meta/uv_detected_write_failed.error.txt", "Failed to store uv_detected.json in DB", "error")
+                print("Failed to store uv_detected.json in DB")
            except Exception:
                logger.exception("Failed to write uv_detected meta error to disk for analysis %s", aid)
config.py

Lines changed: 11 additions & 3 deletions
@@ -1,5 +1,3 @@
-# config.py
-# Loads configuration from a .env file (and environment) using python-dotenv.
 from dotenv import load_dotenv
 import os

@@ -13,6 +11,12 @@ def _int_env(name, default):
    except Exception:
        return default

+def _bool_env(name, default):
+    v = os.getenv(name)
+    if v is None:
+        return default
+    return v.lower() in ("1", "true", "yes")
+
 # Expose a CFG dictionary for the rest of the app
 CFG = {
     "local_path": os.getenv("LOCAL_PATH"),
@@ -26,7 +30,11 @@ def _int_env(name, default):
     "embedding_model": os.getenv("EMBEDDING_MODEL"),
     "coding_model": os.getenv("CODING_MODEL"),

+    # chunking parameters configurable via env
+    "chunk_size": _int_env("CHUNK_SIZE", 800),
+    "chunk_overlap": _int_env("CHUNK_OVERLAP", 100),
+
     # uvicorn host/port (from .env)
     "uvicorn_host": os.getenv("UVICORN_HOST", "127.0.0.1"),
-    "uvicorn_port": _int_env("UVICORN_PORT", 8000),
+    "uvicorn_port": int(os.getenv("UVICORN_PORT", "8000")),
 }
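
With these config changes, chunking becomes tunable from the environment, and a `_bool_env` helper is added (no caller appears elsewhere in this diff). A quick usage sketch, assuming config.py exactly as shown above and that it has not been imported yet (FEATURE_FLAG is just an illustrative name):

import os

os.environ["CHUNK_SIZE"] = "1200"        # overrides the 800 default
os.environ["CHUNK_OVERLAP"] = "oops"     # non-integer, so _int_env falls back to 100

from config import CFG, _bool_env

print(CFG["chunk_size"])                 # 1200
print(CFG["chunk_overlap"])              # 100
print(_bool_env("FEATURE_FLAG", False))  # False while FEATURE_FLAG is unset
os.environ["FEATURE_FLAG"] = "Yes"
print(_bool_env("FEATURE_FLAG", False))  # True: "1", "true", "yes" are accepted, case-insensitively

Note that the uvicorn_port entry now calls int() directly, so a non-numeric UVICORN_PORT raises ValueError at import time instead of falling back to 8000 the way _int_env would.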

db.py

Lines changed: 29 additions & 65 deletions
@@ -1,15 +1,18 @@
 import os
 import sqlite3
-import json
 from typing import Any, Dict, List, Optional

+from config import CFG  # config (keeps chunk_size etc if needed)
+
 # Simple connection helper: we open new connections per operation so the code is robust
 # across threads. We set WAL journal mode for safer concurrency.
+# Added a small timeout to avoid long blocking if DB is locked.
 def _get_connection(db_path: str) -> sqlite3.Connection:
     dirname = os.path.dirname(os.path.abspath(db_path))
     if dirname and not os.path.isdir(dirname):
         os.makedirs(dirname, exist_ok=True)
-    conn = sqlite3.connect(db_path, check_same_thread=False)
+    # timeout in seconds for busy sqlite; small value to avoid long blocking in web requests
+    conn = sqlite3.connect(db_path, check_same_thread=False, timeout=5.0)
     conn.row_factory = sqlite3.Row
     try:
         conn.execute("PRAGMA journal_mode = WAL;")
@@ -23,23 +26,21 @@ def init_db(database_path: str) -> None:
     """
     Initialize database schema. Safe to call multiple times.
     Creates:
-      - analyses
+      - analyses (embedding_count column kept for backward compat but not used as source of truth)
       - files
-      - embeddings
       - chunks (with embedding BLOB column for sqlite-vector)
     """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()
-        # analyses table
+        # analyses table: embedding_count column kept for compatibility but will be computed live
         cur.execute(
             """
             CREATE TABLE IF NOT EXISTS analyses (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 name TEXT NOT NULL,
                 path TEXT NOT NULL,
                 status TEXT NOT NULL,
-                file_count INTEGER DEFAULT 0,
                 embedding_count INTEGER DEFAULT 0,
                 created_at TEXT DEFAULT (datetime('now'))
             )
@@ -63,22 +64,7 @@ def init_db(database_path: str) -> None:
         )
         cur.execute("CREATE INDEX IF NOT EXISTS idx_files_analysis ON files(analysis_id);")

-        # embeddings: audit/backup store of embeddings as JSON text
-        cur.execute(
-            """
-            CREATE TABLE IF NOT EXISTS embeddings (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                file_id INTEGER NOT NULL,
-                vector TEXT,
-                created_at TEXT DEFAULT (datetime('now')),
-                FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
-            )
-            """
-        )
-        cur.execute("CREATE INDEX IF NOT EXISTS idx_embeddings_file ON embeddings(file_id);")
-
         # chunks table: metadata for chunked documents; includes embedding BLOB column
-        # which sqlite-vector will operate on (via vector_as_f32 / vector_full_scan / vector_init, etc).
         cur.execute(
             """
             CREATE TABLE IF NOT EXISTS chunks (
@@ -122,19 +108,6 @@ def update_analysis_status(database_path: str, analysis_id: int, status: str) -> None:
        conn.close()


-def update_analysis_counts(database_path: str, analysis_id: int, file_count: int, embedding_count: int) -> None:
-    conn = _get_connection(database_path)
-    try:
-        cur = conn.cursor()
-        cur.execute(
-            "UPDATE analyses SET file_count = ?, embedding_count = ? WHERE id = ?",
-            (file_count, embedding_count, analysis_id),
-        )
-        conn.commit()
-    finally:
-        conn.close()
-
-
 def store_file(database_path: str, analysis_id: int, path: str, content: str, language: str) -> int:
     """
     Insert a file row. Returns the new file id.
@@ -152,29 +125,10 @@ def store_file(database_path: str, analysis_id: int, path: str, content: str, language: str) -> int:
        conn.close()


-def store_embedding(database_path: str, file_id: int, vector: Any) -> None:
-    """
-    Store an embedding in the embeddings audit table as JSON text.
-    Keep semantics backward-compatible: external code expects this to exist.
-    """
-    conn = _get_connection(database_path)
-    try:
-        cur = conn.cursor()
-        cur.execute(
-            "INSERT INTO embeddings (file_id, vector) VALUES (?, ?)",
-            (file_id, json.dumps(vector)),
-        )
-        conn.commit()
-    finally:
-        conn.close()
-
-
 def insert_chunk_row_with_null_embedding(database_path: str, file_id: int, path: str, chunk_index: int) -> int:
     """
-    Convenience to insert a chunk metadata row without populating embedding column.
+    Insert a chunk metadata row without populating embedding column.
     Returns the new chunks.id.
-    (Typically you will later update chunks.embedding using the sqlite-vector API or via
-    an INSERT that uses vector_as_f32(...) as done in analyzer._insert_chunk_vector.)
     """
     conn = _get_connection(database_path)
     try:
@@ -190,11 +144,27 @@ def insert_chunk_row_with_null_embedding(database_path: str, file_id: int, path: str, chunk_index: int) -> int:


 def list_analyses(database_path: str) -> List[Dict[str, Any]]:
+    """
+    Return analyses with computed file_count and computed embedding_count (from chunks.embedding).
+    This ensures the UI shows accurate, up-to-date counts based on actual rows.
+    """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()
         rows = cur.execute(
-            "SELECT id, name, path, status, file_count, embedding_count, created_at FROM analyses ORDER BY id DESC"
+            """
+            SELECT
+                a.id,
+                a.name,
+                a.path,
+                a.status,
+                (SELECT COUNT(*) FROM files f WHERE f.analysis_id = a.id) AS file_count,
+                (SELECT COUNT(*) FROM chunks ch JOIN files f2 ON ch.file_id = f2.id
+                 WHERE f2.analysis_id = a.id AND ch.embedding IS NOT NULL) AS embedding_count,
+                a.created_at
+            FROM analyses a
+            ORDER BY a.id DESC
+            """
         ).fetchall()
         results: List[Dict[str, Any]] = []
         for r in rows:
@@ -204,8 +174,8 @@ def list_analyses(database_path: str) -> List[Dict[str, Any]]:
                    "name": r["name"],
                    "path": r["path"],
                    "status": r["status"],
-                    "file_count": r["file_count"],
-                    "embedding_count": r["embedding_count"],
+                    "file_count": int(r["file_count"]),
+                    "embedding_count": int(r["embedding_count"]),
                    "created_at": r["created_at"],
                }
            )
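
After this change the counts surfaced by list_analyses come from correlated subqueries over files and chunks rather than from stored columns, so they always reflect the rows actually present. The shape of one returned element, with purely illustrative values:

{
    "id": 3,
    "name": "my-project",                  # illustrative values
    "path": "/srv/projects/my-project",
    "status": "completed",
    "file_count": 42,                      # COUNT(*) over files for this analysis
    "embedding_count": 611,                # COUNT(*) over chunks whose embedding is non-NULL
    "created_at": "2025-01-01 12:00:00",
}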
@@ -227,18 +197,12 @@ def list_files_for_analysis(database_path: str, analysis_id: int) -> List[Dict[str, Any]]:

 def delete_analysis(database_path: str, analysis_id: int) -> None:
     """
-    Delete an analysis and cascade-delete associated files / embeddings / chunks.
-    SQLite foreign keys require PRAGMA foreign_keys = ON for enforcement; do explicit deletes
-    for safety across SQLite builds.
+    Delete an analysis and cascade-delete associated files / chunks.
+    Foreign key enforcement varies by SQLite build; do explicit deletes for safety.
     """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()
-        # delete embeddings for files in analysis
-        cur.execute(
-            "DELETE FROM embeddings WHERE file_id IN (SELECT id FROM files WHERE analysis_id = ?)",
-            (analysis_id,),
-        )
         # delete chunks for files in analysis
         cur.execute(
             "DELETE FROM chunks WHERE file_id IN (SELECT id FROM files WHERE analysis_id = ?)",

main.py

Lines changed: 13 additions & 0 deletions
@@ -17,6 +17,7 @@

 # Controls how many characters of each snippet and total context we send to coding model
 TOTAL_CONTEXT_LIMIT = 4000
+_ANALYSES_CACHE = []

 @asynccontextmanager
 async def lifespan(app: FastAPI):
@@ -37,10 +38,22 @@ def index(request: Request):

 @app.get("/analyses/status")
 def analyses_status():
+    global _ANALYSES_CACHE
     try:
         analyses = list_analyses(DATABASE)
+        # If the DB returned a non-empty list, update cache and return it.
+        if analyses:
+            _ANALYSES_CACHE = analyses
+            return JSONResponse(analyses)
+        # If DB returned empty but we have a cached non-empty list, return cache
+        if not analyses and _ANALYSES_CACHE:
+            return JSONResponse(_ANALYSES_CACHE)
+        # else return whatever (empty list) — first-run or truly empty
         return JSONResponse(analyses)
     except Exception as e:
+        # On DB errors (e.g., locked) return last known cache to avoid empty responses spam.
+        if _ANALYSES_CACHE:
+            return JSONResponse(_ANALYSES_CACHE)
         return JSONResponse({"error": str(e)}, status_code=500)
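
The /analyses/status handler now keeps the last non-empty result in a module-level cache and serves it whenever the DB comes back empty or raises (for instance while SQLite is briefly locked during an analysis run). A sketch of exercising that fallback with FastAPI's TestClient, assuming the app and DATABASE objects from main.py and at least one stored analysis:

from fastapi.testclient import TestClient

import main

client = TestClient(main.app)

# First call with a healthy DB primes _ANALYSES_CACHE.
baseline = client.get("/analyses/status").json()

# Simulate a transient failure; the handler should fall back to the cached list.
def _locked(_db):
    raise RuntimeError("database is locked")

main.list_analyses = _locked
resp = client.get("/analyses/status")
assert resp.status_code == 200
assert resp.json() == baseline   # holds as long as the first call returned a non-empty list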
