Commit 29b1a70

fix(stuff): copilot bad boy
1 parent 42682df commit 29b1a70

File tree

5 files changed: +80 −142 lines


analyzer.py

Lines changed: 8 additions & 58 deletions
@@ -11,9 +11,11 @@
 from pathlib import Path
 from typing import Optional, Dict, Any, List

-from db import create_analysis, store_file, store_embedding, update_analysis_status, update_analysis_counts
+from db import create_analysis, store_file, update_analysis_status, update_analysis_counts
 from external_api import get_embedding_for_text, call_coding_api
 from llama_index.core import Document
+import logging
+logging.getLogger("httpx").setLevel(logging.WARNING)

 # language detection by extension
 EXT_LANG = {
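
The two added lines quiet httpx's per-request INFO logging, which otherwise prints a line for every embedding call. A minimal sketch of the same pattern in isolation (the basicConfig call is only for the demo; the commit changes nothing but the httpx logger level):

import logging

# With a root level of INFO, httpx normally emits one "HTTP Request: ..." line per call.
logging.basicConfig(level=logging.INFO)

# Raising only the "httpx" logger to WARNING keeps routine requests quiet
# while warnings and errors still propagate.
logging.getLogger("httpx").setLevel(logging.WARNING)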
@@ -77,55 +79,6 @@ async def async_get_embedding(text: str, model: Optional[str] = None):
     # Wrap the (possibly blocking) get_embedding_for_text in a threadpool so the event loop isn't blocked.
     return await _run_in_executor(get_embedding_for_text, text, model)

-
-# helper: write error files to disk (instead of storing them in the DB)
-def _write_error_file_sync(database_path: str, analysis_id: Optional[int], rel_path: str, content: str, language: Optional[str] = None) -> str:
-    """
-    Synchronously write an error file to disk under:
-        <dir_of_database>/analysis_errors/<analysis_id>/<rel_path>
-
-    rel_path is sanitized to avoid path traversal.
-    Returns the full written path.
-    """
-    try:
-        # base directory: directory that contains the database file; fallback to cwd
-        base = os.path.dirname(os.path.abspath(database_path)) if database_path else os.getcwd()
-    except Exception:
-        base = os.getcwd()
-
-    base_dir = os.path.join(base, "analysis_errors", str(analysis_id or "unknown"))
-
-    # Sanitize rel_path: normalize, remove leading slashes, replace .. with __
-    p = os.path.normpath(rel_path)
-    # Normalize separators and strip any leading drive/sep
-    p = p.replace("\\", "/")
-    while p.startswith("../") or p.startswith("/"):
-        if p.startswith("../"):
-            p = p[len("../") :]
-        elif p.startswith("/"):
-            p = p.lstrip("/")
-    p = p.replace("..", "__")
-    safe_rel = p
-
-    full_path = os.path.join(base_dir, safe_rel)
-    # Ensure directory exists
-    os.makedirs(os.path.dirname(full_path), exist_ok=True)
-
-    # Write file (overwrite if exists)
-    try:
-        with open(full_path, "w", encoding="utf-8") as fh:
-            fh.write(content or "")
-    except Exception as e:
-        # As a last resort, log to logger
-        logger.exception("Failed to write error file to disk: %s (dest=%s)", e, full_path)
-
-    return full_path
-
-
-async def _write_error_file(database_path: str, analysis_id: Optional[int], rel_path: str, content: str, language: Optional[str] = None):
-    return await _run_in_executor(_write_error_file_sync, database_path, analysis_id, rel_path, content, language)
-
-
 # Simple chunker (character-based). Tunable CHUNK_SIZE, CHUNK_OVERLAP.
 def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
     if chunk_size <= 0:
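
The body of chunk_text sits outside this hunk; only its signature and the `chunk_size <= 0` guard are visible here. For context, a character-based chunker with overlap matching that signature would look roughly like the sketch below (an assumption, not the repository's implementation):

from typing import List

def chunk_text_sketch(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    # Mirror the guard visible in the diff: a non-positive chunk size means "no chunking".
    if chunk_size <= 0:
        return [text]
    step = max(chunk_size - overlap, 1)  # consecutive chunks share `overlap` characters
    chunks: List[str] = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
    return chunks

# chunk_text_sketch("a" * 2000, chunk_size=800, overlap=100) -> 3 chunks of 800/800/600 chars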
@@ -383,9 +336,6 @@ async def _process_file(
            semaphore.release()

        if emb:
-            # store audit embedding (non-blocking to vector store)
-            await _run_in_executor(store_embedding, database_path, fid, emb)
-
            # insert chunk vector into sqlite-vector-backed chunks.embedding with retry
            def _insert_task(dbp, fid_local, pth, idx_local, vector_local):
                conn2 = _connect_db(dbp)
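
Both the retained embedding call in async_get_embedding and the audit write removed here are funneled through `_run_in_executor`, which is not part of this diff. A helper of that name usually reduces to the following sketch (an assumed shape, shown only to make the threadpool comment above concrete):

import asyncio
from functools import partial
from typing import Any, Callable

async def _run_in_executor(func: Callable[..., Any], *args: Any) -> Any:
    # Hand a blocking callable to the default thread pool so the event loop stays responsive.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, partial(func, *args))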
@@ -402,13 +352,13 @@ def _insert_task(dbp, fid_local, pth, idx_local, vector_local):
                # record an error to disk (previously was stored in DB)
                try:
                    err_content = f"Failed to insert chunk vector: {e}\n\nTraceback:\n{traceback.format_exc()}"
-                    await _write_error_file(database_path, analysis_id, f"errors/{rel_path}.chunk{idx}.error.txt", err_content, "error")
+                    print(err_content)
                except Exception:
                    logger.exception("Failed to write chunk-insert error to disk for %s chunk %d", rel_path, idx)
        else:
            try:
                err_content = "Embedding API returned no vector for chunk."
-                await _write_error_file(database_path, analysis_id, f"errors/{rel_path}.chunk{idx}.error.txt", err_content, "error")
+                print(err_content)
            except Exception:
                logger.exception("Failed to write empty-embedding error to disk for %s chunk %d", rel_path, idx)
@@ -419,7 +369,7 @@ def _insert_task(dbp, fid_local, pth, idx_local, vector_local):
        error_payload = {"file": rel_path, "error": str(e), "traceback": tb[:2000]}
        # write the error payload to disk instead of DB
        try:
-            await _write_error_file(database_path, analysis_id, f"errors/{rel_path}.error.txt", json.dumps(error_payload, indent=2), "error")
+            print(error_payload)
        except Exception:
            logger.exception("Failed to write exception error to disk for file %s", rel_path)
    except Exception:
@@ -479,7 +429,7 @@ async def analyze_local_path(
            try:
                # Previously this error was recorded to DB; now write to disk
                err_content = f"Failed to update progress at chunk_start={chunk_start}"
-                await _write_error_file(database_path, aid, f"errors/progress_update_{chunk_start}.error.txt", err_content, "error")
+                print(err_content)
            except Exception:
                logger.exception("Failed to write progress-update error to disk for analysis %s", aid)
@@ -498,7 +448,7 @@ async def analyze_local_path(
        except Exception:
            # if storing meta fails, log to disk
            try:
-                await _write_error_file(database_path, aid, "meta/uv_detected_write_failed.error.txt", "Failed to store uv_detected.json in DB", "error")
+                print("Failed to store uv_detected.json in DB")
            except Exception:
                logger.exception("Failed to write uv_detected meta error to disk for analysis %s", aid)
config.py

Lines changed: 11 additions & 3 deletions
@@ -1,5 +1,3 @@
-# config.py
-# Loads configuration from a .env file (and environment) using python-dotenv.
 from dotenv import load_dotenv
 import os

@@ -13,6 +11,12 @@ def _int_env(name, default):
    except Exception:
        return default

+def _bool_env(name, default):
+    v = os.getenv(name)
+    if v is None:
+        return default
+    return v.lower() in ("1", "true", "yes")
+
 # Expose a CFG dictionary for the rest of the app
 CFG = {
     "local_path": os.getenv("LOCAL_PATH"),
@@ -26,7 +30,11 @@ def _int_env(name, default):
     "embedding_model": os.getenv("EMBEDDING_MODEL"),
     "coding_model": os.getenv("CODING_MODEL"),

+    # chunking parameters configurable via env
+    "chunk_size": _int_env("CHUNK_SIZE", 800),
+    "chunk_overlap": _int_env("CHUNK_OVERLAP", 100),
+
     # uvicorn host/port (from .env)
     "uvicorn_host": os.getenv("UVICORN_HOST", "127.0.0.1"),
-    "uvicorn_port": _int_env("UVICORN_PORT", 8000),
+    "uvicorn_port": int(os.getenv("UVICORN_PORT", "8000")),
 }
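
With these config changes, chunking becomes tunable from the environment, and a `_bool_env` helper is added (no caller appears elsewhere in this diff). A quick usage sketch, assuming config.py exactly as shown above and that it has not been imported yet (FEATURE_FLAG is just an illustrative name):

import os

os.environ["CHUNK_SIZE"] = "1200"        # overrides the 800 default
os.environ["CHUNK_OVERLAP"] = "oops"     # non-integer, so _int_env falls back to 100

from config import CFG, _bool_env

print(CFG["chunk_size"])                 # 1200
print(CFG["chunk_overlap"])              # 100
print(_bool_env("FEATURE_FLAG", False))  # False while FEATURE_FLAG is unset
os.environ["FEATURE_FLAG"] = "Yes"
print(_bool_env("FEATURE_FLAG", False))  # True: "1", "true", "yes" are accepted, case-insensitively

Note that the uvicorn_port entry now calls int() directly, so a non-numeric UVICORN_PORT raises ValueError at import time instead of falling back to 8000 the way _int_env would.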

db.py

Lines changed: 29 additions & 65 deletions
@@ -1,15 +1,18 @@
 import os
 import sqlite3
-import json
 from typing import Any, Dict, List, Optional

+from config import CFG  # config (keeps chunk_size etc if needed)
+
 # Simple connection helper: we open new connections per operation so the code is robust
 # across threads. We set WAL journal mode for safer concurrency.
+# Added a small timeout to avoid long blocking if DB is locked.
 def _get_connection(db_path: str) -> sqlite3.Connection:
     dirname = os.path.dirname(os.path.abspath(db_path))
     if dirname and not os.path.isdir(dirname):
         os.makedirs(dirname, exist_ok=True)
-    conn = sqlite3.connect(db_path, check_same_thread=False)
+    # timeout in seconds for busy sqlite; small value to avoid long blocking in web requests
+    conn = sqlite3.connect(db_path, check_same_thread=False, timeout=5.0)
     conn.row_factory = sqlite3.Row
     try:
         conn.execute("PRAGMA journal_mode = WAL;")
@@ -23,23 +26,21 @@ def init_db(database_path: str) -> None:
     """
     Initialize database schema. Safe to call multiple times.
     Creates:
-      - analyses
+      - analyses (embedding_count column kept for backward compat but not used as source of truth)
       - files
-      - embeddings
       - chunks (with embedding BLOB column for sqlite-vector)
     """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()
-        # analyses table
+        # analyses table: embedding_count column kept for compatibility but will be computed live
         cur.execute(
             """
             CREATE TABLE IF NOT EXISTS analyses (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 name TEXT NOT NULL,
                 path TEXT NOT NULL,
                 status TEXT NOT NULL,
-                file_count INTEGER DEFAULT 0,
                 embedding_count INTEGER DEFAULT 0,
                 created_at TEXT DEFAULT (datetime('now'))
             )
@@ -63,22 +64,7 @@ def init_db(database_path: str) -> None:
         )
         cur.execute("CREATE INDEX IF NOT EXISTS idx_files_analysis ON files(analysis_id);")

-        # embeddings: audit/backup store of embeddings as JSON text
-        cur.execute(
-            """
-            CREATE TABLE IF NOT EXISTS embeddings (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                file_id INTEGER NOT NULL,
-                vector TEXT,
-                created_at TEXT DEFAULT (datetime('now')),
-                FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
-            )
-            """
-        )
-        cur.execute("CREATE INDEX IF NOT EXISTS idx_embeddings_file ON embeddings(file_id);")
-
         # chunks table: metadata for chunked documents; includes embedding BLOB column
-        # which sqlite-vector will operate on (via vector_as_f32 / vector_full_scan / vector_init, etc).
         cur.execute(
             """
             CREATE TABLE IF NOT EXISTS chunks (
@@ -122,19 +108,6 @@ def update_analysis_status(database_path: str, analysis_id: int, status: str) -> None:
        conn.close()


-def update_analysis_counts(database_path: str, analysis_id: int, file_count: int, embedding_count: int) -> None:
-    conn = _get_connection(database_path)
-    try:
-        cur = conn.cursor()
-        cur.execute(
-            "UPDATE analyses SET file_count = ?, embedding_count = ? WHERE id = ?",
-            (file_count, embedding_count, analysis_id),
-        )
-        conn.commit()
-    finally:
-        conn.close()
-
-
 def store_file(database_path: str, analysis_id: int, path: str, content: str, language: str) -> int:
     """
     Insert a file row. Returns the new file id.
@@ -152,29 +125,10 @@ def store_file(database_path: str, analysis_id: int, path: str, content: str, language: str) -> int:
        conn.close()


-def store_embedding(database_path: str, file_id: int, vector: Any) -> None:
-    """
-    Store an embedding in the embeddings audit table as JSON text.
-    Keep semantics backward-compatible: external code expects this to exist.
-    """
-    conn = _get_connection(database_path)
-    try:
-        cur = conn.cursor()
-        cur.execute(
-            "INSERT INTO embeddings (file_id, vector) VALUES (?, ?)",
-            (file_id, json.dumps(vector)),
-        )
-        conn.commit()
-    finally:
-        conn.close()
-
-
 def insert_chunk_row_with_null_embedding(database_path: str, file_id: int, path: str, chunk_index: int) -> int:
     """
-    Convenience to insert a chunk metadata row without populating embedding column.
+    Insert a chunk metadata row without populating embedding column.
     Returns the new chunks.id.
-    (Typically you will later update chunks.embedding using the sqlite-vector API or via
-    an INSERT that uses vector_as_f32(...) as done in analyzer._insert_chunk_vector.)
     """
     conn = _get_connection(database_path)
     try:
@@ -190,11 +144,27 @@ def insert_chunk_row_with_null_embedding(database_path: str, file_id: int, path: str, chunk_index: int) -> int:


 def list_analyses(database_path: str) -> List[Dict[str, Any]]:
+    """
+    Return analyses with computed file_count and computed embedding_count (from chunks.embedding).
+    This ensures the UI shows accurate, up-to-date counts based on actual rows.
+    """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()
         rows = cur.execute(
-            "SELECT id, name, path, status, file_count, embedding_count, created_at FROM analyses ORDER BY id DESC"
+            """
+            SELECT
+                a.id,
+                a.name,
+                a.path,
+                a.status,
+                (SELECT COUNT(*) FROM files f WHERE f.analysis_id = a.id) AS file_count,
+                (SELECT COUNT(*) FROM chunks ch JOIN files f2 ON ch.file_id = f2.id
+                 WHERE f2.analysis_id = a.id AND ch.embedding IS NOT NULL) AS embedding_count,
+                a.created_at
+            FROM analyses a
+            ORDER BY a.id DESC
+            """
         ).fetchall()
         results: List[Dict[str, Any]] = []
         for r in rows:
@@ -204,8 +174,8 @@ def list_analyses(database_path: str) -> List[Dict[str, Any]]:
                    "name": r["name"],
                    "path": r["path"],
                    "status": r["status"],
-                    "file_count": r["file_count"],
-                    "embedding_count": r["embedding_count"],
+                    "file_count": int(r["file_count"]),
+                    "embedding_count": int(r["embedding_count"]),
                    "created_at": r["created_at"],
                }
            )
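
After this change the counts surfaced by list_analyses come from correlated subqueries over files and chunks rather than from stored columns, so they always reflect the rows actually present. The shape of one returned element, with purely illustrative values:

{
    "id": 3,
    "name": "my-project",                  # illustrative values
    "path": "/srv/projects/my-project",
    "status": "completed",
    "file_count": 42,                      # COUNT(*) over files for this analysis
    "embedding_count": 611,                # COUNT(*) over chunks whose embedding is non-NULL
    "created_at": "2025-01-01 12:00:00",
}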
@@ -227,18 +197,12 @@ def list_files_for_analysis(database_path: str, analysis_id: int) -> List[Dict[str, Any]]:

 def delete_analysis(database_path: str, analysis_id: int) -> None:
     """
-    Delete an analysis and cascade-delete associated files / embeddings / chunks.
-    SQLite foreign keys require PRAGMA foreign_keys = ON for enforcement; do explicit deletes
-    for safety across SQLite builds.
+    Delete an analysis and cascade-delete associated files / chunks.
+    Foreign key enforcement varies by SQLite build; do explicit deletes for safety.
     """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()
-        # delete embeddings for files in analysis
-        cur.execute(
-            "DELETE FROM embeddings WHERE file_id IN (SELECT id FROM files WHERE analysis_id = ?)",
-            (analysis_id,),
-        )
         # delete chunks for files in analysis
         cur.execute(
             "DELETE FROM chunks WHERE file_id IN (SELECT id FROM files WHERE analysis_id = ?)",

main.py

Lines changed: 13 additions & 0 deletions
@@ -17,6 +17,7 @@

 # Controls how many characters of each snippet and total context we send to coding model
 TOTAL_CONTEXT_LIMIT = 4000
+_ANALYSES_CACHE = []

 @asynccontextmanager
 async def lifespan(app: FastAPI):
@@ -37,10 +38,22 @@ def index(request: Request):

 @app.get("/analyses/status")
 def analyses_status():
+    global _ANALYSES_CACHE
     try:
         analyses = list_analyses(DATABASE)
+        # If the DB returned a non-empty list, update cache and return it.
+        if analyses:
+            _ANALYSES_CACHE = analyses
+            return JSONResponse(analyses)
+        # If DB returned empty but we have a cached non-empty list, return cache
+        if not analyses and _ANALYSES_CACHE:
+            return JSONResponse(_ANALYSES_CACHE)
+        # else return whatever (empty list) — first-run or truly empty
         return JSONResponse(analyses)
     except Exception as e:
+        # On DB errors (e.g., locked) return last known cache to avoid empty responses spam.
+        if _ANALYSES_CACHE:
+            return JSONResponse(_ANALYSES_CACHE)
         return JSONResponse({"error": str(e)}, status_code=500)
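
The /analyses/status handler now keeps the last non-empty result in a module-level cache and serves it whenever the DB comes back empty or raises (for instance while SQLite is briefly locked during an analysis run). A sketch of exercising that fallback with FastAPI's TestClient, assuming the app and DATABASE objects from main.py and at least one stored analysis:

from fastapi.testclient import TestClient

import main

client = TestClient(main.app)

# First call with a healthy DB primes _ANALYSES_CACHE.
baseline = client.get("/analyses/status").json()

# Simulate a transient failure; the handler should fall back to the cached list.
def _locked(_db):
    raise RuntimeError("database is locked")

main.list_analyses = _locked
resp = client.get("/analyses/status")
assert resp.status_code == 200
assert resp.json() == baseline   # holds as long as the first call returned a non-empty list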
