From e7b82f886b232e9d52c46dcd40e84204724c8160 Mon Sep 17 00:00:00 2001
From: Daniele Briggi <=>
Date: Fri, 17 Oct 2025 13:38:24 +0000
Subject: [PATCH 1/7] feat(sentences): introduce sentences to improve results
 preview

refact(settings): extension options are generated by a settings method
chore(settings):
- default chunk_size equals the model context window
- increase FTS weight
---
 src/sqlite_rag/cli.py                    |   7 +-
 src/sqlite_rag/database.py               |  29 +++-
 src/sqlite_rag/engine.py                 | 114 ++++++++++++--
 src/sqlite_rag/formatters.py             | 188 ++++++++++++++++++++++-
 src/sqlite_rag/models/chunk.py           |   9 +-
 src/sqlite_rag/models/document.py        |   2 +-
 src/sqlite_rag/models/document_result.py |   7 +-
 src/sqlite_rag/models/sentence.py        |  11 ++
 src/sqlite_rag/models/sentence_result.py |  16 ++
 src/sqlite_rag/repository.py             |  21 ++-
 src/sqlite_rag/sentence_splitter.py      |  38 +++++
 src/sqlite_rag/settings.py               |  46 +++++-
 src/sqlite_rag/sqliterag.py              |  28 +++-
 tests/test_engine.py                     |   4 +
 tests/test_sentence_splitter.py          |  71 +++++++++
 tests/test_settings.py                   |  12 +-
 16 files changed, 560 insertions(+), 43 deletions(-)
 create mode 100644 src/sqlite_rag/models/sentence.py
 create mode 100644 src/sqlite_rag/models/sentence_result.py
 create mode 100644 src/sqlite_rag/sentence_splitter.py
 create mode 100644 tests/test_sentence_splitter.py

diff --git a/src/sqlite_rag/cli.py b/src/sqlite_rag/cli.py
index 6574889..13278cc 100644
--- a/src/sqlite_rag/cli.py
+++ b/src/sqlite_rag/cli.py
@@ -446,6 +446,11 @@ def search(
         "--debug",
         help="Print extra debug information with modern formatting",
     ),
+    debug2: bool = typer.Option(
+        False,
+        "--debug2",
+        help="Print debug format with sentence-level details and snippet context",
+    ),
     peek: bool = typer.Option(
         False, "--peek", help="Print debug information using compact table format"
     ),
@@ -462,7 +467,7 @@ def search(
     results = results[:limit]
 
     # Get the appropriate formatter and display results
-    formatter = get_formatter(debug=debug, table_view=peek)
+    formatter = get_formatter(debug=debug, debug2=debug2, table_view=peek)
     formatter.format_results(results, query)
 
     typer.echo(f"{search_time:.3f} seconds")
diff --git a/src/sqlite_rag/database.py b/src/sqlite_rag/database.py
index 8ad1791..7bdd357 100644
--- a/src/sqlite_rag/database.py
+++ b/src/sqlite_rag/database.py
@@ -88,6 +88,21 @@ def _create_schema(conn: sqlite3.Connection, settings: Settings):
         """
     )
 
+    # TODO: remove sequence
+    cursor.execute(
+        """
+        CREATE TABLE IF NOT EXISTS sentences (
+            id TEXT PRIMARY KEY,
+            chunk_id INTEGER,
+            content TEXT,
+            embedding BLOB,
+            sequence INTEGER,
+            start_offset INTEGER,
+            end_offset INTEGER
+        )
+        """
+    )
+
     cursor.execute(
         """
         CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(content, content='chunks', content_rowid='id');
@@ -95,9 +110,17 @@ def _create_schema(conn: sqlite3.Connection, settings: Settings):
     )
 
     cursor.execute(
-        f"""
-        SELECT vector_init('chunks', 'embedding', 'type={settings.vector_type},dimension={settings.embedding_dim},{settings.other_vector_options}');
-        """
+        """
+        SELECT vector_init('chunks', 'embedding', ?);
+        """,
+        (settings.get_vector_init_options(),),
+    )
+    # TODO: same configuration as chunks (or different options?)
+    cursor.execute(
+        """
+        SELECT vector_init('sentences', 'embedding', ?);
+        """,
+        (settings.get_vector_init_options(),),
    )
 
    conn.commit()
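Editor's note: vector_init is now driven by a composed options string instead of an inline f-string. A minimal sketch of what Settings.get_vector_init_options() (added in settings.py later in this patch) produces with this patch's defaults; the printed value is illustrative:

    # Sketch: how the options string bound to vector_init() above is composed.
    # Mirrors Settings.get_vector_init_options() from this patch; values are
    # the patch defaults.
    vector_type = "INT8"
    embedding_dim = 768
    other_vector_options = "distance=cosine"

    options = {"type": vector_type, "dimension": embedding_dim}
    options_str = ",".join(f"{k}={v}" for k, v in options.items())
    if other_vector_options:
        options_str += f",{other_vector_options}"

    print(options_str)  # type=INT8,dimension=768,distance=cosine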
diff --git a/src/sqlite_rag/engine.py b/src/sqlite_rag/engine.py
index c2372d2..840992c 100644
--- a/src/sqlite_rag/engine.py
+++ b/src/sqlite_rag/engine.py
@@ -1,10 +1,12 @@
 import json
-import re
 import sqlite3
 from pathlib import Path
+from typing import List
 
 from sqlite_rag.logger import Logger
 from sqlite_rag.models.document_result import DocumentResult
+from sqlite_rag.models.sentence_result import SentenceResult
+from sqlite_rag.sentence_splitter import SentenceSplitter
 
 from .chunker import Chunker
 from .models.document import Document
@@ -15,10 +17,17 @@ class Engine:
     # Considered a good default to normalize the score for RRF
     DEFAULT_RRF_K = 60
 
-    def __init__(self, conn: sqlite3.Connection, settings: Settings, chunker: Chunker):
+    def __init__(
+        self,
+        conn: sqlite3.Connection,
+        settings: Settings,
+        chunker: Chunker,
+        sentence_chunker: SentenceSplitter,
+    ):
         self._conn = conn
         self._settings = settings
         self._chunker = chunker
+        self._sentence_chunker = sentence_chunker
         self._logger = Logger()
 
     def load_model(self):
@@ -30,7 +39,7 @@ def load_model(self):
 
         self._conn.execute(
             "SELECT llm_model_load(?, ?);",
-            (self._settings.model_path, self._settings.model_options),
+            (self._settings.model_path, self._settings.other_model_options),
         )
 
     def process(self, document: Document) -> Document:
@@ -46,6 +55,11 @@ def process(self, document: Document) -> Document:
             chunk.title = document.get_title()
             chunk.embedding = self.generate_embedding(chunk.get_embedding_text())
 
+            sentences = self._sentence_chunker.split(chunk)
+            for sentence in sentences:
+                sentence.embedding = self.generate_embedding(sentence.content)
+            chunk.sentences = sentences
+
         document.chunks = chunks
 
         return document
@@ -72,6 +86,7 @@ def quantize(self) -> None:
         cursor = self._conn.cursor()
 
         cursor.execute("SELECT vector_quantize('chunks', 'embedding');")
+        cursor.execute("SELECT vector_quantize('sentences', 'embedding');")
         self._conn.commit()
 
         self._logger.debug("Quantization completed.")
@@ -81,21 +96,25 @@ def quantize_preload(self) -> None:
 
         cursor = self._conn.cursor()
         cursor.execute("SELECT vector_quantize_preload('chunks', 'embedding');")
+        cursor.execute("SELECT vector_quantize_preload('sentences', 'embedding');")
 
     def quantize_cleanup(self) -> None:
         """Clean up internal structures related to a previously quantized table/column."""
         cursor = self._conn.cursor()
 
         cursor.execute("SELECT vector_quantize_cleanup('chunks', 'embedding');")
+        cursor.execute("SELECT vector_quantize_cleanup('sentences', 'embedding');")
         self._conn.commit()
 
     def create_new_context(self) -> None:
-        """"""
+        """Create a new LLM context with optional runtime overrides."""
         cursor = self._conn.cursor()
 
+        context_options = self._settings.get_embeddings_context_options()
         cursor.execute(
-            "SELECT llm_context_create(?);", (self._settings.model_context_options,)
+            "SELECT llm_context_create(?);",
+            (context_options,),
         )
 
     def free_context(self) -> None:
@@ -104,13 +123,11 @@ def free_context(self) -> None:
 
         cursor.execute("SELECT llm_context_free();")
 
-    def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
+    def search(
+        self, semantic_query: str, fts_query, top_k: int = 10
+    ) -> list[DocumentResult]:
         """Semantic search and full-text search sorted with Reciprocal Rank Fusion."""
-        query_embedding = self.generate_embedding(query)
-
-        # Clean up and split into words
-        # '*' is used to match while typing
-        query = " ".join(re.findall(r"\b\w+\b", query.lower())) + "*"
+        query_embedding = self.generate_embedding(semantic_query)
 
         vector_scan_type = (
             "vector_quantize_scan"
@@ -119,8 +136,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
         )
 
         cursor = self._conn.cursor()
-        # TODO: understand how to sort results depending on the distance metric
-        # Eg, for cosine distance, higher is better (closer to 1)
+
         cursor.execute(
             f"""
            -- sqlite-vector KNN vector search results
@@ -163,6 +179,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
             documents.uri,
             documents.content as document_content,
             documents.metadata,
+            chunks.id AS chunk_id,
             chunks.content AS snippet,
             vec_rank,
             fts_rank,
@@ -176,7 +193,7 @@
         ;
         """,  # nosec B608
             {
-                "query": query,
+                "query": fts_query,
                 "query_embedding": query_embedding,
                 "k": top_k,
                 "rrf_k": Engine.DEFAULT_RRF_K,
@@ -186,7 +203,7 @@
         )
 
         rows = cursor.fetchall()
-        return [
+        results = [
             DocumentResult(
                 document=Document(
                     id=row["id"],
@@ -194,6 +211,7 @@
                     content=row["document_content"],
                     metadata=json.loads(row["metadata"]) if row["metadata"] else {},
                 ),
+                chunk_id=row["chunk_id"],
                 snippet=row["snippet"],
                 vec_rank=row["vec_rank"],
                 fts_rank=row["fts_rank"],
@@ -204,6 +222,72 @@
             for row in rows
         ]
 
+        return results
+
+    def search_sentences(
+        self, query: str, chunk_id: int, k: int
+    ) -> List[SentenceResult]:
+        query_embedding = self.generate_embedding(query)
+
+        vector_scan_type = (
+            "vector_quantize_scan_stream"
+            if self._settings.quantize_scan
+            else "vector_full_scan_stream"
+        )
+
+        cursor = self._conn.cursor()
+
+        cursor.execute(
+            f"""
+            WITH vec_matches AS (
+                SELECT
+                    v.rowid AS sentence_id,
+                    row_number() OVER (ORDER BY v.distance) AS rank_number,
+                    v.distance,
+                    sentences.content as sentence_content,
+                    sentences.sequence as sentence_sequence,
+                    sentences.start_offset as sentence_start_offset,
+                    sentences.end_offset as sentence_end_offset
+                FROM {vector_scan_type}('sentences', 'embedding', :query_embedding) AS v
+                JOIN sentences ON sentences.rowid = v.rowid
+                WHERE sentences.chunk_id = :chunk_id
+                LIMIT :k
+            )
+            SELECT
+                sentence_id,
+                sentence_content,
+                sentence_sequence,
+                sentence_start_offset,
+                sentence_end_offset,
+                rank_number,
+                distance
+            FROM vec_matches
+            ORDER BY rank_number ASC
+            """,  # nosec B608
+            {
+                "query_embedding": query_embedding,
+                "k": k,
+                "chunk_id": chunk_id,
+            },
+        )
+
+        rows = cursor.fetchall()
+        sentences = []
+        for row in rows:
+            sentences.append(
+                SentenceResult(
+                    id=row["sentence_id"],
+                    chunk_id=chunk_id,
+                    sequence=row["sentence_sequence"],
+                    rank=row["rank_number"],
+                    distance=row["distance"],
+                    start_offset=row["sentence_start_offset"],
+                    end_offset=row["sentence_end_offset"],
+                )
+            )
+
+        return sentences[:k]
+
     def versions(self) -> dict:
         """Get versions of the loaded extensions."""
         cursor = self._conn.cursor()
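Editor's note: the combined ordering above is Reciprocal Rank Fusion. The full SQL is abbreviated in this hunk, so the following is a sketch under the assumption that the query applies the standard RRF formula with this patch's defaults (DEFAULT_RRF_K = 60, weight_vec = 1.0, weight_fts = 1.5):

    # Sketch of Reciprocal Rank Fusion: each result list contributes
    # weight / (rrf_k + rank); a missing rank contributes nothing.
    def rrf_score(vec_rank, fts_rank, rrf_k=60, weight_vec=1.0, weight_fts=1.5):
        score = 0.0
        if vec_rank is not None:
            score += weight_vec / (rrf_k + vec_rank)
        if fts_rank is not None:
            score += weight_fts / (rrf_k + fts_rank)
        return score

    print(rrf_score(None, 1))  # FTS-only hit at #1: 1.5/61 ~= 0.0246
    print(rrf_score(2, 2))     # hit in both lists at #2: 2.5/62 ~= 0.0403

A result ranked moderately by both scans can therefore outrank a result that tops only one of them.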
" ".join(re.findall(r"\b\w+\b", query.lower())) + "*" + query_embedding = self.generate_embedding(semantic_query) vector_scan_type = ( "vector_quantize_scan" @@ -119,8 +136,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]: ) cursor = self._conn.cursor() - # TODO: understand how to sort results depending on the distance metric - # Eg, for cosine distance, higher is better (closer to 1) + cursor.execute( f""" -- sqlite-vector KNN vector search results @@ -163,6 +179,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]: documents.uri, documents.content as document_content, documents.metadata, + chunks.id AS chunk_id, chunks.content AS snippet, vec_rank, fts_rank, @@ -176,7 +193,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]: ; """, # nosec B608 { - "query": query, + "query": fts_query, "query_embedding": query_embedding, "k": top_k, "rrf_k": Engine.DEFAULT_RRF_K, @@ -186,7 +203,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]: ) rows = cursor.fetchall() - return [ + results = [ DocumentResult( document=Document( id=row["id"], @@ -194,6 +211,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]: content=row["document_content"], metadata=json.loads(row["metadata"]) if row["metadata"] else {}, ), + chunk_id=row["chunk_id"], snippet=row["snippet"], vec_rank=row["vec_rank"], fts_rank=row["fts_rank"], @@ -204,6 +222,72 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]: for row in rows ] + return results + + def search_sentences( + self, query: str, chunk_id: int, k: int + ) -> List[SentenceResult]: + query_embedding = self.generate_embedding(query) + + vector_scan_type = ( + "vector_quantize_scan_stream" + if self._settings.quantize_scan + else "vector_full_scan_stream" + ) + + cursor = self._conn.cursor() + + cursor.execute( + f""" + WITH vec_matches AS ( + SELECT + v.rowid AS sentence_id, + row_number() OVER (ORDER BY v.distance) AS rank_number, + v.distance, + sentences.content as sentence_content, + sentences.sequence as sentence_sequence, + sentences.start_offset as sentence_start_offset, + sentences.end_offset as sentence_end_offset + FROM {vector_scan_type}('sentences', 'embedding', :query_embedding) AS v + JOIN sentences ON sentences.rowid = v.rowid + WHERE sentences.chunk_id = :chunk_id + LIMIT :k + ) + SELECT + sentence_id, + sentence_content, + sentence_sequence, + sentence_start_offset, + sentence_end_offset, + rank_number, + distance + FROM vec_matches + ORDER BY rank_number ASC + """, # nosec B608 + { + "query_embedding": query_embedding, + "k": k, + "chunk_id": chunk_id, + }, + ) + + rows = cursor.fetchall() + sentences = [] + for row in rows: + sentences.append( + SentenceResult( + id=row["sentence_id"], + chunk_id=chunk_id, + sequence=row["sentence_sequence"], + rank=row["rank_number"], + distance=row["distance"], + start_offset=row["sentence_start_offset"], + end_offset=row["sentence_end_offset"], + ) + ) + + return sentences[:k] + def versions(self) -> dict: """Get versions of the loaded extensions.""" cursor = self._conn.cursor() diff --git a/src/sqlite_rag/formatters.py b/src/sqlite_rag/formatters.py index 255f3f2..27bf026 100644 --- a/src/sqlite_rag/formatters.py +++ b/src/sqlite_rag/formatters.py @@ -6,6 +6,8 @@ import typer +from sqlite_rag.models.sentence_result import SentenceResult + from .models.document_result import DocumentResult @@ -163,6 +165,188 @@ def _should_show_debug(self) -> bool: return True +class 
+
+    def _get_debug_line(self, doc: DocumentResult) -> str:
+        """Format debug metrics line."""
+        combined = (
+            f"{doc.combined_rank:.5f}" if doc.combined_rank is not None else "N/A"
+        )
+        vec_info = (
+            f"#{doc.vec_rank} ({doc.vec_distance:.6f})"
+            if doc.vec_rank is not None
+            else "N/A"
+        )
+        fts_info = (
+            f"#{doc.fts_rank} ({doc.fts_score:.6f})"
+            if doc.fts_rank is not None
+            else "N/A"
+        )
+        return f"│ Combined: {combined} │ Vector: {vec_info} │ FTS: {fts_info}"
+
+    def _should_show_debug(self) -> bool:
+        return True
+
+    def _format_single_result(self, doc: DocumentResult, idx: int) -> None:
+        """Format a single result with box layout including sentence details."""
+        icon = self._get_file_icon(doc.document.uri or "")
+
+        # Draw the result box header
+        header = f"┌─ Result #{idx} " + "─" * (67 - len(str(idx)))
+        typer.echo(header)
+
+        # Display URI if available
+        if doc.document.uri:
+            uri_display = self._format_uri_display(doc.document.uri, icon, 75)
+            typer.echo(f"│ {uri_display:<75}│")
+
+            # Add debug info
+            debug_line = self._get_debug_line(doc)
+            if debug_line:
+                typer.echo(debug_line)
+
+            typer.echo("├" + "─" * 77 + "┤")
+        elif self._should_show_debug():
+            debug_line = self._get_debug_line(doc)
+            if debug_line:
+                typer.echo(debug_line)
+                typer.echo("├" + "─" * 77 + "┤")
+
+        # Display snippet preview from top sentences
+        if doc.sentences:
+            snippet_preview = self._build_sentence_preview(doc.snippet, doc.sentences)
+            preview_lines = self._clean_and_wrap_snippet(
+                snippet_preview, width=75, max_length=400
+            )
+
+            typer.echo("│ Preview (top 3 sentences):                                                  │")
+            for line in preview_lines:
+                typer.echo(f"│ {line:<75} │")
+
+            typer.echo("├" + "─" * 77 + "┤")
+            typer.echo("│ Sentences:                                                                  │")
+
+            # Display sentences with their distances
+            for i, sentence in enumerate(doc.sentences, 1):
+                distance_str = (
+                    f"{sentence.distance:.6f}"
+                    if sentence.distance is not None
+                    else "N/A"
+                )
+                rank_str = f"#{sentence.rank}" if sentence.rank is not None else "N/A"
+
+                # Format sentence header
+                sentence_header = (
+                    f"│ {i}. [Rank: {rank_str}, Distance: {distance_str}]"
+                )
+                typer.echo(sentence_header.ljust(78) + " │")
+
+                # Extract sentence text using offsets from the chunk snippet
+                if (
+                    sentence.start_offset is not None
+                    and sentence.end_offset is not None
+                ):
+                    sentence_text = doc.snippet[
+                        sentence.start_offset : sentence.end_offset
+                    ]
+                else:
+                    sentence_text = "[No offset information available]"
+
+                # Wrap and display sentence content
+                sentence_lines = self._clean_and_wrap_snippet(
+                    sentence_text, width=72, max_length=400
+                )
+                for line in sentence_lines:
+                    typer.echo(f"│    {line:<72} │")
+        else:
+            # Fallback to regular snippet display if no sentences
+            snippet_lines = self._clean_and_wrap_snippet(
+                doc.snippet, width=75, max_length=400
+            )
+            for line in snippet_lines:
+                typer.echo(f"│ {line:<75} │")
+
+        typer.echo("└" + "─" * 77 + "┘")
+        typer.echo()
+
+    def _build_sentence_preview(
+        self,
+        chunk_content: str,
+        sentences: List[SentenceResult],
+        max_chars: int = 400,
+    ) -> str:
+        """Build preview from top 3 ranked sentences with [...] for gaps.
+
+        Args:
+            chunk_content: The full chunk text
+            sentences: List of SentenceResult objects (should already be sorted by rank)
+            max_chars: Maximum total characters for preview
+
+        Returns:
+            Preview string with top sentences and [...] separators
+        """
+
+        # Take top 3 sentences (they should already be sorted by rank/distance)
+        top_sentences = sentences[:3]
+
+        if not top_sentences:
+            return chunk_content[:max_chars]
+
+        # Sort sentences by their position in the chunk (using start_offset)
+        # so we can build a preview in the order they appear
+        sentences_with_offsets = [
+            s
+            for s in top_sentences
+            if s.start_offset is not None and s.end_offset is not None
+        ]
+
+        if not sentences_with_offsets:
+            # Fallback: no offset information, return truncated chunk content
+            return chunk_content[:max_chars]
+
+        # Sort by start_offset to maintain document order
+        sentences_with_offsets.sort(key=lambda s: s.start_offset)
+
+        preview_parts = []
+        total_chars = 0
+        prev_end_offset = None
+
+        for sentence in sentences_with_offsets:
+            # Extract sentence text using offsets
+            sentence_text = chunk_content[
+                sentence.start_offset : sentence.end_offset
+            ].strip()
+
+            # Calculate remaining budget including potential separator
+            separator_len = len(" [...] ") if preview_parts else 0
+            remaining = max_chars - total_chars - separator_len
+
+            if remaining <= 0:
+                break
+
+            # Truncate sentence if needed
+            if len(sentence_text) > remaining:
+                sentence_text = sentence_text[: remaining - 3] + "..."
+
+            # Check if there's a gap > 10 chars from previous sentence
+            if prev_end_offset is not None:
+                gap_size = sentence.start_offset - prev_end_offset
+                if gap_size > 10:
+                    preview_parts.append("[...]")
+                    total_chars += len(" [...] ")
+
+            preview_parts.append(sentence_text)
+            total_chars += len(sentence_text)
+            prev_end_offset = sentence.end_offset
+
+        return " ".join(preview_parts)
+
+
 class TableDebugFormatter(SearchResultFormatter):
     """Table view debug formatter."""
 
@@ -225,11 +409,13 @@ def _print_table_row(self, idx: int, doc: DocumentResult) -> None:
 
 
 def get_formatter(
-    debug: bool = False, table_view: bool = False
+    debug: bool = False, debug2: bool = False, table_view: bool = False
 ) -> SearchResultFormatter:
     """Factory function to get the appropriate formatter."""
     if table_view:
         return TableDebugFormatter()
+    elif debug2:
+        return BoxedDebug2Formatter()
     elif debug:
         return BoxedDebugFormatter()
     else:
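Editor's note: the gap handling in _build_sentence_preview is easiest to see in isolation. A self-contained sketch of the same ordering-and-gap logic (SentencePos and the sample offsets are hypothetical stand-ins; the max_chars truncation is omitted):

    from dataclasses import dataclass

    @dataclass
    class SentencePos:  # stand-in for SentenceResult's offset fields
        start_offset: int
        end_offset: int

    def preview(chunk: str, sentences: list[SentencePos], gap: int = 10) -> str:
        parts, prev_end = [], None
        # Re-order the top-ranked sentences by position, marking large gaps.
        for s in sorted(sentences, key=lambda s: s.start_offset):
            if prev_end is not None and s.start_offset - prev_end > gap:
                parts.append("[...]")
            parts.append(chunk[s.start_offset : s.end_offset].strip())
            prev_end = s.end_offset
        return " ".join(parts)

    text = "First point. Filler filler filler filler. Last point."
    print(preview(text, [SentencePos(0, 12), SentencePos(42, 53)]))
    # First point. [...] Last point.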
diff --git a/src/sqlite_rag/models/chunk.py b/src/sqlite_rag/models/chunk.py
index 15bb26b..89b987e 100644
--- a/src/sqlite_rag/models/chunk.py
+++ b/src/sqlite_rag/models/chunk.py
@@ -1,4 +1,6 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+
+from sqlite_rag.models.sentence import Sentence
 
 
 @dataclass
@@ -6,7 +8,8 @@ class Chunk:
     id: int | None = None
     document_id: int | None = None
     # The human readable content of the chunk
-    # (not the representation of the embedding vector)
+    # (it does not represent the embedding vector, which
+    # may be altered with prompt or overlap text)
     content: str = ""
 
     embedding: str | bytes = b""
@@ -14,6 +17,8 @@
     head_overlap_text: str = ""
     title: str | None = None
 
+    sentences: list[Sentence] = field(default_factory=list)
+
     def get_embedding_text(self) -> str:
         """Get the content used to generate the embedding from.
 
         It can be enriched with overlap text and prompt instructions,
diff --git a/src/sqlite_rag/models/document.py b/src/sqlite_rag/models/document.py
index e8e4685..535b08b 100644
--- a/src/sqlite_rag/models/document.py
+++ b/src/sqlite_rag/models/document.py
@@ -18,7 +18,7 @@ class Document:
     created_at: datetime | None = None
     updated_at: datetime | None = None
 
-    chunks: list["Chunk"] = field(default_factory=list)
+    chunks: list[Chunk] = field(default_factory=list)
 
     def hash(self) -> str:
         """Generate a hash for the document content using SHA-3 for maximum collision resistance"""
diff --git a/src/sqlite_rag/models/document_result.py b/src/sqlite_rag/models/document_result.py
index 2a89298..86a6f9a 100644
--- a/src/sqlite_rag/models/document_result.py
+++ b/src/sqlite_rag/models/document_result.py
@@ -1,12 +1,14 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 from .document import Document
+from .sentence_result import SentenceResult
 
 
 @dataclass
 class DocumentResult:
     document: Document
+    chunk_id: int
     snippet: str
 
     combined_rank: float
@@ -15,3 +17,6 @@ class DocumentResult:
 
     vec_distance: float | None = None
     fts_score: float | None = None
+
+    # highlight sentences
+    sentences: list[SentenceResult] = field(default_factory=list)
diff --git a/src/sqlite_rag/models/sentence.py b/src/sqlite_rag/models/sentence.py
new file mode 100644
index 0000000..a7f3d1e
--- /dev/null
+++ b/src/sqlite_rag/models/sentence.py
@@ -0,0 +1,11 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class Sentence:
+    id: int | None = None
+    content: str = ""
+    embedding: str | bytes = b""
+    sequence: int | None = None
+    start_offset: int | None = None
+    end_offset: int | None = None
diff --git a/src/sqlite_rag/models/sentence_result.py b/src/sqlite_rag/models/sentence_result.py
new file mode 100644
index 0000000..2718400
--- /dev/null
+++ b/src/sqlite_rag/models/sentence_result.py
@@ -0,0 +1,16 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class SentenceResult:
+    id: int | None = None
+    # content: str = ""
+
+    chunk_id: int | None = None
+    sequence: int | None = None
+
+    rank: float | None = None
+    distance: float | None = None
+
+    start_offset: int | None = None
+    end_offset: int | None = None
diff --git a/src/sqlite_rag/repository.py b/src/sqlite_rag/repository.py
index be1bc50..4f3e08a 100644
--- a/src/sqlite_rag/repository.py
+++ b/src/sqlite_rag/repository.py
@@ -32,11 +32,28 @@ def add_document(self, document: Document) -> str:
                 "INSERT INTO chunks (document_id, content, embedding) VALUES (?, ?, ?)",
                 (document_id, chunk.content, chunk.embedding),
             )
+
+            chunk_id = cursor.lastrowid
+
             cursor.execute(
-                "INSERT INTO chunks_fts (rowid, content) VALUES (last_insert_rowid(), ?)",
-                (chunk.content,),
+                "INSERT INTO chunks_fts (rowid, content) VALUES (?, ?)",
+                (chunk_id, chunk.content),
             )
 
+            for sentence in chunk.sentences:
+                cursor.execute(
+                    "INSERT INTO sentences (id, chunk_id, content, sequence, embedding, start_offset, end_offset) VALUES (?, ?, ?, ?, ?, ?, ?)",
+                    (
+                        str(uuid4()),
+                        chunk_id,
+                        sentence.content,
+                        sentence.sequence,
+                        sentence.embedding,
+                        sentence.start_offset,
+                        sentence.end_offset,
+                    ),
+                )
+
         self._conn.commit()
 
         return document_id
diff --git a/src/sqlite_rag/sentence_splitter.py b/src/sqlite_rag/sentence_splitter.py
new file mode 100644
index 0000000..75642eb
--- /dev/null
+++ b/src/sqlite_rag/sentence_splitter.py
@@ -0,0 +1,38 @@
+import re
+from typing import List
+
+from sqlite_rag.models.chunk import Chunk
+from sqlite_rag.models.sentence import Sentence
+
+
+class SentenceSplitter:
+
+    def split(self, chunk: Chunk) -> List[Sentence]:
+        """Split chunk into sentences."""
+        sentence_chunks = []
+
+        sentences = self._split_into_sentences(chunk.content)
+        start_offset = 0
+        end_offset = 0
+        for i, sentence in enumerate(sentences):
+            start_offset = chunk.content.index(sentence, end_offset)
+            end_offset = start_offset + len(sentence)
+
+            sentence_chunk = Sentence(
+                content=sentence,
+                sequence=i,
+                start_offset=start_offset,
+                end_offset=end_offset,
+            )
+            sentence_chunks.append(sentence_chunk)
+
+        return sentence_chunks
+
+    def _split_into_sentences(self, text: str) -> List[str]:
+        """Split into focused segments for semantic matching."""
+        # Split on: sentence endings, semicolons, or paragraph breaks
+        sentence_endings = re.compile(r'(?<=[.!?;])(?:"|\')?\s+(?=[A-Z])|[\n]{2,}')
+        sentences = sentence_endings.split(text)
+
+        # Keep segments that are substantial enough (20+ chars for meaningful matching)
+        return [s.strip() for s in sentences if len(s.strip()) > 20]
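Editor's note: a quick usage sketch for the splitter above. The offsets follow from the regex (a single newline matches the \s+ between sentences) and agree with tests/test_sentence_splitter.py later in this patch:

    from sqlite_rag.models.chunk import Chunk
    from sqlite_rag.sentence_splitter import SentenceSplitter

    chunk = Chunk(content="This is the first sentence.\nHere is the second sentence!")
    for s in SentenceSplitter().split(chunk):
        print(s.sequence, s.start_offset, s.end_offset, repr(s.content))
    # 0 0 27 'This is the first sentence.'
    # 1 28 56 'Here is the second sentence!'

Segments of 20 characters or fewer are dropped, so very short sentences never reach the sentences table.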
diff --git a/src/sqlite_rag/settings.py b/src/sqlite_rag/settings.py
index ef41fb2..42b39fc 100644
--- a/src/sqlite_rag/settings.py
+++ b/src/sqlite_rag/settings.py
@@ -15,11 +15,14 @@ class Settings:
         "./models/unsloth/embeddinggemma-300m-GGUF/embeddinggemma-300M-Q8_0.gguf"
     )
     # See: https://github.com/sqliteai/sqlite-ai/blob/main/API.md#llm_model_loadpath-text-options-text
-    model_options: str = ""
+    other_model_options: str = ""
+
     # See: https://github.com/sqliteai/sqlite-ai/blob/main/API.md#llm_context_createoptions-text
-    model_context_options: str = (
-        "generate_embedding=1,normalize_embedding=1,pooling_type=mean,embedding_type=INT8"
-    )
+    other_model_context_options: str = ""
+
+    # How the model pools token embeddings into a single embedding
+    # Options: "mean", "max", "min", "last", "first"
+    pooling_type: str = "mean"
 
     # Allow the sqlite-ai extension to use the GPU
     # See: https://github.com/sqliteai/sqlite-ai
@@ -27,14 +30,15 @@ class Settings:
 
     vector_type: str = "INT8"
     embedding_dim: int = 768
+
     other_vector_options: str = (
         "distance=cosine"  # e.g. distance=metric,other=value,...
     )
 
     # It includes the overlap size and the prompt template length
-    chunk_size: int = 512
+    chunk_size: int = 2048
     # Tokens overlap between chunks
-    chunk_overlap: int = 61
+    chunk_overlap: int = 256
 
     #
     # Search settings
@@ -46,7 +50,7 @@
     quantize_preload: bool = False
 
     # Weights for combining FTS and vector search results
-    weight_fts: float = 1.0
+    weight_fts: float = 1.5
     weight_vec: float = 1.0
 
     #
@@ -61,7 +65,7 @@
     # Template to index documents for retrieval, use `{title}` with the title or the string `"none"`
     prompt_template_retrieval_document: str = "title: {title} | text: {content}"
 
-    prompt_template_retrieval_query: str = "task: search result | query: {content}"
+    prompt_template_retrieval_query: str = 'title: "none" | text: {content}'
 
     #
     # Index settings
@@ -71,6 +75,31 @@
     max_document_size_bytes: int = 5 * 1024 * 1024  # 5 MB
     # Zero means no limit
     max_chunks_per_document: int = 1000
+    # Number of top sentences to return per document
+    top_k_sentences: int = 3
+
+    def get_embeddings_context_options(self) -> str:
+        """Get the context options for embeddings generation."""
+        options = {
+            "n_ctx": self.chunk_size,
+            "embedding_type": self.vector_type,
+            "pooling_type": self.pooling_type,
+            "generate_embedding": 1,
+            "normalize_embedding": 1,
+        }
+
+        return ",".join(f"{k}={v}" for k, v in options.items()) + (
+            f",{self.other_model_context_options}"
+            if self.other_model_context_options
+            else ""
+        )
+
+    def get_vector_init_options(self) -> str:
+        """Get the vector init options for the vector store."""
+        options = {"type": self.vector_type, "dimension": self.embedding_dim}
+        return ",".join(f"{k}={v}" for k, v in options.items()) + (
+            f",{self.other_vector_options}" if self.other_vector_options else ""
+        )
 
 
 class SettingsManager:
@@ -177,4 +206,5 @@ def has_critical_changes(
         new_settings.model_path != current_settings.model_path
         or new_settings.embedding_dim != current_settings.embedding_dim
         or new_settings.vector_type != current_settings.vector_type
+        or new_settings.pooling_type != current_settings.pooling_type
     )
diff --git a/src/sqlite_rag/sqliterag.py b/src/sqlite_rag/sqliterag.py
index 8be35b6..4b14e89 100644
--- a/src/sqlite_rag/sqliterag.py
+++ b/src/sqlite_rag/sqliterag.py
@@ -1,3 +1,4 @@
+import re
 import sqlite3
 from dataclasses import asdict
 from pathlib import Path
@@ -6,6 +7,7 @@
 from sqlite_rag.extractor import Extractor
 from sqlite_rag.logger import Logger
 from sqlite_rag.models.document_result import DocumentResult
+from sqlite_rag.sentence_splitter import SentenceSplitter
 
 from .chunker import Chunker
 from .database import Database
@@ -25,7 +27,12 @@ def __init__(self, connection: sqlite3.Connection, settings: Settings):
 
         self._repository = Repository(self._conn, settings)
         self._chunker = Chunker(self._conn, settings)
-        self._engine = Engine(self._conn, settings, chunker=self._chunker)
+        self._engine = Engine(
+            self._conn,
+            settings,
+            chunker=self._chunker,
+            sentence_chunker=SentenceSplitter(),
+        )
         self._extractor = Extractor()
 
         self.ready = False
@@ -310,10 +317,25 @@ def search(
         if new_context:
             self._engine.create_new_context()
 
+        semantic_query = query
         if self._settings.use_prompt_templates:
-            query = self._settings.prompt_template_retrieval_query.format(content=query)
+            semantic_query = self._settings.prompt_template_retrieval_query.format(
+                content=query
+            )
+
+        # Clean up and split into words
+        # '*' is used to match while typing
+        fts_query = " ".join(re.findall(r"\b\w+\b", query.lower())) + "*"
+
+        results = 
self._engine.search(semantic_query, fts_query, top_k=top_k) + + # Refine chunks with top sentences + for result in results: + result.sentences = self._engine.search_sentences( + semantic_query, result.chunk_id, k=self._settings.top_k_sentences + ) - return self._engine.search(query, top_k=top_k) + return results def get_settings(self) -> dict: """Get settings and more useful information""" diff --git a/tests/test_engine.py b/tests/test_engine.py index 2f38f35..0de2517 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -204,6 +204,10 @@ def test_search_fts_results(self, db_conn): assert len(results) > 0 assert doc1_id == results[0].document.id + assert results[0].fts_rank + assert results[0].fts_rank == 1 + assert results[0].fts_score + assert results[0].fts_score > 0 def test_search_without_quantization(self, db_conn): # Arrange diff --git a/tests/test_sentence_splitter.py b/tests/test_sentence_splitter.py new file mode 100644 index 0000000..09bb151 --- /dev/null +++ b/tests/test_sentence_splitter.py @@ -0,0 +1,71 @@ +from sqlite_rag.models.chunk import Chunk +from sqlite_rag.sentence_splitter import SentenceSplitter + + +class TestSentenceSplitter: + def test_split(self): + + splitter = SentenceSplitter() + + chunk = Chunk( + id=1, + document_id=1, + title="Test Chunk", + content="This is the first sentence.\nHere is the second sentence! And what about the third?", + embedding=b"", + sentences=[], + ) + + sentences = splitter.split(chunk) + + assert len(sentences) == 3 + assert sentences[0].content == "This is the first sentence." + assert sentences[0].sequence == 0 + assert sentences[0].start_offset == 0 + assert sentences[0].end_offset == 27 + + assert sentences[1].content == "Here is the second sentence!" + assert sentences[1].sequence == 1 + assert sentences[1].start_offset == 28 + assert sentences[1].end_offset == 28 + 28 + + assert sentences[2].content == "And what about the third?" 
+ assert sentences[2].sequence == 2 + assert sentences[2].start_offset == 57 + assert sentences[2].end_offset == 57 + 25 + + def test_split_empty(self): + splitter = SentenceSplitter() + + chunk = Chunk( + id=1, + document_id=1, + title="Empty Chunk", + content="", + embedding=b"", + sentences=[], + ) + + sentences = splitter.split(chunk) + + assert len(sentences) == 0 + + def test_split_no_punctuation(self): + splitter = SentenceSplitter() + + chunk = Chunk( + id=1, + document_id=1, + title="No Punctuation Chunk", + content="This is a sentence without punctuation and another one follows it", + embedding=b"", + sentences=[], + ) + + sentences = splitter.split(chunk) + + assert len(sentences) == 1 + assert sentences[0].content == chunk.content + assert sentences[0].sequence == 0 + assert sentences[0].start_offset == 0 + assert sentences[0].end_offset == len(chunk.content) diff --git a/tests/test_settings.py b/tests/test_settings.py index c8b6e3e..a26f0eb 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -9,7 +9,7 @@ def test_store_settings(self, db_conn): settings_manager = SettingsManager(db_conn[0]) settings = Settings( model_path="test_model", - model_options="test_config", + other_model_options="test_config", embedding_dim=768, vector_type="test_store", chunk_overlap=100, @@ -23,7 +23,7 @@ def test_store_settings(self, db_conn): assert stored_settings is not None assert stored_settings.model_path == "test_model" - assert stored_settings.model_options == "test_config" + assert stored_settings.other_model_options == "test_config" assert stored_settings.embedding_dim == 768 assert stored_settings.vector_type == "test_store" assert stored_settings.chunk_overlap == 100 @@ -34,7 +34,7 @@ def test_store_settings_when_exist(self, db_conn): settings_manager = SettingsManager(db_conn[0]) settings = Settings( model_path="test_model", - model_options="test_config", + other_model_options="test_config", embedding_dim=768, vector_type="test_store", chunk_overlap=100, @@ -47,7 +47,7 @@ def test_store_settings_when_exist(self, db_conn): # Store again with different values new_settings = Settings( model_path="new_model", - model_options="new_config", + other_model_options="new_config", embedding_dim=512, vector_type="new_store", chunk_overlap=50, @@ -60,7 +60,7 @@ def test_store_settings_when_exist(self, db_conn): assert stored_settings is not None assert stored_settings.model_path == "new_model" - assert stored_settings.model_options == "new_config" + assert stored_settings.other_model_options == "new_config" assert stored_settings.embedding_dim == 512 assert stored_settings.vector_type == "new_store" assert stored_settings.chunk_overlap == 50 @@ -82,7 +82,7 @@ def test_load_settings_with_defaults(self, db_conn): assert loaded_settings is not None assert loaded_settings.model_path == settings.model_path - assert loaded_settings.model_options == settings.model_options + assert loaded_settings.other_model_options == settings.other_model_options assert loaded_settings.embedding_dim == settings.embedding_dim assert loaded_settings.vector_type == settings.vector_type assert loaded_settings.chunk_overlap == settings.chunk_overlap From 20731f3ee3ee61d3a9c30593a80a2da7ff920d4f Mon Sep 17 00:00:00 2001 From: Daniele Briggi <=> Date: Fri, 17 Oct 2025 14:22:58 +0000 Subject: [PATCH 2/7] refact(formatters): use sentences --- src/sqlite_rag/cli.py | 11 +- src/sqlite_rag/database.py | 1 - src/sqlite_rag/engine.py | 3 - src/sqlite_rag/formatters.py | 259 +++++++++++------------ 
src/sqlite_rag/models/sentence.py | 1 - src/sqlite_rag/models/sentence_result.py | 3 - src/sqlite_rag/repository.py | 3 +- src/sqlite_rag/sentence_splitter.py | 8 +- 8 files changed, 132 insertions(+), 157 deletions(-) diff --git a/src/sqlite_rag/cli.py b/src/sqlite_rag/cli.py index 13278cc..7035ff5 100644 --- a/src/sqlite_rag/cli.py +++ b/src/sqlite_rag/cli.py @@ -439,17 +439,12 @@ def reset( def search( ctx: typer.Context, query: str, - limit: int = typer.Option(10, help="Number of results to return"), + limit: int = typer.Option(5, help="Number of results to return"), debug: bool = typer.Option( False, "-d", "--debug", - help="Print extra debug information with modern formatting", - ), - debug2: bool = typer.Option( - False, - "--debug2", - help="Print debug format with sentence-level details and snippet context", + help="Print extra debug information with sentence-level details", ), peek: bool = typer.Option( False, "--peek", help="Print debug information using compact table format" @@ -467,7 +462,7 @@ def search( results = results[:limit] # Get the appropriate formatter and display results - formatter = get_formatter(debug=debug, debug2=debug2, table_view=peek) + formatter = get_formatter(debug=debug, table_view=peek) formatter.format_results(results, query) typer.echo(f"{search_time:.3f} seconds") diff --git a/src/sqlite_rag/database.py b/src/sqlite_rag/database.py index 7bdd357..b5ed743 100644 --- a/src/sqlite_rag/database.py +++ b/src/sqlite_rag/database.py @@ -96,7 +96,6 @@ def _create_schema(conn: sqlite3.Connection, settings: Settings): chunk_id INTEGER, content TEXT, embedding BLOB, - sequence INTEGER, start_offset INTEGER, end_offset INTEGER ) diff --git a/src/sqlite_rag/engine.py b/src/sqlite_rag/engine.py index 840992c..2da3169 100644 --- a/src/sqlite_rag/engine.py +++ b/src/sqlite_rag/engine.py @@ -245,7 +245,6 @@ def search_sentences( row_number() OVER (ORDER BY v.distance) AS rank_number, v.distance, sentences.content as sentence_content, - sentences.sequence as sentence_sequence, sentences.start_offset as sentence_start_offset, sentences.end_offset as sentence_end_offset FROM {vector_scan_type}('sentences', 'embedding', :query_embedding) AS v @@ -256,7 +255,6 @@ def search_sentences( SELECT sentence_id, sentence_content, - sentence_sequence, sentence_start_offset, sentence_end_offset, rank_number, @@ -278,7 +276,6 @@ def search_sentences( SentenceResult( id=row["sentence_id"], chunk_id=chunk_id, - sequence=row["sentence_sequence"], rank=row["rank_number"], distance=row["distance"], start_offset=row["sentence_start_offset"], diff --git a/src/sqlite_rag/formatters.py b/src/sqlite_rag/formatters.py index 27bf026..ca3b0df 100644 --- a/src/sqlite_rag/formatters.py +++ b/src/sqlite_rag/formatters.py @@ -82,6 +82,81 @@ def _format_uri_display(self, uri: str, icon: str, max_width: int = 75) -> str: uri_display = f"{icon} ...{uri[-available_width:]}" return uri_display + def _build_sentence_preview( + self, + chunk_content: str, + sentences: List[SentenceResult], + max_chars: int = 400, + ) -> str: + """Build preview from top 3 ranked sentences with [...] for gaps. + + Args: + chunk_content: The full chunk text + sentences: List of SentenceResult objects (should already be sorted by rank) + max_chars: Maximum total characters for preview + + Returns: + Preview string with top sentences and [...] separators. + Falls back to truncated chunk_content if sentences have no offsets. 
+ """ + + # Take top 3 sentences (they should already be sorted by rank/distance) + top_sentences = sentences[:3] if sentences else [] + + if not top_sentences: + # Fallback: no sentences, return truncated chunk content + return chunk_content[:max_chars] + + # Filter sentences that have offset information + sentences_with_offsets = [ + s + for s in top_sentences + if s.start_offset is not None and s.end_offset is not None + ] + + if not sentences_with_offsets: + # Fallback: sentences exist but no offset information, return truncated chunk content + return chunk_content[:max_chars] + + # Sort by start_offset to maintain document order + sentences_with_offsets.sort( + key=lambda s: s.start_offset if s.start_offset is not None else -1 + ) + + preview_parts = [] + total_chars = 0 + prev_end_offset = None + + for sentence in sentences_with_offsets: + # Extract sentence text using offsets + sentence_text = chunk_content[ + sentence.start_offset : sentence.end_offset + ].strip() + + # Calculate remaining budget including potential separator + separator_len = len(" [...] ") if preview_parts else 0 + remaining = max_chars - total_chars - separator_len + + if remaining <= 0: + break + + # Truncate sentence if needed + if len(sentence_text) > remaining: + sentence_text = sentence_text[: remaining - 3] + "..." + + # Check if there's a gap > 10 chars from previous sentence + if prev_end_offset is not None and sentence.start_offset is not None: + gap_size = sentence.start_offset - prev_end_offset + if gap_size > 10: + preview_parts.append("[...]") + total_chars += len(" [...] ") + + preview_parts.append(sentence_text) + total_chars += len(sentence_text) + prev_end_offset = sentence.end_offset + + return " ".join(preview_parts) + class BoxedFormatter(SearchResultFormatter): """Base class for boxed result formatters.""" @@ -100,8 +175,15 @@ def format_results(self, results: List[DocumentResult], query: str) -> None: def _format_single_result(self, doc: DocumentResult, idx: int) -> None: """Format a single result with box layout.""" icon = self._get_file_icon(doc.document.uri or "") + + # Use sentence-based preview if sentences are available + if doc.sentences: + snippet_text = self._build_sentence_preview(doc.snippet, doc.sentences) + else: + snippet_text = doc.snippet + snippet_lines = self._clean_and_wrap_snippet( - doc.snippet, width=75, max_length=400 + snippet_text, width=75, max_length=400 ) # Draw the result box header @@ -164,33 +246,19 @@ def _get_debug_line(self, doc: DocumentResult) -> str: def _should_show_debug(self) -> bool: return True + def _format_single_result(self, doc: DocumentResult, idx: int) -> None: + """Format a single result with box layout including sentence summary.""" + icon = self._get_file_icon(doc.document.uri or "") -class BoxedDebug2Formatter(BoxedFormatter): - """Debug formatter showing sentence-level details with snippet preview from top sentences.""" + # Use sentence-based preview if sentences are available + if doc.sentences: + snippet_text = self._build_sentence_preview(doc.snippet, doc.sentences) + else: + snippet_text = doc.snippet - def _get_debug_line(self, doc: DocumentResult) -> str: - """Format debug metrics line.""" - combined = ( - f"{doc.combined_rank:.5f}" if doc.combined_rank is not None else "N/A" - ) - vec_info = ( - f"#{doc.vec_rank} ({doc.vec_distance:.6f})" - if doc.vec_rank is not None - else "N/A" - ) - fts_info = ( - f"#{doc.fts_rank} ({doc.fts_score:.6f})" - if doc.fts_rank is not None - else "N/A" + snippet_lines = self._clean_and_wrap_snippet( 
+ snippet_text, width=75, max_length=400 ) - return f"│ Combined: {combined} │ Vector: {vec_info} │ FTS: {fts_info}" - - def _should_show_debug(self) -> bool: - return True - - def _format_single_result(self, doc: DocumentResult, idx: int) -> None: - """Format a single result with box layout including sentence details.""" - icon = self._get_file_icon(doc.document.uri or "") # Draw the result box header header = f"┌─ Result #{idx} " + "─" * (67 - len(str(idx))) @@ -213,26 +281,18 @@ def _format_single_result(self, doc: DocumentResult, idx: int) -> None: typer.echo(debug_line) typer.echo("├" + "─" * 77 + "┤") - # Display snippet preview from top sentences - if doc.sentences: - snippet_preview = self._build_sentence_preview(doc.snippet, doc.sentences) - preview_lines = self._clean_and_wrap_snippet( - snippet_preview, width=75, max_length=400 - ) - - typer.echo( - "│ Preview (top 3 sentences): │" - ) - for line in preview_lines: - typer.echo(f"│ {line:<75} │") + # Display snippet preview + for line in snippet_lines: + typer.echo(f"│ {line:<75} │") + # Display sentence details if available + if doc.sentences: typer.echo("├" + "─" * 77 + "┤") typer.echo( "│ Sentences: │" ) - # Display sentences with their distances - for i, sentence in enumerate(doc.sentences, 1): + for sentence in doc.sentences[:5]: # Show max 5 sentences distance_str = ( f"{sentence.distance:.6f}" if sentence.distance is not None @@ -240,112 +300,33 @@ def _format_single_result(self, doc: DocumentResult, idx: int) -> None: ) rank_str = f"#{sentence.rank}" if sentence.rank is not None else "N/A" - # Format sentence header - sentence_header = ( - f"│ {i}. [Rank: {rank_str}, Distance: {distance_str}]" - ) - typer.echo(sentence_header.ljust(78) + " │") - - # Extract sentence text using offsets from the chunk snippet + # Extract sentence preview (first 50 chars) if ( sentence.start_offset is not None and sentence.end_offset is not None ): sentence_text = doc.snippet[ sentence.start_offset : sentence.end_offset - ] + ].strip() + # Truncate and clean for display + sentence_preview = sentence_text.replace("\n", " ").replace( + "\r", "" + ) + if len(sentence_preview) > 50: + sentence_preview = sentence_preview[:47] + "..." else: - sentence_text = "[No offset information available]" + sentence_preview = "[No offset info]" - # Wrap and display sentence content - sentence_lines = self._clean_and_wrap_snippet( - sentence_text, width=72, max_length=400 + # Format sentence line + sentence_line = ( + f"│ {rank_str:>3} ({distance_str}) | {sentence_preview}" ) - for line in sentence_lines: - typer.echo(f"│ {line:<72} │") - else: - # Fallback to regular snippet display if no sentences - snippet_lines = self._clean_and_wrap_snippet( - doc.snippet, width=75, max_length=400 - ) - for line in snippet_lines: - typer.echo(f"│ {line:<75} │") + # Pad to 78 chars and add closing border + typer.echo(sentence_line.ljust(78) + " │") typer.echo("└" + "─" * 77 + "┘") typer.echo() - def _build_sentence_preview( - self, - chunk_content: str, - sentences: List[SentenceResult], - max_chars: int = 400, - ) -> str: - """Build preview from top 3 ranked sentences with [...] for gaps. - - Args: - chunk_content: The full chunk text - sentences: List of SentenceResult objects (should already be sorted by rank) - max_chars: Maximum total characters for preview - - Returns: - Preview string with top sentences and [...] 
separators - """ - - # Take top 3 sentences (they should already be sorted by rank/distance) - top_sentences = sentences[:3] - - if not top_sentences: - return chunk_content[:max_chars] - - # Sort sentences by their position in the chunk (using start_offset) - # so we can build a preview in the order they appear - sentences_with_offsets = [ - s - for s in top_sentences - if s.start_offset is not None and s.end_offset is not None - ] - - if not sentences_with_offsets: - # Fallback: no offset information, return truncated chunk content - return chunk_content[:max_chars] - - # Sort by start_offset to maintain document order - sentences_with_offsets.sort(key=lambda s: s.start_offset) - - preview_parts = [] - total_chars = 0 - prev_end_offset = None - - for sentence in sentences_with_offsets: - # Extract sentence text using offsets - sentence_text = chunk_content[ - sentence.start_offset : sentence.end_offset - ].strip() - - # Calculate remaining budget including potential separator - separator_len = len(" [...] ") if preview_parts else 0 - remaining = max_chars - total_chars - separator_len - - if remaining <= 0: - break - - # Truncate sentence if needed - if len(sentence_text) > remaining: - sentence_text = sentence_text[: remaining - 3] + "..." - - # Check if there's a gap > 10 chars from previous sentence - if prev_end_offset is not None: - gap_size = sentence.start_offset - prev_end_offset - if gap_size > 10: - preview_parts.append("[...]") - total_chars += len(" [...] ") - - preview_parts.append(sentence_text) - total_chars += len(sentence_text) - prev_end_offset = sentence.end_offset - - return " ".join(preview_parts) - class TableDebugFormatter(SearchResultFormatter): """Table view debug formatter.""" @@ -383,8 +364,16 @@ def _print_table_header(self) -> None: def _print_table_row(self, idx: int, doc: DocumentResult) -> None: """Print a single table row.""" + # Use sentence-based preview if sentences are available + if doc.sentences: + snippet = self._build_sentence_preview( + doc.snippet, doc.sentences, max_chars=52 + ) + else: + snippet = doc.snippet + # Clean snippet display - snippet = doc.snippet.replace("\n", " ").replace("\r", "") + snippet = snippet.replace("\n", " ").replace("\r", "") snippet = snippet[:49] + "..." 
if len(snippet) > 52 else snippet # Clean URI display @@ -409,13 +398,11 @@ def _print_table_row(self, idx: int, doc: DocumentResult) -> None: def get_formatter( - debug: bool = False, debug2: bool = False, table_view: bool = False + debug: bool = False, table_view: bool = False ) -> SearchResultFormatter: """Factory function to get the appropriate formatter.""" if table_view: return TableDebugFormatter() - elif debug2: - return BoxedDebug2Formatter() elif debug: return BoxedDebugFormatter() else: diff --git a/src/sqlite_rag/models/sentence.py b/src/sqlite_rag/models/sentence.py index a7f3d1e..064b233 100644 --- a/src/sqlite_rag/models/sentence.py +++ b/src/sqlite_rag/models/sentence.py @@ -6,6 +6,5 @@ class Sentence: id: int | None = None content: str = "" embedding: str | bytes = b"" - sequence: int | None = None start_offset: int | None = None end_offset: int | None = None diff --git a/src/sqlite_rag/models/sentence_result.py b/src/sqlite_rag/models/sentence_result.py index 2718400..7094efd 100644 --- a/src/sqlite_rag/models/sentence_result.py +++ b/src/sqlite_rag/models/sentence_result.py @@ -4,10 +4,7 @@ @dataclass class SentenceResult: id: int | None = None - # content: str = "" - chunk_id: int | None = None - sequence: int | None = None rank: float | None = None distance: float | None = None diff --git a/src/sqlite_rag/repository.py b/src/sqlite_rag/repository.py index 4f3e08a..005bf80 100644 --- a/src/sqlite_rag/repository.py +++ b/src/sqlite_rag/repository.py @@ -42,12 +42,11 @@ def add_document(self, document: Document) -> str: for sentence in chunk.sentences: cursor.execute( - "INSERT INTO sentences (id, chunk_id, content, sequence, embedding, start_offset, end_offset) VALUES (?, ?, ?, ?, ?, ?, ?)", + "INSERT INTO sentences (id, chunk_id, content, embedding, start_offset, end_offset) VALUES (?, ?, ?, ?, ?, ?)", ( str(uuid4()), chunk_id, sentence.content, - sentence.sequence, sentence.embedding, sentence.start_offset, sentence.end_offset, diff --git a/src/sqlite_rag/sentence_splitter.py b/src/sqlite_rag/sentence_splitter.py index 75642eb..e22b74f 100644 --- a/src/sqlite_rag/sentence_splitter.py +++ b/src/sqlite_rag/sentence_splitter.py @@ -6,6 +6,7 @@ class SentenceSplitter: + MIN_CHARS_PER_SENTENCE = 20 def split(self, chunk: Chunk) -> List[Sentence]: """Split chunk into sentences.""" @@ -14,13 +15,12 @@ def split(self, chunk: Chunk) -> List[Sentence]: sentences = self._split_into_sentences(chunk.content) start_offset = 0 end_offset = 0 - for i, sentence in enumerate(sentences): + for sentence in sentences: start_offset = chunk.content.index(sentence, end_offset) end_offset = start_offset + len(sentence) sentence_chunk = Sentence( content=sentence, - sequence=i, start_offset=start_offset, end_offset=end_offset, ) @@ -35,4 +35,6 @@ def _split_into_sentences(self, text: str) -> List[str]: sentences = sentence_endings.split(text) # Keep segments that are substantial enough (20+ chars for meaningful matching) - return [s.strip() for s in sentences if len(s.strip()) > 20] + return [ + s.strip() for s in sentences if len(s.strip()) > self.MIN_CHARS_PER_SENTENCE + ] From fa06d146cd222d427fdc5462a5af5a296656f07a Mon Sep 17 00:00:00 2001 From: Daniele Briggi <=> Date: Fri, 17 Oct 2025 15:30:56 +0000 Subject: [PATCH 3/7] fix(tests): engine tests --- src/sqlite_rag/engine.py | 16 +- src/sqlite_rag/sentence_splitter.py | 18 +- src/sqlite_rag/sqliterag.py | 4 +- tests/conftest.py | 8 +- tests/integration/test_engine.py | 252 ++++++++++++++++++++++++++++ tests/test_chunker.py | 4 +- 
tests/test_engine.py | 217 ++++++------------------ 7 files changed, 328 insertions(+), 191 deletions(-) diff --git a/src/sqlite_rag/engine.py b/src/sqlite_rag/engine.py index 2da3169..f8ce76e 100644 --- a/src/sqlite_rag/engine.py +++ b/src/sqlite_rag/engine.py @@ -22,12 +22,12 @@ def __init__( conn: sqlite3.Connection, settings: Settings, chunker: Chunker, - sentence_chunker: SentenceSplitter, + sentence_splitter: SentenceSplitter, ): self._conn = conn self._settings = settings self._chunker = chunker - self._sentence_chunker = sentence_chunker + self._sentence_splitter = sentence_splitter self._logger = Logger() def load_model(self): @@ -55,7 +55,7 @@ def process(self, document: Document) -> Document: chunk.title = document.get_title() chunk.embedding = self.generate_embedding(chunk.get_embedding_text()) - sentences = self._sentence_chunker.split(chunk) + sentences = self._sentence_splitter.split(chunk) for sentence in sentences: sentence.embedding = self.generate_embedding(sentence.content) chunk.sentences = sentences @@ -225,7 +225,7 @@ def search( return results def search_sentences( - self, query: str, chunk_id: int, k: int + self, query: str, chunk_id: int, top_k: int ) -> List[SentenceResult]: query_embedding = self.generate_embedding(query) @@ -250,7 +250,8 @@ def search_sentences( FROM {vector_scan_type}('sentences', 'embedding', :query_embedding) AS v JOIN sentences ON sentences.rowid = v.rowid WHERE sentences.chunk_id = :chunk_id - LIMIT :k + ORDER BY rank_number ASC + LIMIT :top_k ) SELECT sentence_id, @@ -260,11 +261,10 @@ def search_sentences( rank_number, distance FROM vec_matches - ORDER BY rank_number ASC """, # nosec B608 { "query_embedding": query_embedding, - "k": k, + "top_k": top_k, "chunk_id": chunk_id, }, ) @@ -283,7 +283,7 @@ def search_sentences( ) ) - return sentences[:k] + return sentences[:top_k] def versions(self) -> dict: """Get versions of the loaded extensions.""" diff --git a/src/sqlite_rag/sentence_splitter.py b/src/sqlite_rag/sentence_splitter.py index e22b74f..dc5e82a 100644 --- a/src/sqlite_rag/sentence_splitter.py +++ b/src/sqlite_rag/sentence_splitter.py @@ -10,23 +10,23 @@ class SentenceSplitter: def split(self, chunk: Chunk) -> List[Sentence]: """Split chunk into sentences.""" - sentence_chunks = [] + sentences = [] - sentences = self._split_into_sentences(chunk.content) + sentences_text = self._split_into_sentences(chunk.content) start_offset = 0 end_offset = 0 - for sentence in sentences: - start_offset = chunk.content.index(sentence, end_offset) - end_offset = start_offset + len(sentence) + for sentence_text in sentences_text: + start_offset = chunk.content.index(sentence_text, end_offset) + end_offset = start_offset + len(sentence_text) - sentence_chunk = Sentence( - content=sentence, + sentence = Sentence( + content=sentence_text, start_offset=start_offset, end_offset=end_offset, ) - sentence_chunks.append(sentence_chunk) + sentences.append(sentence) - return sentence_chunks + return sentences def _split_into_sentences(self, text: str) -> List[str]: """Split into focused segments for semantic matching.""" diff --git a/src/sqlite_rag/sqliterag.py b/src/sqlite_rag/sqliterag.py index 4b14e89..da2078b 100644 --- a/src/sqlite_rag/sqliterag.py +++ b/src/sqlite_rag/sqliterag.py @@ -31,7 +31,7 @@ def __init__(self, connection: sqlite3.Connection, settings: Settings): self._conn, settings, chunker=self._chunker, - sentence_chunker=SentenceSplitter(), + sentence_splitter=SentenceSplitter(), ) self._extractor = Extractor() @@ -332,7 +332,7 @@ def 
search( # Refine chunks with top sentences for result in results: result.sentences = self._engine.search_sentences( - semantic_query, result.chunk_id, k=self._settings.top_k_sentences + semantic_query, result.chunk_id, top_k=self._settings.top_k_sentences ) return results diff --git a/tests/conftest.py b/tests/conftest.py index 477832d..04a3596 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,7 @@ from sqlite_rag.chunker import Chunker from sqlite_rag.database import Database from sqlite_rag.engine import Engine +from sqlite_rag.sentence_splitter import SentenceSplitter from sqlite_rag.settings import Settings @@ -28,7 +29,12 @@ def db_conn(): def engine(db_conn) -> Engine: conn, settings = db_conn - engine = Engine(conn, settings, chunker=Chunker(conn, settings)) + engine = Engine( + conn, + settings, + chunker=Chunker(conn, settings), + sentence_splitter=SentenceSplitter(), + ) engine.load_model() engine.quantize() engine.create_new_context() diff --git a/tests/integration/test_engine.py b/tests/integration/test_engine.py index 9b99ff6..d9ab3ea 100644 --- a/tests/integration/test_engine.py +++ b/tests/integration/test_engine.py @@ -1,8 +1,15 @@ import random import string +from sqlite3 import OperationalError import pytest +from sqlite_rag.chunker import Chunker +from sqlite_rag.engine import Engine +from sqlite_rag.models.document import Document +from sqlite_rag.repository import Repository +from sqlite_rag.sentence_splitter import SentenceSplitter + class TestEngine: @pytest.mark.slow @@ -26,3 +33,248 @@ def random_string(length=30): # Assert assert len(result_chunks) == 1000 + + +class TestEngineQuantization: + def test_quantize_embedding(self, engine): + """Test quantize called for chunks and sentences embeddings.""" + engine.quantize() + + # If no exception is raised, the test passes + engine.search("test query", "test query") + + def test_quantize_cleanup(self, engine): + """Test quantize cleanup works without errors.""" + engine.quantize() + engine.quantize_cleanup() + + with pytest.raises(OperationalError) as exc_info: + engine.search("test query", "test query") + assert "Ensure that vector_quantize() has been called" in str(exc_info.value) + + +class TestEngineSearch: + def test_search_with_empty_database(self, engine): + results = engine.search("nonexistent query", top_k=5) + + assert len(results) == 0 + + def test_search_with_semantic_and_fts(self, db_conn): + # Arrange + conn, settings = db_conn + + engine = Engine(conn, settings, Chunker(conn, settings), SentenceSplitter()) + engine.load_model() + engine.create_new_context() + + doc1 = Document( + content="The quick brown fox jumps over the lazy dog.", + uri="document1.txt", + ) + doc2 = Document( + content="How much wood would a woodchuck chuck if a woodchuck could chuck wood?", + uri="document2.txt", + ) + doc3 = Document( + content="This document discusses about woodcutters and wood.", + uri="document3.txt", + ) + + engine.process(doc1) + engine.process(doc2) + engine.process(doc3) + + repository = Repository(conn, settings) + repository.add_document(doc1) + repository.add_document(doc2) + doc3_id = repository.add_document(doc3) + + engine.quantize() + + # Act + results = engine.search("wood lumberjack", "wood lumberjack", top_k=5) + + assert len(results) > 0 + assert doc3_id == results[0].document.id + + def test_search_semantic_result(self, db_conn): + # Arrange + conn, settings = db_conn + + engine = Engine(conn, settings, Chunker(conn, settings), SentenceSplitter()) + engine.load_model() + 
engine.create_new_context()
+
+        doc1 = Document(
+            content="The quick brown fox jumps over the lazy dog.",
+            uri="document1.txt",
+        )
+        doc2 = Document(
+            content="How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
+            uri="document2.txt",
+        )
+        doc3 = Document(
+            content="This document discusses about woodcutters and wood.",
+            uri="document3.txt",
+        )
+
+        engine.process(doc1)
+        engine.process(doc2)
+        engine.process(doc3)
+
+        repository = Repository(conn, settings)
+        repository.add_document(doc1)
+        repository.add_document(doc2)
+        doc3_id = repository.add_document(doc3)
+
+        engine.quantize()
+
+        # Act
+        results = engine.search("about lumberjack", "about lumberjack", top_k=5)
+
+        assert len(results) > 0
+        assert doc3_id == results[0].document.id
+
+    def test_search_fts_results(self, db_conn):
+        # Arrange
+        conn, settings = db_conn
+
+        engine = Engine(conn, settings, Chunker(conn, settings), SentenceSplitter())
+        engine.load_model()
+        engine.create_new_context()
+
+        doc1 = Document(
+            content="The quick brown fox jumps over the lazy dog.",
+            uri="document1.txt",
+        )
+        doc2 = Document(
+            content="How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
+            uri="document2.txt",
+        )
+        doc3 = Document(
+            content="This document discusses about woodcutters and wood.",
+            uri="document3.txt",
+        )
+
+        engine.process(doc1)
+        engine.process(doc2)
+        engine.process(doc3)
+
+        repository = Repository(conn, settings)
+        doc1_id = repository.add_document(doc1)
+        repository.add_document(doc2)
+        repository.add_document(doc3)
+
+        engine.quantize()
+
+        # Act
+        results = engine.search("quick brown fox", "quick brown fox", top_k=5)
+
+        assert len(results) > 0
+        assert doc1_id == results[0].document.id
+        assert results[0].fts_rank
+        assert results[0].fts_rank == 1
+        assert results[0].fts_score
+
+    def test_search_without_quantization(self, db_conn):
+        # Arrange
+        conn, settings = db_conn
+        settings.quantize_scan = False
+
+        engine = Engine(conn, settings, Chunker(conn, settings), SentenceSplitter())
+        engine.load_model()
+
+        doc = Document(
+            content="The quick brown fox jumps over the lazy dog.",
+            uri="document1.txt",
+        )
+
+        engine.create_new_context()
+        engine.process(doc)
+
+        repository = Repository(conn, settings)
+        doc_id = repository.add_document(doc)
+
+        # Act
+        results = engine.search("wood lumberjack", "wood lumberjack")
+
+        assert len(results) > 0
+        assert doc_id == results[0].document.id
+
+    def test_search_exact_match(self, db_conn):
+        conn, settings = db_conn
+        # cosine distance for the search embedding is exactly 0.0 when strings match
+        settings.other_vector_options = "distance=cosine"
+        settings.use_prompt_templates = False
+
+        engine = Engine(conn, settings, Chunker(conn, settings), SentenceSplitter())
+        engine.load_model()
+        engine.create_new_context()
+
+        doc1 = Document(
+            content="The quick brown fox jumps over the lazy dog",
+            uri="document1.txt",
+        )
+        doc2 = Document(
+            content="How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
+            uri="document2.txt",
+        )
+
+        engine.process(doc1)
+        engine.process(doc2)
+
+        repository = Repository(conn, settings)
+        doc1_id = repository.add_document(doc1)
+        repository.add_document(doc2)
+
+        engine.quantize()
+
+        # Act
+        results = engine.search(
+            "The quick brown fox jumps over the lazy dog",
+            "The quick brown fox jumps over the lazy dog",
+        )
+
+        assert len(results) > 0
+        assert doc1_id == results[0].document.id
+        assert 0.0 == results[0].vec_distance
+
+
+class TestEngineSearchSentences:
+    def 
test_search_sentences(self, db_conn): + conn, settings = db_conn + settings.use_prompt_templates = False + settings.quantize_scan = False + + engine = Engine(conn, settings, Chunker(conn, settings), SentenceSplitter()) + engine.load_model() + engine.create_new_context() + + doc = Document( + content=( + """The quick brown fox jumps over the lazy dog. + A stitch in time saves nine. + An apple a day keeps the doctor away. + """ + ), + uri="document1.txt", + ) + + engine.process(doc) + + repository = Repository(conn, settings) + doc_id = repository.add_document(doc) + + cursor = conn.execute("SELECT id FROM chunks WHERE document_id = ?", (doc_id,)) + chunk_id = cursor.fetchone()[0] + + # Act + results = engine.search_sentences( + "stitch time", + chunk_id, + top_k=1, + ) + + assert len(results) > 0 + assert results[0].start_offset == 61 # it's the second sentence + assert results[0].end_offset == 89 diff --git a/tests/test_chunker.py b/tests/test_chunker.py index 8c54949..792e21c 100644 --- a/tests/test_chunker.py +++ b/tests/test_chunker.py @@ -322,9 +322,9 @@ def test_chunk_size_equals_overlap(self, mock_conn): chunker = Chunker(mock_conn, settings) text = "This is a test sentence that should be handled gracefully." - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError) as exc_info: chunker.chunk(Document(content=text)) - assert "Chunk size must be greater than chunk overlap." in str(excinfo.value) + assert "Chunk size must be greater than chunk overlap." in str(exc_info.value) def test_very_small_chunk_size(self, mock_conn): """Test with chunk_size = 1.""" diff --git a/tests/test_engine.py b/tests/test_engine.py index 0de2517..7f7ead6 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -1,10 +1,9 @@ import pytest -from sqlite_rag.chunker import Chunker from sqlite_rag.engine import Engine from sqlite_rag.models.chunk import Chunk from sqlite_rag.models.document import Document -from sqlite_rag.repository import Repository +from sqlite_rag.models.sentence import Sentence from sqlite_rag.settings import Settings @@ -32,8 +31,10 @@ def test_process_uses_get_embedding_text(self, mocker): mock_conn = mocker.Mock() mock_chunker = mocker.Mock() mock_chunker.chunk.return_value = [mock_chunk] + mock_sentence_splitter = mocker.Mock() + mock_sentence_splitter.split.return_value = [] - engine = Engine(mock_conn, settings, mock_chunker) + engine = Engine(mock_conn, settings, mock_chunker, mock_sentence_splitter) # Mock generate_embedding completely mock_generate = mocker.patch.object( @@ -65,8 +66,10 @@ def test_process_with_max_chunks_per_document( settings = Settings(max_chunks_per_document=max_chunks_per_document) mock_chunker = mocker.Mock() mock_chunker.chunk.return_value = chunks + mock_sentence_splitter = mocker.Mock() + mock_sentence_splitter.split.return_value = [] - engine = Engine(mock_conn, settings, mock_chunker) + engine = Engine(mock_conn, settings, mock_chunker, mock_sentence_splitter) mock_generate_embedding = mocker.patch.object(engine, "generate_embedding") mock_generate_embedding = mocker.spy( @@ -84,187 +87,63 @@ def test_process_with_max_chunks_per_document( chunks = call_args[0][0] # First argument assert len(chunks) == expected_chunk_count - -class TestEngineSearch: - def test_search_with_empty_database(self, engine): - results = engine.search("nonexistent query", top_k=5) - - assert len(results) == 0 - - def test_search_with_semantic_and_fts(self, db_conn): - # Arrange - conn, settings = db_conn - - engine = Engine(conn, settings, 
Chunker(conn, settings)) - engine.load_model() - engine.create_new_context() - - doc1 = Document( - content="The quick brown fox jumps over the lazy dog.", - uri="document1.txt", - ) - doc2 = Document( - content="How much wood would a woodchuck chuck if a woodchuck could chuck wood?", - uri="document2.txt", - ) - doc3 = Document( - content="This document discusses about woodcutters and wood.", - uri="document3.txt", - ) - - engine.process(doc1) - engine.process(doc2) - engine.process(doc3) - - repository = Repository(conn, settings) - repository.add_document(doc1) - repository.add_document(doc2) - doc3_id = repository.add_document(doc3) - - engine.quantize() - - # Act - results = engine.search("wood lumberjack", top_k=5) - - assert len(results) > 0 - assert doc3_id == results[0].document.id - - def test_search_semantic_result(self, db_conn): + def test_process_with_sentences(self, mocker): # Arrange - conn, settings = db_conn + chunks = [Chunk(content="Chunk 1"), Chunk(content="Chunk 2")] - engine = Engine(conn, settings, Chunker(conn, settings)) - engine.load_model() - engine.create_new_context() - - doc1 = Document( - content="The quick brown fox jumps over the lazy dog.", - uri="document1.txt", - ) - doc2 = Document( - content="How much wood would a woodchuck chuck if a woodchuck could chuck wood?", - uri="document2.txt", - ) - doc3 = Document( - content="This document discusses about woodcutters and wood.", - uri="document3.txt", - ) - - engine.process(doc1) - engine.process(doc2) - engine.process(doc3) - - repository = Repository(conn, settings) - repository.add_document(doc1) - repository.add_document(doc2) - doc3_id = repository.add_document(doc3) - - engine.quantize() - - # Act - results = engine.search("about lumberjack", top_k=5) - - assert len(results) > 0 - assert doc3_id == results[0].document.id - - def test_search_fts_results(self, db_conn): - # Arrange - conn, settings = db_conn + mock_conn = mocker.Mock() + settings = Settings() + mock_chunker = mocker.Mock() + mock_chunker.chunk.return_value = chunks + mock_sentence_splitter = mocker.Mock() + # return different number of sentences per chunk + mock_sentence_splitter.split.side_effect = [ + [Sentence(content="Sentence 1.1")], + [Sentence(content="Sentence 2.1"), Sentence(content="Sentence 2.2")], + ] - engine = Engine(conn, settings, Chunker(conn, settings)) - engine.load_model() - engine.create_new_context() + engine = Engine(mock_conn, settings, mock_chunker, mock_sentence_splitter) - doc1 = Document( - content="The quick brown fox jumps over the lazy dog.", - uri="document1.txt", - ) - doc2 = Document( - content="How much wood would a woodchuck chuck if a woodchuck could chuck wood?", - uri="document2.txt", - ) - doc3 = Document( - content="This document discusses about woodcutters and wood.", - uri="document3.txt", + mock_generate_embedding = mocker.patch.object(engine, "generate_embedding") + mock_generate_embedding = mocker.spy( + mock_generate_embedding, "generate_embedding" ) + mock_generate_embedding.return_value = chunks - engine.process(doc1) - engine.process(doc2) - engine.process(doc3) - - repository = Repository(conn, settings) - doc1_id = repository.add_document(doc1) - repository.add_document(doc2) - repository.add_document(doc3) - - engine.quantize() + document = Document(content="Test document content") # Act - results = engine.search("quick brown fox", top_k=5) + engine.process(document) - assert len(results) > 0 - assert doc1_id == results[0].document.id - assert results[0].fts_rank - assert 
results[0].fts_rank == 1 - assert results[0].fts_score - assert results[0].fts_score > 0 + # Assert + assert len(document.chunks) == 2 + assert len(document.chunks[0].sentences) == 1 + assert len(document.chunks[1].sentences) == 2 - def test_search_without_quantization(self, db_conn): + def test_process_without_sentences(self, mocker): # Arrange - conn, settings = db_conn - settings.quantize_scan = False - - engine = Engine(conn, settings, Chunker(conn, settings)) - engine.load_model() - - doc = Document( - content="The quick brown fox jumps over the lazy dog.", - uri="document1.txt", - ) - - engine.create_new_context() - engine.process(doc) - - repository = Repository(conn, settings) - doc_id = repository.add_document(doc) + chunks = [Chunk(content="Chunk 1")] - # Act - results = engine.search("wood lumberjack") - - assert len(results) > 0 - assert doc_id == results[0].document.id - - def test_search_exact_match(self, db_conn): - conn, settings = db_conn - # cosin distance for searching embedding is exact 0.0 when strings match - settings.other_vector_options = "distance=cosine" - settings.use_prompt_templates = False + mock_conn = mocker.Mock() + settings = Settings() + mock_chunker = mocker.Mock() + mock_chunker.chunk.return_value = chunks + mock_sentence_splitter = mocker.Mock() + mock_sentence_splitter.split.return_value = [] - engine = Engine(conn, settings, Chunker(conn, settings)) - engine.load_model() - engine.create_new_context() + engine = Engine(mock_conn, settings, mock_chunker, mock_sentence_splitter) - doc1 = Document( - content="The quick brown fox jumps over the lazy dog", - uri="document1.txt", - ) - doc2 = Document( - content="How much wood would a woodchuck chuck if a woodchuck could chuck wood?", - uri="document2.txt", + mock_generate_embedding = mocker.patch.object(engine, "generate_embedding") + mock_generate_embedding = mocker.spy( + mock_generate_embedding, "generate_embedding" ) + mock_generate_embedding.return_value = chunks - engine.process(doc1) - engine.process(doc2) - - repository = Repository(conn, settings) - doc1_id = repository.add_document(doc1) - repository.add_document(doc2) - - engine.quantize() + document = Document(content="Test document content") # Act - results = engine.search("The quick brown fox jumps over the lazy dog") + engine.process(document) - assert len(results) > 0 - assert doc1_id == results[0].document.id - assert 0.0 == results[0].vec_distance + # Assert + assert len(document.chunks) == 1 + assert len(document.chunks[0].sentences) == 0 From 012b3e7695845b64d375abd6300c5ea283855f5e Mon Sep 17 00:00:00 2001 From: Daniele Briggi <=> Date: Mon, 20 Oct 2025 08:35:15 +0000 Subject: [PATCH 4/7] refact(tests): simplified sentences splitter --- src/sqlite_rag/sentence_splitter.py | 51 ++++++++++++++++++++--------- tests/integration/test_engine.py | 8 ++--- tests/test_sentence_splitter.py | 4 --- tests/test_sqlite_rag.py | 16 ++++++--- 4 files changed, 52 insertions(+), 27 deletions(-) diff --git a/src/sqlite_rag/sentence_splitter.py b/src/sqlite_rag/sentence_splitter.py index dc5e82a..a1cee83 100644 --- a/src/sqlite_rag/sentence_splitter.py +++ b/src/sqlite_rag/sentence_splitter.py @@ -10,27 +10,48 @@ class SentenceSplitter: def split(self, chunk: Chunk) -> List[Sentence]: """Split chunk into sentences.""" - sentences = [] - - sentences_text = self._split_into_sentences(chunk.content) - start_offset = 0 - end_offset = 0 - for sentence_text in sentences_text: - start_offset = chunk.content.index(sentence_text, end_offset) - end_offset = 
start_offset + len(sentence_text) + # Split on: sentence endings, semicolons, or paragraph breaks + sentence_regex = re.compile(r'(?<=[.!?;])(?:"|\')?\s+(?=[A-Z])|[\n]{2,}') - sentence = Sentence( - content=sentence_text, - start_offset=start_offset, - end_offset=end_offset, - ) - sentences.append(sentence) + sentences = [] + last_end = 0 + text = chunk.content + + for match in sentence_regex.finditer(text): + segment = text[last_end : match.end()] + + segment = segment.strip() + if len(segment) > self.MIN_CHARS_PER_SENTENCE: + sentences.append( + Sentence( + content=segment, + start_offset=last_end, + end_offset=last_end + len(segment), + ) + ) + + # Position after the current match + last_end = match.end() + + # Last segment + if last_end < len(text): + segment = text[last_end:] + + segment = segment.strip() + if len(segment) > self.MIN_CHARS_PER_SENTENCE: + sentences.append( + Sentence( + content=segment, + start_offset=last_end, + end_offset=last_end + len(segment), + ) + ) return sentences def _split_into_sentences(self, text: str) -> List[str]: """Split into focused segments for semantic matching.""" - # Split on: sentence endings, semicolons, or paragraph breaks + sentence_endings = re.compile(r'(?<=[.!?;])(?:"|\')?\s+(?=[A-Z])|[\n]{2,}') sentences = sentence_endings.split(text) diff --git a/tests/integration/test_engine.py b/tests/integration/test_engine.py index d9ab3ea..a25a9a1 100644 --- a/tests/integration/test_engine.py +++ b/tests/integration/test_engine.py @@ -13,7 +13,7 @@ class TestEngine: @pytest.mark.slow - def test_stress_embedding_generation(self, engine): + def test_stress_embedding_generation(self, engine: Engine): """Test embedding generation with a large number of chunks to not fail and to never generate duplicated embeddings.""" @@ -36,7 +36,7 @@ def random_string(length=30): class TestEngineQuantization: - def test_quantize_embedding(self, engine): + def test_quantize_embedding(self, engine: Engine): """Test quantize called for chunks and sentences embeddings.""" engine.quantize() @@ -54,8 +54,8 @@ def test_quantize_cleanup(self, engine): class TestEngineSearch: - def test_search_with_empty_database(self, engine): - results = engine.search("nonexistent query", top_k=5) + def test_search_with_empty_database(self, engine: Engine): + results = engine.search("nonexistent query", "nonexistent query", top_k=5) assert len(results) == 0 diff --git a/tests/test_sentence_splitter.py b/tests/test_sentence_splitter.py index 09bb151..7030b68 100644 --- a/tests/test_sentence_splitter.py +++ b/tests/test_sentence_splitter.py @@ -20,17 +20,14 @@ def test_split(self): assert len(sentences) == 3 assert sentences[0].content == "This is the first sentence." - assert sentences[0].sequence == 0 assert sentences[0].start_offset == 0 assert sentences[0].end_offset == 27 assert sentences[1].content == "Here is the second sentence!" - assert sentences[1].sequence == 1 assert sentences[1].start_offset == 28 assert sentences[1].end_offset == 28 + 28 assert sentences[2].content == "And what about the third?" 
-        assert sentences[2].sequence == 2
         assert sentences[2].start_offset == 57
         assert sentences[2].end_offset == 57 + 25
 
@@ -66,6 +63,5 @@ def test_split_no_punctuation(self):
 
         assert len(sentences) == 1
         assert sentences[0].content == chunk.content
-        assert sentences[0].sequence == 0
         assert sentences[0].start_offset == 0
         assert sentences[0].end_offset == len(chunk.content)
diff --git a/tests/test_sqlite_rag.py b/tests/test_sqlite_rag.py
index 19fedb7..a04fc71 100644
--- a/tests/test_sqlite_rag.py
+++ b/tests/test_sqlite_rag.py
@@ -838,10 +838,14 @@ def test_search_uses_retrieval_query_template(self, mocker):
         rag.search(query)
 
         # Assert that engine.search was called with the formatted template
-        expected_query = rag._settings.prompt_template_retrieval_query.format(
+        expected_semantic_query = rag._settings.prompt_template_retrieval_query.format(
             content=query
         )
-        mock_engine.search.assert_called_once_with(expected_query, top_k=10)
+        expected_fts_query = query + "*"
+
+        mock_engine.search.assert_called_once_with(
+            expected_semantic_query, expected_fts_query, top_k=10
+        )
 
     @pytest.mark.parametrize("use_prompt_templates", [True, False])
     def test_search_with_prompt_template(self, mocker, use_prompt_templates):
@@ -865,9 +869,13 @@ def test_search_with_prompt_template(self, mocker, use_prompt_templates):
         rag.search("test query", new_context=False)
 
         # Assert - verify engine.search was called with correct formatted query
-        expected_query = (
+        expected_semantic_query = (
             "task: search result | query: test query"
             if use_prompt_templates
             else "test query"
         )
-        mock_engine.search.assert_called_once_with(expected_query, top_k=10)
+        expected_fts_query = "test query*"
+
+        mock_engine.search.assert_called_once_with(
+            expected_semantic_query, expected_fts_query, top_k=10
+        )

From db93d7c77d3dc430320c075dcb22a321a58a4033 Mon Sep 17 00:00:00 2001
From: Daniele Briggi <=>
Date: Mon, 20 Oct 2025 14:03:37 +0000
Subject: [PATCH 5/7] refact(search): embed query once per search

---
 src/sqlite_rag/database.py               |   1 -
 src/sqlite_rag/engine.py                 |  45 +++-
 src/sqlite_rag/formatters.py             | 102 +--------
 src/sqlite_rag/models/document_result.py |  66 +++++-
 src/sqlite_rag/sentence_splitter.py      |  11 -
 src/sqlite_rag/sqliterag.py              |  21 +-
 tests/integration/test_engine.py         |  66 ++++--
 tests/models/test_document_result.py     | 258 +++++++++++++++++++++++
 tests/test_engine.py                     | 112 ++++++++++
 tests/test_sqlite_rag.py                 |  59 ------
 10 files changed, 532 insertions(+), 209 deletions(-)
 create mode 100644 tests/models/test_document_result.py

diff --git a/src/sqlite_rag/database.py b/src/sqlite_rag/database.py
index b5ed743..c0a52d7 100644
--- a/src/sqlite_rag/database.py
+++ b/src/sqlite_rag/database.py
@@ -88,7 +88,6 @@ def _create_schema(conn: sqlite3.Connection, settings: Settings):
         """
     )
 
-    # TODO: remove sequence
     cursor.execute(
         """
         CREATE TABLE IF NOT EXISTS sentences (
diff --git a/src/sqlite_rag/engine.py b/src/sqlite_rag/engine.py
index f8ce76e..1651de2 100644
--- a/src/sqlite_rag/engine.py
+++ b/src/sqlite_rag/engine.py
@@ -1,4 +1,5 @@
 import json
+import re
 import sqlite3
 from pathlib import Path
 from typing import List
@@ -123,11 +124,38 @@ def free_context(self) -> None:
 
         cursor.execute("SELECT llm_context_free();")
 
-    def search(
-        self, semantic_query: str, fts_query, top_k: int = 10
+    def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
+        """Semantic search and full-text search sorted with Reciprocal Rank Fusion,
+        with the top matching sentences attached for highlighting."""
+        semantic_query = query
+        if
self._settings.use_prompt_templates: + semantic_query = self._settings.prompt_template_retrieval_query.format( + content=query + ) + + # Clean up and split into words + # '*' is used to match while typing + fts_query = " ".join(re.findall(r"\b\w+\b", query.lower())) + "*" + + query_embedding = self.generate_embedding(semantic_query) + + results = self.search_documents(query_embedding, fts_query, top_k=top_k) + + # Refine chunks with top sentences + for result in results: + result.sentences = self.search_sentences( + query_embedding, result.chunk_id, top_k=self._settings.top_k_sentences + ) + + return results + + def search_documents( + self, query_embedding: bytes, fts_query: str, top_k: int ) -> list[DocumentResult]: """Semantic search and full-text search sorted with Reciprocal Rank Fusion.""" - query_embedding = self.generate_embedding(semantic_query) + # invalid query + if query_embedding == b"" or fts_query.strip() == "": + return [] vector_scan_type = ( "vector_quantize_scan" @@ -180,7 +208,7 @@ def search( documents.content as document_content, documents.metadata, chunks.id AS chunk_id, - chunks.content AS snippet, + chunks.content AS chunk_content, vec_rank, fts_rank, combined_rank, @@ -212,7 +240,7 @@ def search( metadata=json.loads(row["metadata"]) if row["metadata"] else {}, ), chunk_id=row["chunk_id"], - snippet=row["snippet"], + chunk_content=row["chunk_content"], vec_rank=row["vec_rank"], fts_rank=row["fts_rank"], combined_rank=row["combined_rank"], @@ -225,10 +253,9 @@ def search( return results def search_sentences( - self, query: str, chunk_id: int, top_k: int + self, query_embedding: bytes, chunk_id: int, top_k: int ) -> List[SentenceResult]: - query_embedding = self.generate_embedding(query) - + """Semantic search for sentences within a chunk.""" vector_scan_type = ( "vector_quantize_scan_stream" if self._settings.quantize_scan @@ -244,7 +271,6 @@ def search_sentences( v.rowid AS sentence_id, row_number() OVER (ORDER BY v.distance) AS rank_number, v.distance, - sentences.content as sentence_content, sentences.start_offset as sentence_start_offset, sentences.end_offset as sentence_end_offset FROM {vector_scan_type}('sentences', 'embedding', :query_embedding) AS v @@ -255,7 +281,6 @@ def search_sentences( ) SELECT sentence_id, - sentence_content, sentence_start_offset, sentence_end_offset, rank_number, diff --git a/src/sqlite_rag/formatters.py b/src/sqlite_rag/formatters.py index ca3b0df..ff3f0d8 100644 --- a/src/sqlite_rag/formatters.py +++ b/src/sqlite_rag/formatters.py @@ -6,8 +6,6 @@ import typer -from sqlite_rag.models.sentence_result import SentenceResult - from .models.document_result import DocumentResult @@ -82,81 +80,6 @@ def _format_uri_display(self, uri: str, icon: str, max_width: int = 75) -> str: uri_display = f"{icon} ...{uri[-available_width:]}" return uri_display - def _build_sentence_preview( - self, - chunk_content: str, - sentences: List[SentenceResult], - max_chars: int = 400, - ) -> str: - """Build preview from top 3 ranked sentences with [...] for gaps. - - Args: - chunk_content: The full chunk text - sentences: List of SentenceResult objects (should already be sorted by rank) - max_chars: Maximum total characters for preview - - Returns: - Preview string with top sentences and [...] separators. - Falls back to truncated chunk_content if sentences have no offsets. 
- """ - - # Take top 3 sentences (they should already be sorted by rank/distance) - top_sentences = sentences[:3] if sentences else [] - - if not top_sentences: - # Fallback: no sentences, return truncated chunk content - return chunk_content[:max_chars] - - # Filter sentences that have offset information - sentences_with_offsets = [ - s - for s in top_sentences - if s.start_offset is not None and s.end_offset is not None - ] - - if not sentences_with_offsets: - # Fallback: sentences exist but no offset information, return truncated chunk content - return chunk_content[:max_chars] - - # Sort by start_offset to maintain document order - sentences_with_offsets.sort( - key=lambda s: s.start_offset if s.start_offset is not None else -1 - ) - - preview_parts = [] - total_chars = 0 - prev_end_offset = None - - for sentence in sentences_with_offsets: - # Extract sentence text using offsets - sentence_text = chunk_content[ - sentence.start_offset : sentence.end_offset - ].strip() - - # Calculate remaining budget including potential separator - separator_len = len(" [...] ") if preview_parts else 0 - remaining = max_chars - total_chars - separator_len - - if remaining <= 0: - break - - # Truncate sentence if needed - if len(sentence_text) > remaining: - sentence_text = sentence_text[: remaining - 3] + "..." - - # Check if there's a gap > 10 chars from previous sentence - if prev_end_offset is not None and sentence.start_offset is not None: - gap_size = sentence.start_offset - prev_end_offset - if gap_size > 10: - preview_parts.append("[...]") - total_chars += len(" [...] ") - - preview_parts.append(sentence_text) - total_chars += len(sentence_text) - prev_end_offset = sentence.end_offset - - return " ".join(preview_parts) - class BoxedFormatter(SearchResultFormatter): """Base class for boxed result formatters.""" @@ -176,11 +99,8 @@ def _format_single_result(self, doc: DocumentResult, idx: int) -> None: """Format a single result with box layout.""" icon = self._get_file_icon(doc.document.uri or "") - # Use sentence-based preview if sentences are available - if doc.sentences: - snippet_text = self._build_sentence_preview(doc.snippet, doc.sentences) - else: - snippet_text = doc.snippet + # Get snippet from DocumentResult (handles sentence-based preview automatically) + snippet_text = doc.get_preview(max_chars=400) snippet_lines = self._clean_and_wrap_snippet( snippet_text, width=75, max_length=400 @@ -250,11 +170,8 @@ def _format_single_result(self, doc: DocumentResult, idx: int) -> None: """Format a single result with box layout including sentence summary.""" icon = self._get_file_icon(doc.document.uri or "") - # Use sentence-based preview if sentences are available - if doc.sentences: - snippet_text = self._build_sentence_preview(doc.snippet, doc.sentences) - else: - snippet_text = doc.snippet + # Get snippet from DocumentResult (handles sentence-based preview automatically) + snippet_text = doc.get_preview(max_chars=400) snippet_lines = self._clean_and_wrap_snippet( snippet_text, width=75, max_length=400 @@ -305,7 +222,7 @@ def _format_single_result(self, doc: DocumentResult, idx: int) -> None: sentence.start_offset is not None and sentence.end_offset is not None ): - sentence_text = doc.snippet[ + sentence_text = doc.chunk_content[ sentence.start_offset : sentence.end_offset ].strip() # Truncate and clean for display @@ -364,13 +281,8 @@ def _print_table_header(self) -> None: def _print_table_row(self, idx: int, doc: DocumentResult) -> None: """Print a single table row.""" - # Use sentence-based 
preview if sentences are available
-        if doc.sentences:
-            snippet = self._build_sentence_preview(
-                doc.snippet, doc.sentences, max_chars=52
-            )
-        else:
-            snippet = doc.snippet
+        # Get snippet from DocumentResult (handles sentence-based preview automatically)
+        snippet = doc.get_preview(max_chars=52)
 
         # Clean snippet display
         snippet = snippet.replace("\n", " ").replace("\r", "")
diff --git a/src/sqlite_rag/models/document_result.py b/src/sqlite_rag/models/document_result.py
index 86a6f9a..0776592 100644
--- a/src/sqlite_rag/models/document_result.py
+++ b/src/sqlite_rag/models/document_result.py
@@ -9,7 +9,7 @@ class DocumentResult:
 
     document: Document
     chunk_id: int
-    snippet: str
+    chunk_content: str
 
     combined_rank: float
     vec_rank: float | None = None
@@ -20,3 +20,68 @@
 
     # highlight sentences
     sentences: list[SentenceResult] = field(default_factory=list)
+
+    def get_preview(
+        self, top_k_sentences: int = 3, max_chars: int = 400, gap: str = "[...]"
+    ) -> str:
+        """Build preview from top ranked sentences with [...] for gaps.
+
+        Args:
+            top_k_sentences: Number of top sentences to include in preview
+            max_chars: Maximum total characters for preview
+            gap: Separator inserted where skipped content lies between sentences
+
+        Returns:
+            Preview string with top sentences and [...] separators.
+            Falls back to truncated chunk_content if sentences have no offsets.
+        """
+        top_sentences = self.sentences[:top_k_sentences] if self.sentences else []
+
+        if not top_sentences:
+            # Fallback: no sentences, return truncated chunk content
+            return self.chunk_content[:max_chars]
+
+        # Filter sentences that have offset information
+        sentences_with_offsets = [
+            s
+            for s in top_sentences
+            if s.start_offset is not None and s.end_offset is not None
+        ]
+
+        if not sentences_with_offsets:
+            return self.chunk_content[:max_chars]
+
+        # Sort by start_offset to maintain document order
+        sentences_with_offsets.sort(
+            key=lambda s: s.start_offset if s.start_offset is not None else -1
+        )
+
+        preview_parts = []
+        total_chars = 0
+        prev_end_offset = None
+
+        for sentence in sentences_with_offsets:
+            sentence_text = self.chunk_content[
+                sentence.start_offset : sentence.end_offset
+            ].strip()
+
+            # Calculate remaining budget including potential separator
+            separator_len = len("[...] ") if preview_parts else 0
+            remaining = max_chars - total_chars - separator_len
+
+            if remaining <= 0:
+                break
+
+            if prev_end_offset is not None and sentence.start_offset is not None:
+                gap_size = sentence.start_offset - prev_end_offset
+                if gap_size > 10:
+                    preview_parts.append(gap)
+                    total_chars += len(gap)
+
+            preview_parts.append(sentence_text)
+            total_chars += len(sentence_text)
+            prev_end_offset = sentence.end_offset
+
+        preview = " ".join(preview_parts)
+
+        return preview[: max_chars - 3] + "..."
if len(preview) > max_chars else preview diff --git a/src/sqlite_rag/sentence_splitter.py b/src/sqlite_rag/sentence_splitter.py index a1cee83..c177504 100644 --- a/src/sqlite_rag/sentence_splitter.py +++ b/src/sqlite_rag/sentence_splitter.py @@ -48,14 +48,3 @@ def split(self, chunk: Chunk) -> List[Sentence]: ) return sentences - - def _split_into_sentences(self, text: str) -> List[str]: - """Split into focused segments for semantic matching.""" - - sentence_endings = re.compile(r'(?<=[.!?;])(?:"|\')?\s+(?=[A-Z])|[\n]{2,}') - sentences = sentence_endings.split(text) - - # Keep segments that are substantial enough (20+ chars for meaningful matching) - return [ - s.strip() for s in sentences if len(s.strip()) > self.MIN_CHARS_PER_SENTENCE - ] diff --git a/src/sqlite_rag/sqliterag.py b/src/sqlite_rag/sqliterag.py index da2078b..6aa1037 100644 --- a/src/sqlite_rag/sqliterag.py +++ b/src/sqlite_rag/sqliterag.py @@ -1,4 +1,3 @@ -import re import sqlite3 from dataclasses import asdict from pathlib import Path @@ -317,25 +316,7 @@ def search( if new_context: self._engine.create_new_context() - semantic_query = query - if self._settings.use_prompt_templates: - semantic_query = self._settings.prompt_template_retrieval_query.format( - content=query - ) - - # Clean up and split into words - # '*' is used to match while typing - fts_query = " ".join(re.findall(r"\b\w+\b", query.lower())) + "*" - - results = self._engine.search(semantic_query, fts_query, top_k=top_k) - - # Refine chunks with top sentences - for result in results: - result.sentences = self._engine.search_sentences( - semantic_query, result.chunk_id, top_k=self._settings.top_k_sentences - ) - - return results + return self._engine.search(query, top_k=top_k) def get_settings(self) -> dict: """Get settings and more useful information""" diff --git a/tests/integration/test_engine.py b/tests/integration/test_engine.py index a25a9a1..d289198 100644 --- a/tests/integration/test_engine.py +++ b/tests/integration/test_engine.py @@ -41,21 +41,54 @@ def test_quantize_embedding(self, engine: Engine): engine.quantize() # If no exception is raised, the test passes - engine.search("test query", "test query") + engine.search("test query") - def test_quantize_cleanup(self, engine): + def test_quantize_cleanup(self, engine: Engine): """Test quantize cleanup works without errors.""" engine.quantize() engine.quantize_cleanup() with pytest.raises(OperationalError) as exc_info: - engine.search("test query", "test query") + engine.search("test query") assert "Ensure that vector_quantize() has been called" in str(exc_info.value) class TestEngineSearch: + def test_search(self, engine: Engine): + # Arrange + doc1 = Document( + content="The quick brown fox jumps over the lazy dog.", + uri="document1.txt", + ) + doc2 = Document( + content="How much wood would a woodchuck chuck if a woodchuck could chuck wood?", + uri="document2.txt", + ) + + engine.create_new_context() + engine.process(doc1) + engine.process(doc2) + + repository = Repository(engine._conn, engine._settings) + repository.add_document(doc1) + repository.add_document(doc2) + + # Act + results = engine.search("quick brown fox") + + # Assert + assert len(results) > 0 + assert results[0].document.uri == "document1.txt" + + +class TestEngineSearchDocuments: def test_search_with_empty_database(self, engine: Engine): - results = engine.search("nonexistent query", "nonexistent query", top_k=5) + results = engine.search_documents(b"132456", "myquery", top_k=5) + + assert len(results) == 0 + + def 
test_search_with_invalid_query(self, engine: Engine): + results = engine.search_documents(b"", "", top_k=5) assert len(results) == 0 @@ -89,10 +122,11 @@ def test_search_with_semantic_and_fts(self, db_conn): repository.add_document(doc2) doc3_id = repository.add_document(doc3) + embedding = engine.generate_embedding("about lumberjack") engine.quantize() # Act - results = engine.search("wood lumberjack", "wood lumberjack", top_k=5) + results = engine.search_documents(embedding, "about lumberjack", top_k=5) assert len(results) > 0 assert doc3_id == results[0].document.id @@ -127,10 +161,11 @@ def test_search_semantic_result(self, db_conn): repository.add_document(doc2) doc3_id = repository.add_document(doc3) + embedding = engine.generate_embedding("about lumberjack") engine.quantize() # Act - results = engine.search("about lumberjack", "about lumberjack", top_k=5) + results = engine.search_documents(embedding, "about lumberjack", top_k=5) assert len(results) > 0 assert doc3_id == results[0].document.id @@ -165,10 +200,11 @@ def test_search_fts_results(self, db_conn): repository.add_document(doc2) repository.add_document(doc3) + embedding = engine.generate_embedding("quick brown fox") engine.quantize() # Act - results = engine.search("quick brown fox", "quick brown fox", top_k=5) + results = engine.search_documents(embedding, "quick brown fox", top_k=5) assert len(results) > 0 assert doc1_id == results[0].document.id @@ -195,8 +231,10 @@ def test_search_without_quantization(self, db_conn): repository = Repository(conn, settings) doc_id = repository.add_document(doc) + embedding = engine.generate_embedding("wood lumberjack") + # Act - results = engine.search("wood lumberjack", "wood lumberjack") + results = engine.search_documents(embedding, "wood lumberjack", top_k=5) assert len(results) > 0 assert doc_id == results[0].document.id @@ -227,12 +265,14 @@ def test_search_exact_match(self, db_conn): doc1_id = repository.add_document(doc1) repository.add_document(doc2) + embedding = engine.generate_embedding( + "The quick brown fox jumps over the lazy dog" + ) engine.quantize() # Act - results = engine.search( - "The quick brown fox jumps over the lazy dog", - "The quick brown fox jumps over the lazy dog", + results = engine.search_documents( + embedding, "The quick brown fox jumps over the lazy dog", top_k=5 ) assert len(results) > 0 @@ -268,9 +308,11 @@ def test_search_sentences(self, db_conn): cursor = conn.execute("SELECT id FROM chunks WHERE document_id = ?", (doc_id,)) chunk_id = cursor.fetchone()[0] + embedding = engine.generate_embedding("stitch time") + # Act results = engine.search_sentences( - "stitch time", + embedding, chunk_id, top_k=1, ) diff --git a/tests/models/test_document_result.py b/tests/models/test_document_result.py new file mode 100644 index 0000000..7fb80fc --- /dev/null +++ b/tests/models/test_document_result.py @@ -0,0 +1,258 @@ +from sqlite_rag.models.document import Document +from sqlite_rag.models.document_result import DocumentResult +from sqlite_rag.models.sentence_result import SentenceResult + + +class TestDocumentResult: + def test_get_preview_no_sentences(self): + doc = Document(uri="test.txt", content="test content") + chunk_content = "This is a long piece of text. 
" * 50 + result = DocumentResult( + document=doc, + chunk_id=1, + chunk_content=chunk_content, + combined_rank=1.0, + sentences=[], + ) + + preview = result.get_preview(max_chars=100) + assert len(preview) == 100 + assert preview == chunk_content[:100] + + def test_get_preview_with_sentences_no_offsets(self): + doc = Document(uri="test.txt", content="test content") + chunk_content = "First sentence. Second sentence. Third sentence." + + # Sentences without offset information + sentences = [ + SentenceResult(chunk_id=1, id=1, rank=1, distance=0.1), + SentenceResult(chunk_id=1, id=2, rank=2, distance=0.2), + ] + + result = DocumentResult( + document=doc, + chunk_id=1, + chunk_content=chunk_content, + combined_rank=1.0, + sentences=sentences, + ) + + preview = result.get_preview(max_chars=100) + assert preview == chunk_content[:100] + + def test_get_preview_with_single_sentence(self): + doc = Document(uri="test.txt", content="test content") + chunk_content = ( + "First sentence here. Second sentence there. Third sentence everywhere." + ) + + sentences = [ + SentenceResult( + chunk_id=1, + id=2, + rank=1, + distance=0.1, + start_offset=21, + end_offset=44, + ), + ] + + result = DocumentResult( + document=doc, + chunk_id=1, + chunk_content=chunk_content, + combined_rank=1.0, + sentences=sentences, + ) + + preview = result.get_preview(max_chars=400) + assert preview == "Second sentence there." + + def test_get_preview_with_gaps(self): + """Test get_preview adds [...] separator for gaps.""" + doc = Document(uri="test.txt", content="test content") + chunk_content = ( + "First sentence at the beginning." + "Some middle content that we skip over here." + "Last sentence at the end." + ) + + sentences = [ + SentenceResult( + chunk_id=1, + id=1, + rank=1, + distance=0.1, + start_offset=0, + end_offset=32, # "First sentence at the beginning." + ), + SentenceResult( + chunk_id=1, + id=3, + rank=2, + distance=0.2, + start_offset=75, + end_offset=103, # "Last sentence at the end." + ), + ] + + result = DocumentResult( + document=doc, + chunk_id=1, + chunk_content=chunk_content, + combined_rank=1.0, + sentences=sentences, + ) + + preview = result.get_preview(max_chars=400) + assert ( + "First sentence at the beginning. [...] Last sentence at the end." + == preview + ) + + def test_get_preview_respects_max_chars(self): + """Test get_preview truncates when exceeding max_chars.""" + doc = Document(uri="test.txt", content="test content") + chunk_content = ( + "A very long sentence that exceeds the maximum character limit. " * 10 + ) + + sentences = [ + SentenceResult( + chunk_id=1, + id=1, + rank=1, + distance=0.1, + start_offset=0, + end_offset=200, + ), + ] + + result = DocumentResult( + document=doc, + chunk_id=1, + chunk_content=chunk_content, + combined_rank=1.0, + sentences=sentences, + ) + + preview = result.get_preview(max_chars=50) + assert len(preview) <= 50 + + def test_get_preview_with_multiple_consecutive_and_ordered_sentences(self): + doc = Document(uri="test.txt", content="test content") + chunk_content = "First sentence. Second sentence. Third sentence." 
+
+        sentences = [
+            SentenceResult(
+                chunk_id=1,
+                id=1,
+                rank=1,
+                distance=0.1,
+                start_offset=0,
+                end_offset=15,
+            ),
+            SentenceResult(
+                chunk_id=1,
+                id=2,
+                rank=2,
+                distance=0.2,
+                start_offset=16,
+                end_offset=32,
+            ),
+        ]
+
+        result = DocumentResult(
+            document=doc,
+            chunk_id=1,
+            chunk_content=chunk_content,
+            combined_rank=1.0,
+            sentences=sentences,
+        )
+
+        preview = result.get_preview(max_chars=400)
+        assert preview == "First sentence. Second sentence."
+
+    def test_get_preview_orders_sentences_by_offset(self):
+        """Test get_preview reorders sentences by start_offset (document order)."""
+        doc = Document(uri="test.txt", content="test content")
+        chunk_content = "First sentence. " + "x" * 50 + " Third sentence."
+
+        # Sentences in reverse rank order (rank 1 is last in document)
+        sentences = [
+            SentenceResult(
+                chunk_id=1,
+                id=3,
+                rank=1,  # higher rank but appears later in the document
+                distance=0.1,
+                start_offset=66,
+                end_offset=82,  # "Third sentence."
+            ),
+            SentenceResult(
+                chunk_id=1,
+                id=1,
+                rank=2,
+                distance=0.2,
+                start_offset=0,
+                end_offset=15,  # "First sentence."
+            ),
+        ]
+
+        result = DocumentResult(
+            document=doc,
+            chunk_id=1,
+            chunk_content=chunk_content,
+            combined_rank=1.0,
+            sentences=sentences,
+        )
+
+        preview = result.get_preview(max_chars=400)
+        # Should be in document order despite rank order
+        assert "First sentence. [...] Third sentence." == preview
+
+    def test_get_preview_limits_to_top_k_sentences(self):
+        """Test get_preview respects top_k_sentences parameter."""
+        doc = Document(uri="test.txt", content="test content")
+        chunk_content = "First. Second. Third. Fourth. Fifth."
+
+        # 5 sentences, but only top 2 should be used
+        sentences = [
+            SentenceResult(
+                chunk_id=1, id=1, rank=1, distance=0.1, start_offset=0, end_offset=6
+            ),
+            SentenceResult(
+                chunk_id=1, id=2, rank=2, distance=0.2, start_offset=7, end_offset=14
+            ),
+            SentenceResult(
+                chunk_id=1, id=3, rank=3, distance=0.3, start_offset=15, end_offset=21
+            ),
+            SentenceResult(
+                chunk_id=1, id=4, rank=4, distance=0.4, start_offset=22, end_offset=29
+            ),
+            SentenceResult(
+                chunk_id=1, id=5, rank=5, distance=0.5, start_offset=30, end_offset=36
+            ),
+        ]
+
+        result = DocumentResult(
+            document=doc,
+            chunk_id=1,
+            chunk_content=chunk_content,
+            combined_rank=1.0,
+            sentences=sentences,
+        )
+
+        preview = result.get_preview(top_k_sentences=2, max_chars=400)
+        assert "First." in preview
+        assert "Second." in preview
+        assert "Third" not in preview
+        assert "Fourth" not in preview
+        assert "Fifth" not in preview
+
+        # Test with default top_k=3
+        preview_default = result.get_preview(max_chars=400)
+        assert "First." in preview_default
+        assert "Second." in preview_default
+        assert "Third."
in preview_default
+        assert "Fourth" not in preview_default
+        assert "Fifth" not in preview_default
diff --git a/tests/test_engine.py b/tests/test_engine.py
index 7f7ead6..1cdd3f3 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -147,3 +147,115 @@ def test_process_without_sentences(self, mocker):
         # Assert
         assert len(document.chunks) == 1
         assert len(document.chunks[0].sentences) == 0
+
+    def test_search(self, mocker):
+        # Arrange
+        mock_conn = mocker.Mock()
+        settings = Settings()
+        engine = Engine(mock_conn, settings, mocker.Mock(), mocker.Mock())
+
+        mock_generate = mocker.patch.object(
+            engine, "generate_embedding", return_value=b"embedding"
+        )
+        mock_search_docs = mocker.patch.object(
+            engine,
+            "search_documents",
+            return_value=[
+                mocker.Mock(chunk_id=1, sentences=[]),
+                mocker.Mock(chunk_id=2, sentences=[]),
+            ],
+        )
+        mock_search_sents = mocker.patch.object(
+            engine, "search_sentences", return_value=[]
+        )
+
+        # Act
+        engine.search("test query", top_k=5)
+
+        # Assert
+        mock_generate.assert_called_once_with('title: "none" | text: test query')
+        mock_search_docs.assert_called_once_with(b"embedding", "test query*", top_k=5)
+        assert mock_search_sents.call_count == 2
+        mock_search_sents.assert_any_call(
+            b"embedding", 1, top_k=settings.top_k_sentences
+        )
+        mock_search_sents.assert_any_call(
+            b"embedding", 2, top_k=settings.top_k_sentences
+        )
+
+    def test_search_uses_retrieval_query_template(self, mocker):
+        # Arrange
+        template = "task: search | Do something with {content}"
+
+        settings = Settings(prompt_template_retrieval_query=template)
+
+        mock_conn = mocker.Mock()
+        engine = Engine(mock_conn, settings, mocker.Mock(), mocker.Mock())
+
+        mock_generate = mocker.patch.object(
+            engine, "generate_embedding", return_value=b"embedding"
+        )
+        mock_search_docs = mocker.patch.object(
+            engine,
+            "search_documents",
+            return_value=[
+                mocker.Mock(chunk_id=1, sentences=[]),
+            ],
+        )
+        mock_search_sents = mocker.patch.object(
+            engine, "search_sentences", return_value=[]
+        )
+
+        # Act
+        query = "test query"
+        engine.search(query, top_k=10)
+
+        expected_fts_query = query + "*"
+
+        # Assert
+        # generate_embedding is called with the formatted template
+        mock_generate.assert_called_once_with(
+            "task: search | Do something with test query"
+        )
+        mock_search_docs.assert_called_once_with(
+            b"embedding", expected_fts_query, top_k=10
+        )
+        mock_search_sents.assert_called_once_with(
+            b"embedding", 1, top_k=settings.top_k_sentences
+        )
+
+    @pytest.mark.parametrize("use_prompt_templates", [True, False])
+    def test_search_with_prompt_template(self, mocker, use_prompt_templates):
+        # Arrange
+        settings = Settings(
+            use_prompt_templates=use_prompt_templates,
+            prompt_template_retrieval_query="task: search result | query: {content}",
+        )
+
+        mock_conn = mocker.Mock()
+        engine = Engine(mock_conn, settings, mocker.Mock(), mocker.Mock())
+
+        mock_generate_embedding = mocker.patch.object(
+            engine, "generate_embedding", return_value=b"embedding"
+        )
+        mocker.patch.object(
+            engine,
+            "search_documents",
+            return_value=[
+                mocker.Mock(chunk_id=1, sentences=[]),
+            ],
+        )
+        mocker.patch.object(engine, "search_sentences", return_value=[])
+
+        # Act
+        query = "test query"
+        engine.search(query)
+
+        # Assert - verify generate_embedding was called with the formatted query
+        expected_semantic_query = (
+            "task: search result | query: test query"
+            if use_prompt_templates
+            else "test query"
+        )
+
+        mock_generate_embedding.assert_called_once_with(expected_semantic_query)
diff --git a/tests/test_sqlite_rag.py
b/tests/test_sqlite_rag.py index a04fc71..3cdddc0 100644 --- a/tests/test_sqlite_rag.py +++ b/tests/test_sqlite_rag.py @@ -6,7 +6,6 @@ import pytest from sqlite_rag import SQLiteRag -from sqlite_rag.settings import Settings class TestSQLiteRagAdd: @@ -821,61 +820,3 @@ def test_search_samples_exact_match_by_scan_type(self, quantize_scan: bool): # Second result should have distance > 0 second_result = results[1] assert second_result.vec_distance and second_result.vec_distance > 0.0 - - def test_search_uses_retrieval_query_template(self, mocker): - template = "task: search | Do something with {content}" - - settings = {"prompt_template_retrieval_query": template} - - rag = SQLiteRag.create(":memory:", settings=settings) - - mock_engine = mocker.Mock() - mock_engine.search.return_value = [] - - rag._engine = mock_engine - - query = "test query" - rag.search(query) - - # Assert that engine.search was called with the formatted template - expected_semantic_query = rag._settings.prompt_template_retrieval_query.format( - content=query - ) - expected_fts_query = query + "*" - - mock_engine.search.assert_called_once_with( - expected_semantic_query, expected_fts_query, top_k=10 - ) - - @pytest.mark.parametrize("use_prompt_templates", [True, False]) - def test_search_with_prompt_template(self, mocker, use_prompt_templates): - # Arrange - settings = Settings( - use_prompt_templates=use_prompt_templates, - prompt_template_retrieval_query="task: search result | query: {content}", - ) - - # Mock engine and its search method - mock_engine = mocker.Mock() - mock_engine.search.return_value = [] # Empty search results - - # Create SQLiteRag instance with mocked dependencies - rag = SQLiteRag(mocker.Mock(), settings) - rag._engine = mock_engine - - mocker.patch.object(rag, "_ensure_initialized") - - # Act - rag.search("test query", new_context=False) - - # Assert - verify engine.search was called with correct formatted query - expected_semantic_query = ( - "task: search result | query: test query" - if use_prompt_templates - else "test query" - ) - expected_fts_query = "test query*" - - mock_engine.search.assert_called_once_with( - expected_semantic_query, expected_fts_query, top_k=10 - ) From 6f2aa0adefe5271b5dce4c8462ee51243e2fab6f Mon Sep 17 00:00:00 2001 From: Daniele Briggi <=> Date: Mon, 20 Oct 2025 16:37:31 +0000 Subject: [PATCH 6/7] feat(sentences): extract sentence content from sql. 
Avoid fetching the entire chunk to extract the content
---
 src/sqlite_rag/database.py               |  11 +-
 src/sqlite_rag/engine.py                 |  18 +-
 src/sqlite_rag/formatters.py             | 206 ++++++----------
 src/sqlite_rag/models/document_result.py |  28 +--
 src/sqlite_rag/models/sentence_result.py |   2 +
 tests/models/test_document_result.py     | 107 ++++----
 tests/test_formatters.py                 | 299 +++++++++++++++++++++++
 7 files changed, 456 insertions(+), 215 deletions(-)
 create mode 100644 tests/test_formatters.py

diff --git a/src/sqlite_rag/database.py b/src/sqlite_rag/database.py
index c0a52d7..f15450f 100644
--- a/src/sqlite_rag/database.py
+++ b/src/sqlite_rag/database.py
@@ -76,19 +76,19 @@ def _create_schema(conn: sqlite3.Connection, settings: Settings):
     )
 
     # TODO: this table is not ready for sqlite-sync, it uses the id AUTOINCREMENT
-    cursor.execute(
+    cursor.executescript(
         """
         CREATE TABLE IF NOT EXISTS chunks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            document_id TEXT,
            content TEXT,
-            embedding BLOB,
-            FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE
+            embedding BLOB
        );
+        CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks (document_id);
        """
    )
 
-    cursor.execute(
+    cursor.executescript(
        """
        CREATE TABLE IF NOT EXISTS sentences (
            id TEXT PRIMARY KEY,
@@ -97,7 +97,8 @@ def _create_schema(conn: sqlite3.Connection, settings: Settings):
            embedding BLOB,
            start_offset INTEGER,
            end_offset INTEGER
-        )
+        );
+        CREATE INDEX IF NOT EXISTS idx_sentences_chunk_id ON sentences (chunk_id);
        """
    )
 
diff --git a/src/sqlite_rag/engine.py b/src/sqlite_rag/engine.py
index 1651de2..1496437 100644
--- a/src/sqlite_rag/engine.py
+++ b/src/sqlite_rag/engine.py
@@ -270,22 +270,27 @@ def search_sentences(
             SELECT
                 v.rowid AS sentence_id,
                 row_number() OVER (ORDER BY v.distance) AS rank_number,
-                v.distance,
-                sentences.start_offset as sentence_start_offset,
-                sentences.end_offset as sentence_end_offset
+                v.distance
             FROM {vector_scan_type}('sentences', 'embedding', :query_embedding) AS v
             JOIN sentences ON sentences.rowid = v.rowid
             WHERE sentences.chunk_id = :chunk_id
-            ORDER BY rank_number ASC
             LIMIT :top_k
         )
         SELECT
             sentence_id,
-            sentence_start_offset,
-            sentence_end_offset,
+            -- Extract the sentence text directly from the chunk content
+            COALESCE(
+                substr(chunks.content, sentences.start_offset + 1, sentences.end_offset - sentences.start_offset),
+                ''
+            ) AS content,
+            sentences.start_offset AS sentence_start_offset,
+            sentences.end_offset AS sentence_end_offset,
             rank_number,
             distance
         FROM vec_matches
+        JOIN sentences ON sentences.rowid = vec_matches.sentence_id
+        JOIN chunks ON chunks.id = sentences.chunk_id
+        ORDER BY rank_number ASC
         """,  # nosec B608
         {
             "query_embedding": query_embedding,
@@ -301,6 +306,7 @@ def search_sentences(
             SentenceResult(
                 id=row["sentence_id"],
                 chunk_id=chunk_id,
+                content=row["content"].strip(),
                 rank=row["rank_number"],
                 distance=row["distance"],
                 start_offset=row["sentence_start_offset"],
diff --git a/src/sqlite_rag/formatters.py b/src/sqlite_rag/formatters.py
index ff3f0d8..66a7ae9 100644
--- a/src/sqlite_rag/formatters.py
+++ b/src/sqlite_rag/formatters.py
@@ -2,12 +2,19 @@
 """Output formatters for CLI search results."""
 
 from abc import ABC, abstractmethod
-from typing import List, Optional
+from typing import List
 
 import typer
 
 from .models.document_result import DocumentResult
 
+# Display constants
+BOX_CONTENT_WIDTH = 75
+BOX_TOTAL_WIDTH = 77
+SNIPPET_MAX_LENGTH = 400
+SENTENCE_PREVIEW_LENGTH = 50
+MAX_SENTENCES_DISPLAY = 5
+
 
 class SearchResultFormatter(ABC):
     """Base class for search result formatters."""
 
@@
-40,7 +47,10 @@ def _get_file_icon(self, uri: str) -> str: return "📄" def _clean_and_wrap_snippet( - self, snippet: str, width: int = 75, max_length: int = 400 + self, + snippet: str, + width: int = BOX_CONTENT_WIDTH, + max_length: int = SNIPPET_MAX_LENGTH, ) -> List[str]: """Clean snippet and wrap to specified width with max length limit.""" # Clean the snippet @@ -69,7 +79,9 @@ def _clean_and_wrap_snippet( return lines - def _format_uri_display(self, uri: str, icon: str, max_width: int = 75) -> str: + def _format_uri_display( + self, uri: str, icon: str, max_width: int = BOX_CONTENT_WIDTH + ) -> str: """Format URI for display with icon and truncation.""" if not uri: return "" @@ -82,7 +94,15 @@ def _format_uri_display(self, uri: str, icon: str, max_width: int = 75) -> str: class BoxedFormatter(SearchResultFormatter): - """Base class for boxed result formatters.""" + """Boxed formatter for search results with optional debug information.""" + + def __init__(self, show_debug: bool = False): + """Initialize formatter. + + Args: + show_debug: Whether to show debug information and sentence details + """ + self.show_debug = show_debug def format_results(self, results: List[DocumentResult], query: str) -> None: if not results: @@ -98,56 +118,39 @@ def format_results(self, results: List[DocumentResult], query: str) -> None: def _format_single_result(self, doc: DocumentResult, idx: int) -> None: """Format a single result with box layout.""" icon = self._get_file_icon(doc.document.uri or "") + snippet_text = doc.get_preview(max_chars=SNIPPET_MAX_LENGTH) + snippet_lines = self._clean_and_wrap_snippet(snippet_text) - # Get snippet from DocumentResult (handles sentence-based preview automatically) - snippet_text = doc.get_preview(max_chars=400) - - snippet_lines = self._clean_and_wrap_snippet( - snippet_text, width=75, max_length=400 - ) - - # Draw the result box header - header = f"┌─ Result #{idx} " + "─" * (67 - len(str(idx))) + # Draw box header + header = f"┌─ Result #{idx} " + "─" * (BOX_TOTAL_WIDTH - 10 - len(str(idx))) typer.echo(header) - # Display URI if available + # Display URI and debug info if doc.document.uri: - uri_display = self._format_uri_display(doc.document.uri, icon, 75) - typer.echo(f"│ {uri_display:<75}│") + uri_display = self._format_uri_display(doc.document.uri, icon) + typer.echo(f"│ {uri_display:<{BOX_CONTENT_WIDTH}}│") - # Add debug info if needed - debug_line = self._get_debug_line(doc) - if debug_line: - typer.echo(debug_line) + if self.show_debug: + self._print_debug_line(doc) - typer.echo("├" + "─" * 77 + "┤") - elif self._should_show_debug(): - debug_line = self._get_debug_line(doc) - if debug_line: - typer.echo(debug_line) - typer.echo("├" + "─" * 77 + "┤") + typer.echo("├" + "─" * BOX_TOTAL_WIDTH + "┤") + elif self.show_debug: + self._print_debug_line(doc) + typer.echo("├" + "─" * BOX_TOTAL_WIDTH + "┤") # Display snippet for line in snippet_lines: - typer.echo(f"│ {line:<75} │") + typer.echo(f"│ {line:<{BOX_CONTENT_WIDTH}} │") - typer.echo("└" + "─" * 77 + "┘") - typer.echo() + # Display sentence details in debug mode + if self.show_debug and doc.sentences: + self._print_sentence_details(doc) - def _get_debug_line(self, doc: DocumentResult) -> Optional[str]: - """Get debug information line. Override in subclasses.""" - return None - - def _should_show_debug(self) -> bool: - """Whether to show debug information. 
Override in subclasses.""" - return False - - -class BoxedDebugFormatter(BoxedFormatter): - """Modern detailed formatter with debug information in boxes.""" + typer.echo("└" + "─" * BOX_TOTAL_WIDTH + "┘") + typer.echo() - def _get_debug_line(self, doc: DocumentResult) -> str: - """Format debug metrics line.""" + def _print_debug_line(self, doc: DocumentResult) -> None: + """Print debug metrics line.""" combined = ( f"{doc.combined_rank:.5f}" if doc.combined_rank is not None else "N/A" ) @@ -161,88 +164,36 @@ def _get_debug_line(self, doc: DocumentResult) -> str: if doc.fts_rank is not None else "N/A" ) - return f"│ Combined: {combined} │ Vector: {vec_info} │ FTS: {fts_info}" - - def _should_show_debug(self) -> bool: - return True - - def _format_single_result(self, doc: DocumentResult, idx: int) -> None: - """Format a single result with box layout including sentence summary.""" - icon = self._get_file_icon(doc.document.uri or "") - - # Get snippet from DocumentResult (handles sentence-based preview automatically) - snippet_text = doc.get_preview(max_chars=400) - - snippet_lines = self._clean_and_wrap_snippet( - snippet_text, width=75, max_length=400 - ) + debug_line = f"│ Combined: {combined} │ Vector: {vec_info} │ FTS: {fts_info}" + typer.echo(debug_line) - # Draw the result box header - header = f"┌─ Result #{idx} " + "─" * (67 - len(str(idx))) - typer.echo(header) + def _print_sentence_details(self, doc: DocumentResult) -> None: + """Print sentence-level details.""" + typer.echo("├" + "─" * BOX_TOTAL_WIDTH + "┤") + typer.echo(f"│ Sentences:{' ' * (BOX_CONTENT_WIDTH - 10)}│") - # Display URI if available - if doc.document.uri: - uri_display = self._format_uri_display(doc.document.uri, icon, 75) - typer.echo(f"│ {uri_display:<75}│") - - # Add debug info - debug_line = self._get_debug_line(doc) - if debug_line: - typer.echo(debug_line) - - typer.echo("├" + "─" * 77 + "┤") - elif self._should_show_debug(): - debug_line = self._get_debug_line(doc) - if debug_line: - typer.echo(debug_line) - typer.echo("├" + "─" * 77 + "┤") - - # Display snippet preview - for line in snippet_lines: - typer.echo(f"│ {line:<75} │") - - # Display sentence details if available - if doc.sentences: - typer.echo("├" + "─" * 77 + "┤") - typer.echo( - "│ Sentences: │" + for sentence in doc.sentences[:MAX_SENTENCES_DISPLAY]: + distance_str = ( + f"{sentence.distance:.6f}" if sentence.distance is not None else "N/A" ) - - for sentence in doc.sentences[:5]: # Show max 5 sentences - distance_str = ( - f"{sentence.distance:.6f}" - if sentence.distance is not None - else "N/A" - ) - rank_str = f"#{sentence.rank}" if sentence.rank is not None else "N/A" - - # Extract sentence preview (first 50 chars) - if ( - sentence.start_offset is not None - and sentence.end_offset is not None - ): - sentence_text = doc.chunk_content[ - sentence.start_offset : sentence.end_offset - ].strip() - # Truncate and clean for display - sentence_preview = sentence_text.replace("\n", " ").replace( - "\r", "" + rank_str = f"#{sentence.rank}" if sentence.rank is not None else "N/A" + + # Extract sentence preview + if sentence.start_offset is not None and sentence.end_offset is not None: + sentence_text = doc.chunk_content[ + sentence.start_offset : sentence.end_offset + ].strip() + sentence_preview = sentence_text.replace("\n", " ").replace("\r", "") + if len(sentence_preview) > SENTENCE_PREVIEW_LENGTH: + sentence_preview = ( + sentence_preview[: SENTENCE_PREVIEW_LENGTH - 3] + "..." 
) - if len(sentence_preview) > 50: - sentence_preview = sentence_preview[:47] + "..." - else: - sentence_preview = "[No offset info]" - - # Format sentence line - sentence_line = ( - f"│ {rank_str:>3} ({distance_str}) | {sentence_preview}" - ) - # Pad to 78 chars and add closing border - typer.echo(sentence_line.ljust(78) + " │") - - typer.echo("└" + "─" * 77 + "┘") - typer.echo() + else: + sentence_preview = "[No offset info]" + + # Format and print sentence line + sentence_line = f"│ {rank_str:>3} ({distance_str}) | {sentence_preview}" + typer.echo(sentence_line.ljust(BOX_TOTAL_WIDTH + 1) + " │") class TableDebugFormatter(SearchResultFormatter): @@ -312,10 +263,15 @@ def _print_table_row(self, idx: int, doc: DocumentResult) -> None: def get_formatter( debug: bool = False, table_view: bool = False ) -> SearchResultFormatter: - """Factory function to get the appropriate formatter.""" + """Factory function to get the appropriate formatter. + + Args: + debug: Show debug information and sentence details + table_view: Use table format instead of boxed format + + Returns: + SearchResultFormatter instance + """ if table_view: return TableDebugFormatter() - elif debug: - return BoxedDebugFormatter() - else: - return BoxedFormatter() + return BoxedFormatter(show_debug=debug) diff --git a/src/sqlite_rag/models/document_result.py b/src/sqlite_rag/models/document_result.py index 0776592..f0c7dc8 100644 --- a/src/sqlite_rag/models/document_result.py +++ b/src/sqlite_rag/models/document_result.py @@ -9,7 +9,6 @@ class DocumentResult: document: Document chunk_id: int - chunk_content: str combined_rank: float vec_rank: float | None = None @@ -18,6 +17,8 @@ class DocumentResult: vec_distance: float | None = None fts_score: float | None = None + chunk_content: str = "" + # highlight sentences sentences: list[SentenceResult] = field(default_factory=list) @@ -40,18 +41,8 @@ def get_preview( # Fallback: no sentences, return truncated chunk content return self.chunk_content[:max_chars] - # Filter sentences that have offset information - sentences_with_offsets = [ - s - for s in top_sentences - if s.start_offset is not None and s.end_offset is not None - ] - - if not sentences_with_offsets: - return self.chunk_content[:max_chars] - # Sort by start_offset to maintain document order - sentences_with_offsets.sort( + top_sentences.sort( key=lambda s: s.start_offset if s.start_offset is not None else -1 ) @@ -59,17 +50,8 @@ def get_preview( total_chars = 0 prev_end_offset = None - for sentence in sentences_with_offsets: - sentence_text = self.chunk_content[ - sentence.start_offset : sentence.end_offset - ].strip() - - # Calculate remaining budget including potential separator - separator_len = len("[...] 
") if preview_parts else 0 - remaining = max_chars - total_chars - separator_len - - if remaining <= 0: - break + for sentence in top_sentences: + sentence_text = sentence.content if prev_end_offset is not None and sentence.start_offset is not None: gap_size = sentence.start_offset - prev_end_offset diff --git a/src/sqlite_rag/models/sentence_result.py b/src/sqlite_rag/models/sentence_result.py index 7094efd..d2ffa1d 100644 --- a/src/sqlite_rag/models/sentence_result.py +++ b/src/sqlite_rag/models/sentence_result.py @@ -6,6 +6,8 @@ class SentenceResult: id: int | None = None chunk_id: int | None = None + content: str = "" + rank: float | None = None distance: float | None = None diff --git a/tests/models/test_document_result.py b/tests/models/test_document_result.py index 7fb80fc..1246ce6 100644 --- a/tests/models/test_document_result.py +++ b/tests/models/test_document_result.py @@ -6,61 +6,34 @@ class TestDocumentResult: def test_get_preview_no_sentences(self): doc = Document(uri="test.txt", content="test content") - chunk_content = "This is a long piece of text. " * 50 result = DocumentResult( document=doc, chunk_id=1, - chunk_content=chunk_content, combined_rank=1.0, sentences=[], ) preview = result.get_preview(max_chars=100) - assert len(preview) == 100 - assert preview == chunk_content[:100] - - def test_get_preview_with_sentences_no_offsets(self): - doc = Document(uri="test.txt", content="test content") - chunk_content = "First sentence. Second sentence. Third sentence." - - # Sentences without offset information - sentences = [ - SentenceResult(chunk_id=1, id=1, rank=1, distance=0.1), - SentenceResult(chunk_id=1, id=2, rank=2, distance=0.2), - ] - - result = DocumentResult( - document=doc, - chunk_id=1, - chunk_content=chunk_content, - combined_rank=1.0, - sentences=sentences, - ) - - preview = result.get_preview(max_chars=100) - assert preview == chunk_content[:100] + assert preview == "" def test_get_preview_with_single_sentence(self): doc = Document(uri="test.txt", content="test content") - chunk_content = ( - "First sentence here. Second sentence there. Third sentence everywhere." - ) sentences = [ SentenceResult( chunk_id=1, id=2, + content="Second sentence there.", rank=1, distance=0.1, - start_offset=21, - end_offset=44, + start_offset=15, + end_offset=36, ), ] result = DocumentResult( document=doc, chunk_id=1, - chunk_content=chunk_content, combined_rank=1.0, sentences=sentences, ) @@ -71,35 +44,31 @@ def test_get_preview_with_single_sentence(self): def test_get_preview_with_gaps(self): """Test get_preview adds [...] separator for gaps.""" doc = Document(uri="test.txt", content="test content") - chunk_content = ( - "First sentence at the beginning." - "Some middle content that we skip over here." - "Last sentence at the end." - ) sentences = [ SentenceResult( chunk_id=1, id=1, + content="First sentence at the beginning.", rank=1, distance=0.1, start_offset=0, - end_offset=32, # "First sentence at the beginning." + end_offset=32, ), SentenceResult( chunk_id=1, id=3, + content="Last sentence at the end.", rank=2, distance=0.2, start_offset=75, - end_offset=103, # "Last sentence at the end." 
+                end_offset=103,
             ),
         ]
 
         result = DocumentResult(
             document=doc,
             chunk_id=1,
-            chunk_content=chunk_content,
             combined_rank=1.0,
             sentences=sentences,
         )
@@ -113,14 +82,13 @@ def test_get_preview_with_gaps(self):
     def test_get_preview_respects_max_chars(self):
         """Test get_preview truncates when exceeding max_chars."""
         doc = Document(uri="test.txt", content="test content")
-        chunk_content = (
-            "A very long sentence that exceeds the maximum character limit. " * 10
-        )
+        content = "A very long sentence that exceeds the maximum character limit. " * 10
 
         sentences = [
             SentenceResult(
                 chunk_id=1,
                 id=1,
+                content=content,
                 rank=1,
                 distance=0.1,
                 start_offset=0,
@@ -131,7 +99,6 @@ def test_get_preview_respects_max_chars(self):
         result = DocumentResult(
             document=doc,
             chunk_id=1,
-            chunk_content=chunk_content,
             combined_rank=1.0,
             sentences=sentences,
         )
@@ -141,12 +108,12 @@ def test_get_preview_respects_max_chars(self):
 
     def test_get_preview_with_multiple_consecutive_and_ordered_sentences(self):
         doc = Document(uri="test.txt", content="test content")
-        chunk_content = "First sentence. Second sentence. Third sentence."
 
         sentences = [
             SentenceResult(
                 chunk_id=1,
                 id=1,
+                content="First sentence.",
                 rank=1,
                 distance=0.1,
                 start_offset=0,
@@ -155,6 +122,7 @@ def test_get_preview_with_multiple_consecutive_and_ordered_sentences(self):
             SentenceResult(
                 chunk_id=1,
                 id=2,
+                content="Second sentence.",
                 rank=2,
                 distance=0.2,
                 start_offset=16,
@@ -165,7 +133,6 @@ def test_get_preview_with_multiple_consecutive_and_ordered_sentences(self):
         result = DocumentResult(
             document=doc,
             chunk_id=1,
-            chunk_content=chunk_content,
             combined_rank=1.0,
             sentences=sentences,
         )
@@ -176,32 +143,32 @@ def test_get_preview_with_multiple_consecutive_and_ordered_sentences(self):
     def test_get_preview_orders_sentences_by_offset(self):
         """Test get_preview reorders sentences by start_offset (document order)."""
         doc = Document(uri="test.txt", content="test content")
-        chunk_content = "First sentence. " + "x" * 50 + " Third sentence."
 
         # Sentences in reverse rank order (rank 1 is last in document)
         sentences = [
             SentenceResult(
                 chunk_id=1,
                 id=3,
+                content="Third sentence.",
                 rank=1,  # higher rank but appears later in the document
                 distance=0.1,
                 start_offset=66,
-                end_offset=82,  # "Third sentence."
+                end_offset=82,
             ),
             SentenceResult(
                 chunk_id=1,
                 id=1,
+                content="First sentence.",
                 rank=2,
                 distance=0.2,
                 start_offset=0,
-                end_offset=15,  # "First sentence."
+                end_offset=15,
             ),
         ]
 
         result = DocumentResult(
             document=doc,
             chunk_id=1,
-            chunk_content=chunk_content,
             combined_rank=1.0,
             sentences=sentences,
         )
@@ -213,31 +180,59 @@ def test_get_preview_orders_sentences_by_offset(self):
     def test_get_preview_limits_to_top_k_sentences(self):
         """Test get_preview respects top_k_sentences parameter."""
         doc = Document(uri="test.txt", content="test content")
-        chunk_content = "First. Second. Third. Fourth. Fifth."
# 5 sentences, but only top 2 should be used sentences = [ SentenceResult( - chunk_id=1, id=1, rank=1, distance=0.1, start_offset=0, end_offset=6 + chunk_id=1, + id=1, + content="First.", + rank=1, + distance=0.1, + start_offset=0, + end_offset=6, ), SentenceResult( - chunk_id=1, id=2, rank=2, distance=0.2, start_offset=7, end_offset=14 + chunk_id=1, + id=2, + content="Second.", + rank=2, + distance=0.2, + start_offset=7, + end_offset=14, ), SentenceResult( - chunk_id=1, id=3, rank=3, distance=0.3, start_offset=15, end_offset=21 + chunk_id=1, + id=3, + content="Third.", + rank=3, + distance=0.3, + start_offset=15, + end_offset=21, ), SentenceResult( - chunk_id=1, id=4, rank=4, distance=0.4, start_offset=22, end_offset=29 + chunk_id=1, + id=4, + content="Fourth.", + rank=4, + distance=0.4, + start_offset=22, + end_offset=29, ), SentenceResult( - chunk_id=1, id=5, rank=5, distance=0.5, start_offset=30, end_offset=36 + chunk_id=1, + id=5, + content="Fifth.", + rank=5, + distance=0.5, + start_offset=30, + end_offset=36, ), ] result = DocumentResult( document=doc, chunk_id=1, - chunk_content=chunk_content, combined_rank=1.0, sentences=sentences, ) diff --git a/tests/test_formatters.py b/tests/test_formatters.py new file mode 100644 index 0000000..c738c91 --- /dev/null +++ b/tests/test_formatters.py @@ -0,0 +1,299 @@ +from sqlite_rag.formatters import ( + BoxedFormatter, + TableDebugFormatter, + get_formatter, +) +from sqlite_rag.models.document import Document +from sqlite_rag.models.document_result import DocumentResult +from sqlite_rag.models.sentence_result import SentenceResult + + +class TestGetFormatter: + """Test the get_formatter factory function.""" + + def test_get_formatter_default(self): + """Test getting formatter with default parameters.""" + formatter = get_formatter() + assert isinstance(formatter, BoxedFormatter) + assert formatter.show_debug is False + + def test_get_formatter_debug(self): + """Test getting formatter with debug=True.""" + formatter = get_formatter(debug=True) + assert isinstance(formatter, BoxedFormatter) + assert formatter.show_debug is True + + def test_get_formatter_table_view(self): + """Test getting table formatter.""" + formatter = get_formatter(table_view=True) + assert isinstance(formatter, TableDebugFormatter) + + def test_get_formatter_table_view_takes_precedence(self): + """Test that table_view takes precedence over debug.""" + formatter = get_formatter(debug=True, table_view=True) + assert isinstance(formatter, TableDebugFormatter) + # Table formatter doesn't have show_debug attribute + + +class TestSearchResultFormatter: + """Test base SearchResultFormatter methods.""" + + def setup_method(self): + """Set up test fixtures.""" + self.formatter = BoxedFormatter() + + def test_get_file_icon_python(self): + """Test getting icon for Python files.""" + assert self.formatter._get_file_icon("test.py") == "🐍" + assert self.formatter._get_file_icon("test.pyx") == "🐍" + + def test_get_file_icon_javascript(self): + """Test getting icon for JavaScript/TypeScript files.""" + assert self.formatter._get_file_icon("test.js") == "⚡" + assert self.formatter._get_file_icon("test.ts") == "⚡" + assert self.formatter._get_file_icon("test.jsx") == "⚡" + assert self.formatter._get_file_icon("test.tsx") == "⚡" + + def test_get_file_icon_markdown(self): + """Test getting icon for Markdown files.""" + assert self.formatter._get_file_icon("README.md") == "📄" + assert self.formatter._get_file_icon("doc.markdown") == "📄" + + def test_get_file_icon_case_insensitive(self): + 
"""Test that file icon detection is case insensitive.""" + assert self.formatter._get_file_icon("TEST.PY") == "🐍" + assert self.formatter._get_file_icon("Test.Js") == "⚡" + + def test_get_file_icon_empty_uri(self): + """Test getting icon for empty URI.""" + assert self.formatter._get_file_icon("") == "📝" + + def test_get_file_icon_unknown_extension(self): + """Test getting default icon for unknown extensions.""" + assert self.formatter._get_file_icon("test.xyz") == "📄" + + def test_clean_and_wrap_snippet_basic(self): + """Test basic snippet cleaning and wrapping.""" + snippet = "This is a simple test snippet." + result = self.formatter._clean_and_wrap_snippet(snippet, width=30) + assert len(result) > 0 + assert all(len(line) <= 30 for line in result) + + def test_clean_and_wrap_snippet_removes_newlines(self): + """Test that newlines and carriage returns are removed.""" + snippet = "Line 1\nLine 2\r\nLine 3" + result = self.formatter._clean_and_wrap_snippet(snippet) + combined = " ".join(result) + assert "\n" not in combined + assert "\r" not in combined + assert "Line 1 Line 2 Line 3" == combined + + def test_clean_and_wrap_snippet_truncates_long_text(self): + """Test that long snippets are truncated.""" + snippet = "A" * 500 + result = self.formatter._clean_and_wrap_snippet(snippet, max_length=100) + combined = "".join(result) + assert len(combined) <= 103 # 100 + "..." + assert combined.endswith("...") + + def test_format_uri_display_basic(self): + """Test basic URI formatting.""" + uri_display = self.formatter._format_uri_display( + "path/to/file.py", "🐍", max_width=50 + ) + assert uri_display == "🐍 path/to/file.py" + + def test_format_uri_display_truncates_long_uri(self): + """Test that long URIs are truncated.""" + long_uri = "very/long/path/" * 10 + "file.py" + uri_display = self.formatter._format_uri_display(long_uri, "🐍", max_width=50) + assert len(uri_display) <= 50 + assert uri_display.startswith("🐍 ...") + + def test_format_uri_display_empty_uri(self): + """Test formatting empty URI.""" + assert self.formatter._format_uri_display("", "🐍") == "" + + +class TestBoxedFormatter: + """Test BoxedFormatter functionality.""" + + def test_init_default(self): + """Test BoxedFormatter initialization with default parameters.""" + formatter = BoxedFormatter() + assert formatter.show_debug is False + + def test_init_with_debug(self): + """Test BoxedFormatter initialization with debug enabled.""" + formatter = BoxedFormatter(show_debug=True) + assert formatter.show_debug is True + + def test_format_results_empty(self, mocker): + """Test formatting with empty results.""" + formatter = BoxedFormatter() + mock_echo = mocker.patch("typer.echo") + formatter.format_results([], "test query") + mock_echo.assert_called_once_with("No documents found matching the query.") + + def test_format_results_with_results(self, mocker): + """Test formatting with actual results.""" + doc = Document(uri="test.py", content="test content") + result = DocumentResult( + document=doc, + chunk_id=1, + chunk_content="This is test content.", + combined_rank=0.95, + vec_rank=1, + fts_rank=2, + vec_distance=0.1, + fts_score=5.0, + ) + + formatter = BoxedFormatter() + mock_echo = mocker.patch("typer.echo") + formatter.format_results([result], "test query") + # Should print header, result box, and empty line + assert mock_echo.call_count > 3 + # Check that it prints the search results header + first_call = mock_echo.call_args_list[0][0][0] + assert "Search Results" in first_call + assert "1 matches" in first_call + + def 
test_format_results_with_debug(self, mocker): + """Test formatting with debug information.""" + doc = Document(uri="test.py", content="test content") + result = DocumentResult( + document=doc, + chunk_id=1, + chunk_content="This is test content.", + combined_rank=0.95, + vec_rank=1, + fts_rank=2, + vec_distance=0.123456, + fts_score=5.678901, + ) + + formatter = BoxedFormatter(show_debug=True) + mock_echo = mocker.patch("typer.echo") + formatter.format_results([result], "test query") + # Check that debug info is printed + output = "\n".join( + [ + str(call.args[0]) if call.args else "" + for call in mock_echo.call_args_list + ] + ) + assert "Combined:" in output + assert "Vector:" in output + assert "FTS:" in output + + def test_format_results_with_sentences_in_debug_mode(self, mocker): + """Test formatting with sentence details in debug mode.""" + doc = Document(uri="test.py", content="test content") + sentences = [ + SentenceResult( + id=1, + chunk_id=1, + content="First sentence.", + rank=1, + distance=0.1, + start_offset=0, + end_offset=15, + ), + SentenceResult( + id=2, + chunk_id=1, + content="Second sentence.", + rank=2, + distance=0.2, + start_offset=16, + end_offset=32, + ), + ] + result = DocumentResult( + document=doc, + chunk_id=1, + chunk_content="First sentence. Second sentence.", + combined_rank=0.95, + sentences=sentences, + ) + + formatter = BoxedFormatter(show_debug=True) + mock_echo = mocker.patch("typer.echo") + formatter.format_results([result], "test query") + output = "\n".join( + [ + str(call.args[0]) if call.args else "" + for call in mock_echo.call_args_list + ] + ) + assert "Sentences:" in output + + def test_format_results_without_sentences_in_non_debug_mode(self, mocker): + """Test that sentences are not shown in non-debug mode.""" + doc = Document(uri="test.py", content="test content") + sentences = [ + SentenceResult( + id=1, + chunk_id=1, + content="First sentence.", + rank=1, + distance=0.1, + start_offset=0, + end_offset=15, + ), + ] + result = DocumentResult( + document=doc, + chunk_id=1, + chunk_content="First sentence.", + combined_rank=0.95, + sentences=sentences, + ) + + formatter = BoxedFormatter(show_debug=False) + mock_echo = mocker.patch("typer.echo") + formatter.format_results([result], "test query") + output = "\n".join( + [ + str(call.args[0]) if call.args else "" + for call in mock_echo.call_args_list + ] + ) + assert "Sentences:" not in output + + +class TestTableDebugFormatter: + """Test TableDebugFormatter functionality.""" + + def test_format_results_empty(self, mocker): + """Test table formatting with empty results.""" + formatter = TableDebugFormatter() + mock_echo = mocker.patch("typer.echo") + formatter.format_results([], "test query") + mock_echo.assert_called_once_with("No documents found matching the query.") + + def test_format_results_with_results(self, mocker): + """Test table formatting with actual results.""" + doc = Document(uri="test.py", content="test content") + result = DocumentResult( + document=doc, + chunk_id=1, + chunk_content="This is test content.", + combined_rank=0.95, + vec_rank=1, + fts_rank=2, + vec_distance=0.1, + fts_score=5.0, + ) + + formatter = TableDebugFormatter() + mock_echo = mocker.patch("typer.echo") + formatter.format_results([result], "test query") + # Should print header, table header, separator, and row + assert mock_echo.call_count >= 4 + # Check that headers are printed + output = "\n".join([str(call[0][0]) for call in mock_echo.call_args_list]) + assert "Preview" in output + assert "URI" in 
output + assert "C.Rank" in output From c9ee5ddb985f9f5574b6a8bc5c6849c4d2e95664 Mon Sep 17 00:00:00 2001 From: Daniele Briggi <=> Date: Tue, 21 Oct 2025 10:06:16 +0000 Subject: [PATCH 7/7] fix(seg-fault): attempt --- .devcontainer/devcontainer.json | 20 ------------- .devcontainer/py3.11/devcontainer.json | 38 ++++++++++++++++++++++++ .github/workflows/test.yaml | 4 +++ src/sqlite_rag/models/document_result.py | 5 ++-- tests/conftest.py | 12 ++++++-- 5 files changed, 55 insertions(+), 24 deletions(-) create mode 100644 .devcontainer/py3.11/devcontainer.json diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b22273d..855ba7b 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,18 +1,6 @@ { "name": "Python 3.10", "image": "mcr.microsoft.com/devcontainers/python:3.10", - "runArgs": [ - "--runtime", - "nvidia", - "--gpus", - "all", - // optional but make sure CUDA workloads are available - "--env", - "NVIDIA_VISIBLE_DEVICES=all", - // optional but make sure CUDA workloads are available - "--env", - "NVIDIA_DRIVER_CAPABILITIES=compute,utility" - ], "customizations": { "vscode": { "extensions": [ @@ -26,13 +14,5 @@ "hbenl.vscode-test-explorer" ] } - }, - "hostRequirements": { - "gpu": "optional" - }, - "remoteEnv": { - // optional but make sure CUDA workloads are available - "NVIDIA_VISIBLE_DEVICES": "all", - "NVIDIA_DRIVER_CAPABILITIES": "compute,utility" } } diff --git a/.devcontainer/py3.11/devcontainer.json b/.devcontainer/py3.11/devcontainer.json new file mode 100644 index 0000000..42bb225 --- /dev/null +++ b/.devcontainer/py3.11/devcontainer.json @@ -0,0 +1,38 @@ +{ + "name": "Python 3.11", + "image": "mcr.microsoft.com/devcontainers/python:3.11", + "runArgs": [ + "--runtime", + "nvidia", + "--gpus", + "all", + // optional but make sure CUDA workloads are available + "--env", + "NVIDIA_VISIBLE_DEVICES=all", + // optional but make sure CUDA workloads are available + "--env", + "NVIDIA_DRIVER_CAPABILITIES=compute,utility" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.black-formatter", + "ms-python.flake8", + "ms-python.isort", + "ms-python.vscode-pylance", + "ms-python.python", + "ms-python.debugpy", + "ms-python.vscode-python-envs", + "hbenl.vscode-test-explorer" + ] + } + }, + "hostRequirements": { + "gpu": "optional" + }, + "remoteEnv": { + // optional but make sure CUDA workloads are available + "NVIDIA_VISIBLE_DEVICES": "all", + "NVIDIA_DRIVER_CAPABILITIES": "compute,utility" + } +} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 36c77c2..a929f23 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -49,6 +49,10 @@ jobs: - name: Test # Using default directory for models + # COVERAGE_CORE=pytrace: Workaround for Python 3.11 segfault with SQLite extensions + C tracer + # See: https://github.com/nedbat/coveragepy/issues/1665 + env: + COVERAGE_CORE: ${{ matrix.python-version == '3.11' && 'pytrace' || '' }} run: | pytest --cov --cov-branch --cov-report=xml -v -m "not slow" ./tests diff --git a/src/sqlite_rag/models/document_result.py b/src/sqlite_rag/models/document_result.py index f0c7dc8..07b364f 100644 --- a/src/sqlite_rag/models/document_result.py +++ b/src/sqlite_rag/models/document_result.py @@ -42,8 +42,9 @@ def get_preview( return self.chunk_content[:max_chars] # Sort by start_offset to maintain document order - top_sentences.sort( - key=lambda s: s.start_offset if s.start_offset is not None else -1 + top_sentences = sorted( + 
top_sentences,
+            key=lambda s: s.start_offset if s.start_offset is not None else -1,
         )
 
         preview_parts = []
diff --git a/tests/conftest.py b/tests/conftest.py
index 04a3596..2bd096a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,6 @@
 import sqlite3
 import tempfile
+from collections.abc import Generator
 
 import pytest
 
@@ -26,7 +27,7 @@ def db_conn():
 
 
 @pytest.fixture
-def engine(db_conn) -> Engine:
+def engine(db_conn) -> Generator[Engine, None, None]:
     conn, settings = db_conn
 
     engine = Engine(
@@ -39,4 +40,11 @@ def engine(db_conn) -> Engine:
     engine.quantize()
     engine.create_new_context()
 
-    return engine
+    yield engine
+
+    # Clean up resources to prevent segfaults in Python 3.11
+    # Must explicitly free resources before garbage collection
+    try:
+        engine.close()
+    except Exception:
+        pass
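
Note on the teardown added above: engine.close() is called by the fixture, but
its implementation is not included in this patch. Below is a minimal sketch of
what it could look like, assuming it simply reverses the fixture's setup steps
(create_new_context() and quantize()) through the engine's existing
free_context() and quantize_cleanup() helpers; the body shown is an
assumption, not the project's actual method:

    # Hypothetical Engine.close(), not part of this patch: release the
    # native resources held by the SQLite extension while the connection
    # is still open, instead of leaving them to be freed implicitly at
    # interpreter shutdown (the suspected trigger of the 3.11 segfaults).
    def close(self) -> None:
        self.free_context()      # undo create_new_context(): llm_context_free()
        self.quantize_cleanup()  # undo quantize(): vector_quantize_cleanup(...)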