2 changes: 2 additions & 0 deletions .gitignore
@@ -29,6 +29,8 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
.venv/
venv/

# PyInstaller
*.manifest
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
"markitdown[pptx]",
"markitdown[xls]",
"markitdown[xlsx]",
"python-frontmatter",
"prompt-toolkit",
"sqlite-ai",
"sqliteai-vector"
@@ -34,7 +35,7 @@ dependencies = [
dev = [
"pytest",
"pytest-mock",
"pytest-cov==6.3.0",
"pytest-cov",
"black",
"flake8",
"bandit",
2 changes: 1 addition & 1 deletion src/sqlite_rag/database.py
@@ -29,7 +29,7 @@ def initialize(conn: sqlite3.Connection, settings: Settings) -> sqlite3.Connecti
            )
        )
        conn.load_extension(
            str(importlib.resources.files("sqlite-vector.binaries") / "vector")
            str(importlib.resources.files("sqlite_vector.binaries") / "vector")
        )
    except sqlite3.OperationalError as e:
        raise RuntimeError(
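For reference, the corrected call works because importlib.resources.files() takes a dotted, importable package path, and module names use underscores rather than the hyphenated distribution name. A minimal sketch, assuming the sqliteai-vector wheel ships its compiled extension as a sqlite_vector.binaries package (the helper name below is illustrative):

import importlib.resources
import sqlite3


def load_vector_extension(conn: sqlite3.Connection) -> None:
    # "sqlite_vector.binaries" is a module path (underscores); the hyphenated
    # distribution name "sqlite-vector.binaries" is not importable and would
    # raise ModuleNotFoundError.
    extension_path = importlib.resources.files("sqlite_vector.binaries") / "vector"
    conn.enable_load_extension(True)
    conn.load_extension(str(extension_path))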
37 changes: 37 additions & 0 deletions src/sqlite_rag/extractor.py
@@ -0,0 +1,37 @@
from pathlib import Path
from typing import Dict, Optional, Tuple

from sqlite_rag.extractors.base import MetadataExtractor
from sqlite_rag.extractors.frontmatter import FrontmatterExtractor


class Extractor:
    extractors = [
        FrontmatterExtractor(),
    ]

    def get_extractor(self, file_extension: str) -> Optional[MetadataExtractor]:
        """Get the appropriate extractor based on file type."""
        for extractor in self.extractors:
            if extractor.supports_file_type(file_extension):
                return extractor

        return None

    def extract_metadata(self, content: str, file_path: Path) -> Tuple[str, Dict]:
        """Extract metadata and clean content based on file type.

        Args:
            content: Raw content to extract metadata from
            file_path: Path to the file for context

        Returns:
            Tuple of (clean_content, metadata_dict)
        """
        file_extension = file_path.suffix

        extractor = self.get_extractor(file_extension)
        if extractor:
            return extractor.extract(content, file_path)

        return content, {}
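A minimal usage sketch of the dispatcher above, matching the behaviour exercised by the new tests later in this diff: extract_metadata() picks an extractor by file extension and otherwise returns the content untouched with empty metadata.

from pathlib import Path

from sqlite_rag.extractor import Extractor

extractor = Extractor()

# Markdown goes through FrontmatterExtractor.
clean, meta = extractor.extract_metadata(
    "---\ntitle: Demo\n---\nBody text", Path("note.md")
)
# clean == "Body text", meta == {"title": "Demo"}

# No extractor is registered for ".html", so content passes through unchanged.
html, no_meta = extractor.extract_metadata("<p>hi</p>", Path("page.html"))
# html == "<p>hi</p>", no_meta == {}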
Empty file.
32 changes: 32 additions & 0 deletions src/sqlite_rag/extractors/base.py
@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Optional, Tuple


class MetadataExtractor(ABC):
    """Base interface for metadata extractors."""

    @abstractmethod
    def extract(
        self, content: str, file_path: Optional[Path] = None
    ) -> Tuple[str, Dict]:
        """Extract metadata from content.

        Args:
            content: The raw content to extract metadata from
            file_path: Optional file path for context

        Returns:
            Tuple of (clean_content, metadata_dict)
        """

    @abstractmethod
    def supports_file_type(self, file_extension: str) -> bool:
        """Check if this extractor supports the given file type.

        Args:
            file_extension: File extension (e.g., '.md', '.pdf')

        Returns:
            True if this extractor can handle the file type
        """
27 changes: 27 additions & 0 deletions src/sqlite_rag/extractors/frontmatter.py
@@ -0,0 +1,27 @@
from pathlib import Path
from typing import Dict, Optional, Tuple

import frontmatter

from sqlite_rag.extractors.base import MetadataExtractor


class FrontmatterExtractor(MetadataExtractor):
    """Extracts frontmatter from markdown files."""

    def extract(
        self, content: str, file_path: Optional[Path] = None
    ) -> Tuple[str, Dict]:
        """Extract frontmatter from markdown content."""
        try:
            post = frontmatter.loads(content)
            clean_content = post.content
            metadata = dict(post.metadata)
            return clean_content, metadata
        except Exception:
            # If frontmatter parsing fails, return original content
            return content, {}

    def supports_file_type(self, file_extension: str) -> bool:
        """Support markdown files."""
        return file_extension.lower() in [".md", ".mdx", ".txt"]
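The parsing itself is done by the python-frontmatter dependency added in pyproject.toml: frontmatter.loads() returns a Post whose .metadata holds the parsed YAML block and whose .content holds the body with that block stripped. A small illustration of the two paths handled above:

import frontmatter

post = frontmatter.loads("---\ntitle: Hello\ntags: [a, b]\n---\nBody line.")
print(post.metadata)  # {'title': 'Hello', 'tags': ['a', 'b']}
print(post.content)   # 'Body line.'

# A malformed YAML block makes frontmatter.loads() raise, which the broad
# except in extract() turns into (original_content, {}); that is the case
# covered by test_extract_malformed_frontmatter below.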
1 change: 1 addition & 0 deletions src/sqlite_rag/reader.py
@@ -47,6 +47,7 @@ def is_supported(path: Path) -> bool:

    @staticmethod
    def parse_file(path: Path, max_document_size_bytes: Optional[int] = None) -> str:
        """Read the file and convert it into Markdown text."""
        try:
            converter = MarkItDown()
            text = converter.convert(
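For context on the new docstring: parse_file() delegates the conversion to MarkItDown. A rough sketch of the call it wraps (the helper name and exact arguments are simplifications, not the project's code):

from pathlib import Path

from markitdown import MarkItDown


def to_markdown(path: Path) -> str:
    converter = MarkItDown()
    result = converter.convert(str(path))
    return result.text_content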
27 changes: 26 additions & 1 deletion src/sqlite_rag/sqliterag.py
@@ -3,6 +3,7 @@
from pathlib import Path
from typing import Any, Optional

from sqlite_rag.extractor import Extractor
from sqlite_rag.logger import Logger
from sqlite_rag.models.document_result import DocumentResult

@@ -25,6 +26,7 @@ def __init__(self, connection: sqlite3.Connection, settings: Settings):
        self._repository = Repository(self._conn, settings)
        self._chunker = Chunker(self._conn, settings)
        self._engine = Engine(self._conn, settings, chunker=self._chunker)
        self._extractor = Extractor()

        self.ready = False

@@ -103,7 +105,16 @@ def add(
            if use_relative_paths
            else str(file_path.absolute())
        )
        document = Document(content=content, uri=uri, metadata=metadata.copy())

        content, file_metadata = self._extractor.extract_metadata(
            content, file_path
        )

        merged_metadata = metadata.copy()
        if file_metadata:
            merged_metadata["extracted"] = file_metadata

        document = Document(content=content, uri=uri, metadata=merged_metadata)

        exists = self._repository.document_exists_by_hash(document.hash())
        if exists:
@@ -187,7 +198,21 @@ def rebuild(self, remove_missing: bool = False) -> dict:
            content = FileReader.parse_file(
                Path(doc.uri), self._settings.max_document_size_bytes
            )

            if not content:
                self._logger.warning(
                    f"{i+1}/{total_docs} Skipping empty file: {doc.uri}"
                )
                not_found += 1
                continue

            content, file_metadata = self._extractor.extract_metadata(
                content, Path(doc.uri)
            )

            doc.content = content
            if file_metadata:
                doc.metadata["extracted"] = file_metadata

            self._repository.remove_document(doc_id)
            processed_doc = self._engine.process(doc)
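Both add() and rebuild() now run the file content through the extractor before building the Document, and nest whatever it finds under an "extracted" key so caller-supplied metadata is never overwritten. For the markdown sample used in the tests, and assuming the caller passed metadata={"source": "docs"}, the stored metadata would look like:

merged_metadata = {
    "source": "docs",  # caller-supplied keys stay at the top level
    "extracted": {     # frontmatter pulled out of the file itself
        "title": "Sample Document",
        "author": "Test Author",
    },
}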
52 changes: 52 additions & 0 deletions tests/extractors/test_frontmatter.py
@@ -0,0 +1,52 @@
from sqlite_rag.extractors.frontmatter import FrontmatterExtractor


class TestFrontmatterExtractor:
    def test_extract_with_frontmatter(self):
        content = """---
title: Test Document
author: John Doe
---
# Heading 1
This is a test document.
"""
        extractor = FrontmatterExtractor()
        clean_content, metadata = extractor.extract(content)
        assert "title" in metadata
        assert metadata["title"] == "Test Document"
        assert "author" in metadata
        assert metadata["author"] == "John Doe"
        assert "# Heading 1" in clean_content
        assert "This is a test document." in clean_content

    def test_extract_without_frontmatter(self):
        content = """# Heading 1
This is a test document without frontmatter.
"""
        extractor = FrontmatterExtractor()
        clean_content, metadata = extractor.extract(content)
        assert metadata == {}
        assert "# Heading 1" in clean_content
        assert "This is a test document without frontmatter." in clean_content

    def test_supports_file_type(self):
        extractor = FrontmatterExtractor()
        assert extractor.supports_file_type(".md")
        assert extractor.supports_file_type(".MDX")
        assert extractor.supports_file_type(".txt")
        assert not extractor.supports_file_type(".pdf")
        assert not extractor.supports_file_type(".html")

    def test_extract_malformed_frontmatter(self):
        content = """---
title: Test Document
author John Doe
---
# Heading 1
"""
        extractor = FrontmatterExtractor()
        clean_content, metadata = extractor.extract(content)
        # Should return original content and empty metadata on failure
        assert metadata == {}
        assert content == clean_content
        assert "# Heading 1" in clean_content
40 changes: 40 additions & 0 deletions tests/test_extractor.py
@@ -0,0 +1,40 @@
from pathlib import Path

from sqlite_rag.extractor import Extractor


class TestExtractor:
    def test_extract_metadata_from_md(self):
        extractor = Extractor()
        content = """---
title: Sample Document
author: Test Author
---
# Heading 1
This is a sample markdown document.
"""
        file_path = Path("sample.md")
        clean_content, metadata = extractor.extract_metadata(content, file_path)
        assert "title" in metadata
        assert metadata["title"] == "Sample Document"
        assert "author" in metadata
        assert metadata["author"] == "Test Author"
        assert "# Heading 1" in clean_content
        assert "This is a sample markdown document." in clean_content

    def test_no_extractor_for_unsupported_file(self):
        extractor = Extractor()
        content = "<html><body>This is HTML content.</body></html>"
        file_path = Path("sample.html")
        clean_content, metadata = extractor.extract_metadata(content, file_path)
        assert clean_content == content
        assert metadata == {}

    def test_get_extractor(self):
        extractor = Extractor()
        md_extractor = extractor.get_extractor(".md")
        assert md_extractor is not None
        assert md_extractor.supports_file_type(".md")

        html_extractor = extractor.get_extractor(".html")
        assert html_extractor is None
80 changes: 78 additions & 2 deletions tests/test_sqlite_rag.py
@@ -9,7 +9,7 @@
from sqlite_rag.settings import Settings


class TestSQLiteRag:
class TestSQLiteRagAdd:
    def test_add_simple_text_file(self):
        # test file
        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
@@ -270,6 +270,40 @@ def test_add_text_with_metadata(self):
            }
        )

    def test_add_markdown_with_frontmatter(self):
        with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
            f.write(
                """---
title: Sample Document
author: Test Author
---
# Heading 1
This is a sample markdown document.
"""
            )
            temp_file_path = f.name

        rag = SQLiteRag.create(":memory:")

        rag.add(temp_file_path)

        conn = rag._conn
        cursor = conn.execute("SELECT content, metadata FROM documents")
        doc = cursor.fetchone()

        assert doc
        assert "# Heading 1" in doc[0]
        assert "This is a sample markdown document." in doc[0]

        metadata = json.loads(doc[1])
        assert "extracted" in metadata
        assert "title" in metadata["extracted"]
        assert metadata["extracted"]["title"] == "Sample Document"
        assert "author" in metadata["extracted"]
        assert metadata["extracted"]["author"] == "Test Author"


class TestSQLiteRag:
    def test_list_documents(self):
        rag = SQLiteRag.create(":memory:")

@@ -545,6 +579,48 @@ def test_rebuild_text_documents(self):
        documents = rag.list_documents()
        assert len(documents) == 1

    def test_rebuild_with_md_frontmatter(self):
        """Test rebuild with markdown files that have frontmatter."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f1:
            f1.write(
                """---
title: Document 1
author: Author 1
---
# Heading 1
Content of document 1.
"""
            )
            file1_path = f1.name
        with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f2:
            f2.write(
                """# Heading 2

Content of document 2.
"""
            )
            file2_path = f2.name

        rag = SQLiteRag.create(":memory:")
        rag.add(file1_path)
        rag.add(file2_path)

        result = rag.rebuild()

        assert result["total"] == 2
        assert result["reprocessed"] == 2
        assert result["not_found"] == 0
        assert result["removed"] == 0

        documents = rag.list_documents()
        assert len(documents) == 2

        titles = [
            doc.metadata.get("extracted", {}).get("title", "") for doc in documents
        ]
        assert "Document 1" in titles
        assert "Document 2" not in titles  # No frontmatter title

    def test_reset_database(self):
        temp_file_path = os.path.join(tempfile.mkdtemp(), "something")

@@ -621,7 +697,7 @@ def test_search_samples_exact_match_by_scan_type(self, quantize_scan: bool):
        sample_files = list(samples_dir.glob("*.txt"))

        for sample_file in sample_files:
            file_content = sample_file.read_text(encoding="utf-8")
            file_content = sample_file.read_text(encoding="utf-8").rstrip("\n")

            # Search for the exact content
            results = rag.search(file_content, top_k=2)