From 0d80f1537d67473c3ae07bdb1b300f736345835e Mon Sep 17 00:00:00 2001 From: Daniele Briggi <=> Date: Tue, 30 Sep 2025 09:07:02 +0000 Subject: [PATCH] feat(extractor): file specific extractor Add extractor of md frontmatter for metadata --- .gitignore | 2 + pyproject.toml | 3 +- src/sqlite_rag/database.py | 2 +- src/sqlite_rag/extractor.py | 37 +++++++++++ src/sqlite_rag/extractors/__init__.py | 0 src/sqlite_rag/extractors/base.py | 32 ++++++++++ src/sqlite_rag/extractors/frontmatter.py | 27 ++++++++ src/sqlite_rag/reader.py | 1 + src/sqlite_rag/sqliterag.py | 27 +++++++- tests/extractors/test_frontmatter.py | 52 +++++++++++++++ tests/test_extractor.py | 40 ++++++++++++ tests/test_sqlite_rag.py | 80 +++++++++++++++++++++++- 12 files changed, 298 insertions(+), 5 deletions(-) create mode 100644 src/sqlite_rag/extractor.py create mode 100644 src/sqlite_rag/extractors/__init__.py create mode 100644 src/sqlite_rag/extractors/base.py create mode 100644 src/sqlite_rag/extractors/frontmatter.py create mode 100644 tests/extractors/test_frontmatter.py create mode 100644 tests/test_extractor.py diff --git a/.gitignore b/.gitignore index 3946243..d9b8e71 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,8 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +.venv/ +venv/ # PyInstaller *.manifest diff --git a/pyproject.toml b/pyproject.toml index 8bc080d..a643cd9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "markitdown[pptx]", "markitdown[xls]", "markitdown[xlsx]", + "python-frontmatter", "prompt-toolkit", "sqlite-ai", "sqliteai-vector" @@ -34,7 +35,7 @@ dependencies = [ dev = [ "pytest", "pytest-mock", - "pytest-cov==6.3.0", + "pytest-cov", "black", "flake8", "bandit", diff --git a/src/sqlite_rag/database.py b/src/sqlite_rag/database.py index fb92ddf..8ad1791 100644 --- a/src/sqlite_rag/database.py +++ b/src/sqlite_rag/database.py @@ -29,7 +29,7 @@ def initialize(conn: sqlite3.Connection, settings: Settings) -> sqlite3.Connecti ) ) conn.load_extension( - str(importlib.resources.files("sqlite-vector.binaries") / "vector") + str(importlib.resources.files("sqlite_vector.binaries") / "vector") ) except sqlite3.OperationalError as e: raise RuntimeError( diff --git a/src/sqlite_rag/extractor.py b/src/sqlite_rag/extractor.py new file mode 100644 index 0000000..13d78fc --- /dev/null +++ b/src/sqlite_rag/extractor.py @@ -0,0 +1,37 @@ +from pathlib import Path +from typing import Dict, Optional, Tuple + +from sqlite_rag.extractors.base import MetadataExtractor +from sqlite_rag.extractors.frontmatter import FrontmatterExtractor + + +class Extractor: + extractors = [ + FrontmatterExtractor(), + ] + + def get_extractor(self, file_extension: str) -> Optional[MetadataExtractor]: + """Get the appropriate extractor based on file type.""" + for extractor in self.extractors: + if extractor.supports_file_type(file_extension): + return extractor + + return None + + def extract_metadata(self, content: str, file_path: Path) -> Tuple[str, Dict]: + """Extract metadata and clean content based on file type. + + Args: + content: Raw content to extract metadata from + file_path: Path to the file for context + + Returns: + Tuple of (clean_content, metadata_dict) + """ + file_extension = file_path.suffix + + extractor = self.get_extractor(file_extension) + if extractor: + return extractor.extract(content, file_path) + + return content, {} diff --git a/src/sqlite_rag/extractors/__init__.py b/src/sqlite_rag/extractors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/sqlite_rag/extractors/base.py b/src/sqlite_rag/extractors/base.py new file mode 100644 index 0000000..a446c83 --- /dev/null +++ b/src/sqlite_rag/extractors/base.py @@ -0,0 +1,32 @@ +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Dict, Optional, Tuple + + +class MetadataExtractor(ABC): + """Base interface for metadata extractors.""" + + @abstractmethod + def extract( + self, content: str, file_path: Optional[Path] = None + ) -> Tuple[str, Dict]: + """Extract metadata from content. + + Args: + content: The raw content to extract metadata from + file_path: Optional file path for context + + Returns: + Tuple of (clean_content, metadata_dict) + """ + + @abstractmethod + def supports_file_type(self, file_extension: str) -> bool: + """Check if this extractor supports the given file type. + + Args: + file_extension: File extension (e.g., '.md', '.pdf') + + Returns: + True if this extractor can handle the file type + """ diff --git a/src/sqlite_rag/extractors/frontmatter.py b/src/sqlite_rag/extractors/frontmatter.py new file mode 100644 index 0000000..221792d --- /dev/null +++ b/src/sqlite_rag/extractors/frontmatter.py @@ -0,0 +1,27 @@ +from pathlib import Path +from typing import Dict, Optional, Tuple + +import frontmatter + +from sqlite_rag.extractors.base import MetadataExtractor + + +class FrontmatterExtractor(MetadataExtractor): + """Extracts frontmatter from markdown files.""" + + def extract( + self, content: str, file_path: Optional[Path] = None + ) -> Tuple[str, Dict]: + """Extract frontmatter from markdown content.""" + try: + post = frontmatter.loads(content) + clean_content = post.content + metadata = dict(post.metadata) + return clean_content, metadata + except Exception: + # If frontmatter parsing fails, return original content + return content, {} + + def supports_file_type(self, file_extension: str) -> bool: + """Support markdown files.""" + return file_extension.lower() in [".md", ".mdx", ".txt"] diff --git a/src/sqlite_rag/reader.py b/src/sqlite_rag/reader.py index 3255516..4633a96 100644 --- a/src/sqlite_rag/reader.py +++ b/src/sqlite_rag/reader.py @@ -47,6 +47,7 @@ def is_supported(path: Path) -> bool: @staticmethod def parse_file(path: Path, max_document_size_bytes: Optional[int] = None) -> str: + """Read the file and convert into Markdown text.""" try: converter = MarkItDown() text = converter.convert( diff --git a/src/sqlite_rag/sqliterag.py b/src/sqlite_rag/sqliterag.py index dfebbeb..84927d6 100644 --- a/src/sqlite_rag/sqliterag.py +++ b/src/sqlite_rag/sqliterag.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import Any, Optional +from sqlite_rag.extractor import Extractor from sqlite_rag.logger import Logger from sqlite_rag.models.document_result import DocumentResult @@ -25,6 +26,7 @@ def __init__(self, connection: sqlite3.Connection, settings: Settings): self._repository = Repository(self._conn, settings) self._chunker = Chunker(self._conn, settings) self._engine = Engine(self._conn, settings, chunker=self._chunker) + self._extractor = Extractor() self.ready = False @@ -103,7 +105,16 @@ def add( if use_relative_paths else str(file_path.absolute()) ) - document = Document(content=content, uri=uri, metadata=metadata.copy()) + + content, file_metadata = self._extractor.extract_metadata( + content, file_path + ) + + merged_metadata = metadata.copy() + if file_metadata: + merged_metadata["extracted"] = file_metadata + + document = Document(content=content, uri=uri, metadata=merged_metadata) exists = self._repository.document_exists_by_hash(document.hash()) if exists: @@ -187,7 +198,21 @@ def rebuild(self, remove_missing: bool = False) -> dict: content = FileReader.parse_file( Path(doc.uri), self._settings.max_document_size_bytes ) + + if not content: + self._logger.warning( + f"{i+1}/{total_docs} Skipping empty file: {doc.uri}" + ) + not_found += 1 + continue + + content, file_metadata = self._extractor.extract_metadata( + content, Path(doc.uri) + ) + doc.content = content + if file_metadata: + doc.metadata["extracted"] = file_metadata self._repository.remove_document(doc_id) processed_doc = self._engine.process(doc) diff --git a/tests/extractors/test_frontmatter.py b/tests/extractors/test_frontmatter.py new file mode 100644 index 0000000..32e55f6 --- /dev/null +++ b/tests/extractors/test_frontmatter.py @@ -0,0 +1,52 @@ +from sqlite_rag.extractors.frontmatter import FrontmatterExtractor + + +class TestFrontmatterExtractor: + def test_extract_with_frontmatter(self): + content = """--- +title: Test Document +author: John Doe +--- +# Heading 1 +This is a test document. +""" + extractor = FrontmatterExtractor() + clean_content, metadata = extractor.extract(content) + assert "title" in metadata + assert metadata["title"] == "Test Document" + assert "author" in metadata + assert metadata["author"] == "John Doe" + assert "# Heading 1" in clean_content + assert "This is a test document." in clean_content + + def test_extract_without_frontmatter(self): + content = """# Heading 1 +This is a test document without frontmatter. +""" + extractor = FrontmatterExtractor() + clean_content, metadata = extractor.extract(content) + assert metadata == {} + assert "# Heading 1" in clean_content + assert "This is a test document without frontmatter." in clean_content + + def test_supports_file_type(self): + extractor = FrontmatterExtractor() + assert extractor.supports_file_type(".md") + assert extractor.supports_file_type(".MDX") + assert extractor.supports_file_type(".txt") + assert not extractor.supports_file_type(".pdf") + assert not extractor.supports_file_type(".html") + + def test_extract_malformed_frontmatter(self): + content = """--- +title: Test Document +author John Doe +--- +# Heading 1 +""" + extractor = FrontmatterExtractor() + clean_content, metadata = extractor.extract(content) + # Should return original content and empty metadata on failure + assert metadata == {} + assert content == clean_content + assert "# Heading 1" in clean_content diff --git a/tests/test_extractor.py b/tests/test_extractor.py new file mode 100644 index 0000000..1d3c116 --- /dev/null +++ b/tests/test_extractor.py @@ -0,0 +1,40 @@ +from pathlib import Path + +from sqlite_rag.extractor import Extractor + + +class TestExtractor: + def test_extract_metadata_from_md(self): + extractor = Extractor() + content = """--- +title: Sample Document +author: Test Author +--- +# Heading 1 +This is a sample markdown document. +""" + file_path = Path("sample.md") + clean_content, metadata = extractor.extract_metadata(content, file_path) + assert "title" in metadata + assert metadata["title"] == "Sample Document" + assert "author" in metadata + assert metadata["author"] == "Test Author" + assert "# Heading 1" in clean_content + assert "This is a sample markdown document." in clean_content + + def test_no_extractor_for_unsupported_file(self): + extractor = Extractor() + content = "This is HTML content." + file_path = Path("sample.html") + clean_content, metadata = extractor.extract_metadata(content, file_path) + assert clean_content == content + assert metadata == {} + + def test_get_extractor(self): + extractor = Extractor() + md_extractor = extractor.get_extractor(".md") + assert md_extractor is not None + assert md_extractor.supports_file_type(".md") + + html_extractor = extractor.get_extractor(".html") + assert html_extractor is None diff --git a/tests/test_sqlite_rag.py b/tests/test_sqlite_rag.py index f9fcac4..db5e556 100644 --- a/tests/test_sqlite_rag.py +++ b/tests/test_sqlite_rag.py @@ -9,7 +9,7 @@ from sqlite_rag.settings import Settings -class TestSQLiteRag: +class TestSQLiteRagAdd: def test_add_simple_text_file(self): # test file with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f: @@ -270,6 +270,40 @@ def test_add_text_with_metadata(self): } ) + def test_add_markdown_with_frontmatter(self): + with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: + f.write( + """--- +title: Sample Document +author: Test Author +--- +# Heading 1 +This is a sample markdown document. +""" + ) + temp_file_path = f.name + + rag = SQLiteRag.create(":memory:") + + rag.add(temp_file_path) + + conn = rag._conn + cursor = conn.execute("SELECT content, metadata FROM documents") + doc = cursor.fetchone() + + assert doc + assert "# Heading 1" in doc[0] + assert "This is a sample markdown document." in doc[0] + + metadata = json.loads(doc[1]) + assert "extracted" in metadata + assert "title" in metadata["extracted"] + assert metadata["extracted"]["title"] == "Sample Document" + assert "author" in metadata["extracted"] + assert metadata["extracted"]["author"] == "Test Author" + + +class TestSQLiteRag: def test_list_documents(self): rag = SQLiteRag.create(":memory:") @@ -545,6 +579,48 @@ def test_rebuild_text_documents(self): documents = rag.list_documents() assert len(documents) == 1 + def test_rebuild_with_md_frontmatter(self): + """Test rebuild with markdown file that have frontmatter""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f1: + f1.write( + """--- +title: Document 1 +author: Author 1 +--- +# Heading 1 +Content of document 1. +""" + ) + file1_path = f1.name + with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f2: + f2.write( + """# Heading 2 + + Content of document 2. + """ + ) + file2_path = f2.name + + rag = SQLiteRag.create(":memory:") + rag.add(file1_path) + rag.add(file2_path) + + result = rag.rebuild() + + assert result["total"] == 2 + assert result["reprocessed"] == 2 + assert result["not_found"] == 0 + assert result["removed"] == 0 + + documents = rag.list_documents() + assert len(documents) == 2 + + titles = [ + doc.metadata.get("extracted", {}).get("title", "") for doc in documents + ] + assert "Document 1" in titles + assert "Document 2" not in titles # No frontmatter title + def test_reset_database(self): temp_file_path = os.path.join(tempfile.mkdtemp(), "something") @@ -621,7 +697,7 @@ def test_search_samples_exact_match_by_scan_type(self, quantize_scan: bool): sample_files = list(samples_dir.glob("*.txt")) for sample_file in sample_files: - file_content = sample_file.read_text(encoding="utf-8") + file_content = sample_file.read_text(encoding="utf-8").rstrip("\n") # Search for the exact content results = rag.search(file_content, top_k=2)