From e2dca4493b949f30415689f6672238caaf9435b6 Mon Sep 17 00:00:00 2001
From: Ray Tien <ray.tien0907@gmail.com>
Date: Tue, 20 Jan 2026 10:14:39 +0800
Subject: [PATCH] Add native TOC optimization to skip LLM calls.

---
 pageindex/__init__.py    |   8 ++-
 pageindex/page_index.py  |  44 +++++++++---
 pageindex/utils.py       | 132 +++++++++++++++++++++++++++++++++++
 requirements-dev.txt     |   2 +
 tests/test_native_toc.py | 147 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 321 insertions(+), 12 deletions(-)
 create mode 100644 requirements-dev.txt
 create mode 100644 tests/test_native_toc.py

diff --git a/pageindex/__init__.py b/pageindex/__init__.py
index 4606eb396..e72e0da40 100644
--- a/pageindex/__init__.py
+++ b/pageindex/__init__.py
@@ -1,2 +1,8 @@
 from .page_index import *
-from .page_index_md import md_to_tree
\ No newline at end of file
+from .page_index_md import md_to_tree
+from .utils import (
+    try_native_toc,
+    extract_pdf_native_toc,
+    validate_native_toc_quality,
+    convert_toc_levels_to_structure,
+)
\ No newline at end of file
diff --git a/pageindex/page_index.py b/pageindex/page_index.py
index 882fb5dea..6090c3a43 100644
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@@ -1019,39 +1019,61 @@ async def process_large_node_recursively(node, page_list, opt=None, logger=None)
     return node
 
 async def tree_parser(page_list, opt, doc=None, logger=None):
+    # === Fast path: when doc is a str path, try the PDF's native TOC before check_toc/meta_processor ===
+    if isinstance(doc, str):
+        native_toc = try_native_toc(doc, len(page_list))
+        if native_toc:
+            logger.info({'source': 'native_toc', 'items': len(native_toc)})
+            print(f'Using native TOC with {len(native_toc)} items')
+
+            # Add preface if needed, drop items lacking a physical_index, then build the tree
+            native_toc = add_preface_if_needed(native_toc)
+            valid_toc_items = [item for item in native_toc if item.get('physical_index') is not None]
+            toc_tree = post_processing(valid_toc_items, len(page_list))
+
+            # Large nodes still go through process_large_node_recursively, as in the fallback flow
+            tasks = [
+                process_large_node_recursively(node, page_list, opt, logger=logger)
+                for node in toc_tree
+            ]
+            await asyncio.gather(*tasks)
+
+            return toc_tree
+
+    # === Original flow (fallback when doc is not a str or try_native_toc returned nothing usable) ===
     check_toc_result = check_toc(page_list, opt)
     logger.info(check_toc_result)
 
     if check_toc_result.get("toc_content") and check_toc_result["toc_content"].strip() and check_toc_result["page_index_given_in_toc"] == "yes":
         toc_with_page_number = await meta_processor(
-            page_list, 
-            mode='process_toc_with_page_numbers', 
-            start_index=1, 
-            toc_content=check_toc_result['toc_content'], 
-            toc_page_list=check_toc_result['toc_page_list'], 
+            page_list,
+            mode='process_toc_with_page_numbers',
+            start_index=1,
+            toc_content=check_toc_result['toc_content'],
+            toc_page_list=check_toc_result['toc_page_list'],
             opt=opt,
             logger=logger)
     else:
         toc_with_page_number = await meta_processor(
-            page_list, 
-            mode='process_no_toc', 
-            start_index=1, 
+            page_list,
+            mode='process_no_toc',
+            start_index=1,
             opt=opt,
             logger=logger)
 
     toc_with_page_number = add_preface_if_needed(toc_with_page_number)
     toc_with_page_number = await check_title_appearance_in_start_concurrent(toc_with_page_number, page_list, model=opt.model, logger=logger)
-    
+
     # Filter out items with None physical_index before post_processings
     valid_toc_items = [item for item in toc_with_page_number if item.get('physical_index') is not None]
-    
+
     toc_tree = post_processing(valid_toc_items, len(page_list))
     tasks = [
         process_large_node_recursively(node, page_list, opt, logger=logger)
         for node in toc_tree
     ]
     await asyncio.gather(*tasks)
-    
+
     return toc_tree
 
 
diff --git a/pageindex/utils.py b/pageindex/utils.py
index dc7acd888..7f85ef3da 100644
--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@@ -678,6 +678,138 @@ def format_structure(structure, order=None):
     return structure
 
 
+def extract_pdf_native_toc(pdf_path):
+    """
+    Extract native TOC/bookmarks from PDF if available.
+
+    Args:
+        pdf_path: Path to PDF file (string)
+
+    Returns:
+        The non-empty list of (level, title, page) entries returned by
+        pymupdf's get_toc(), or None if the path is not an existing file,
+        the PDF has no bookmarks, or opening/reading it fails for any reason
+    """
+    try:
+        if not isinstance(pdf_path, str):
+            return None
+        if not os.path.isfile(pdf_path):
+            return None
+
+        # Context manager closes the document even if get_toc() raises,
+        # so a failed extraction does not leak the open file handle.
+        with pymupdf.open(pdf_path) as doc:
+            toc = doc.get_toc()
+
+        return toc or None
+    except Exception:
+        return None
+
+
+def validate_native_toc_quality(toc, total_pages):
+    """
+    Decide whether a native TOC is trustworthy enough to use directly.
+
+    The checks are deliberately strict: any single failure rejects the TOC.
+
+    Args:
+        toc: List of tuples [(level, title, page), ...]
+        total_pages: Total number of pages in the document
+
+    Returns:
+        True only when every check passes:
+          - the TOC has at least 5 entries
+          - every title contains non-whitespace text
+          - every page lies in 1..total_pages
+          - at least 80% of the titles are distinct
+        False otherwise
+    """
+    # Empty/None TOCs and very short ones are treated as incomplete.
+    if not toc or len(toc) < 5:
+        return False
+
+    # Titles are checked for every entry before any page is looked at.
+    has_blank_title = any(
+        not title or not title.strip() for _, title, _ in toc
+    )
+    if has_blank_title:
+        return False
+
+    has_bad_page = any(page < 1 or page > total_pages for _, _, page in toc)
+    if has_bad_page:
+        return False
+
+    # Many repeated titles suggests auto-generated junk bookmarks.
+    all_titles = [title for _, title, _ in toc]
+    distinct_titles = set(all_titles)
+    return len(distinct_titles) >= len(all_titles) * 0.8
+
+
+def convert_toc_levels_to_structure(toc):
+    """
+    Turn a pymupdf-style TOC into PageIndex's flat structure list.
+
+    Each (level, title, page) entry gets a dotted outline number such as
+    "1.2.3", built from per-level running counters. The resulting list is
+    what tree_parser feeds to add_preface_if_needed and post_processing.
+
+    Args:
+        toc: List of tuples [(level, title, page), ...]
+
+    Returns:
+        List of dicts [{"structure": "1.2.3", "title": "...", "physical_index": N}, ...]
+    """
+    entries = []
+    level_counts = {}  # level -> number of entries seen so far at that level
+
+    for depth, heading, page_no in toc:
+        # Moving back up the outline discards counters of deeper levels,
+        # so numbering under the next parent restarts at 1.
+        level_counts = {
+            lvl: count for lvl, count in level_counts.items() if lvl <= depth
+        }
+        # Count this entry at its own level.
+        level_counts[depth] = level_counts.get(depth, 0) + 1
+
+        # A shallower level with no counter yet (skipped level) is numbered 1.
+        numbering = '.'.join(
+            str(level_counts.get(lvl, 1)) for lvl in range(1, depth + 1)
+        )
+
+        entries.append({
+            'structure': numbering,
+            'title': heading,
+            'physical_index': page_no,
+        })
+
+    return entries
+
+
+def try_native_toc(pdf_path, total_pages):
+    """
+    Run the native-TOC pipeline: extract, validate, then convert.
+
+    Entry point of the native TOC optimization. A result is produced only
+    when the PDF has a native TOC and that TOC passes quality validation.
+
+    Args:
+        pdf_path: Path to PDF file (string)
+        total_pages: Total number of pages in the document
+
+    Returns:
+        List of dicts in PageIndex format if successful, None otherwise
+    """
+    native_entries = extract_pdf_native_toc(pdf_path)
+
+    # Extraction failure and a rejected TOC both yield None.
+    usable = (
+        native_entries is not None
+        and validate_native_toc_quality(native_entries, total_pages)
+    )
+
+    return convert_toc_levels_to_structure(native_entries) if usable else None
+
+
 class ConfigLoader:
     def __init__(self, default_path: str = None):
         if default_path is None:
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 000000000..7330f8b31
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,2 @@
+pytest==8.0.0
+pytest-asyncio==0.23.0
diff --git a/tests/test_native_toc.py b/tests/test_native_toc.py
new file mode 100644
index 000000000..f53b237c7
--- /dev/null
+++ b/tests/test_native_toc.py
@@ -0,0 +1,147 @@
+import pytest
+import os
+import sys
+
+# Add parent directory to path for imports
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from pageindex.utils import extract_pdf_native_toc, validate_native_toc_quality, convert_toc_levels_to_structure, try_native_toc
+
+
+class TestExtractPdfNativeToc:
+    """extract_pdf_native_toc returns None when no TOC can be read."""
+
+    def test_returns_none_for_nonexistent_file(self):
+        # A path that does not exist on disk must yield None.
+        assert extract_pdf_native_toc("/nonexistent/path.pdf") is None
+
+    def test_returns_none_for_invalid_file(self, tmp_path):
+        # The file exists but its content is plain text, not a PDF.
+        bogus = tmp_path / "fake.pdf"
+        bogus.write_text("not a pdf")
+        assert extract_pdf_native_toc(str(bogus)) is None
+
+
+class TestValidateNativeTocQuality:
+    """validate_native_toc_quality accepts only clearly reliable TOCs."""
+
+    def test_rejects_empty_toc(self):
+        # An empty list is rejected outright.
+        assert validate_native_toc_quality([], total_pages=100) is False
+
+    def test_rejects_too_few_items(self):
+        # Two entries is below the five-entry minimum.
+        short_toc = [(1, "Chapter 1", 1), (1, "Chapter 2", 10)]
+        assert validate_native_toc_quality(short_toc, total_pages=100) is False
+
+    def test_rejects_empty_titles(self):
+        # The first entry has an empty title.
+        entries = [(1, "", 1), (1, "Chapter 2", 10), (1, "Chapter 3", 20),
+                   (1, "Chapter 4", 30), (1, "Chapter 5", 40)]
+        assert validate_native_toc_quality(entries, total_pages=100) is False
+
+    def test_rejects_out_of_range_pages(self):
+        # Page 200 lies beyond the 100-page document.
+        entries = [(1, "Chapter 1", 1), (1, "Chapter 2", 10), (1, "Chapter 3", 200),
+                   (1, "Chapter 4", 30), (1, "Chapter 5", 40)]
+        assert validate_native_toc_quality(entries, total_pages=100) is False
+
+    def test_accepts_valid_toc(self):
+        # Five distinct titles, every page within range.
+        entries = [(1, "Chapter 1", 1), (1, "Chapter 2", 10), (1, "Chapter 3", 20),
+                   (1, "Chapter 4", 30), (1, "Chapter 5", 40)]
+        assert validate_native_toc_quality(entries, total_pages=100) is True
+
+
+class TestConvertTocLevelsToStructure:
+    """convert_toc_levels_to_structure numbers entries by outline level."""
+
+    def test_single_level(self):
+        flat = [(1, "Chapter 1", 1), (1, "Chapter 2", 10), (1, "Chapter 3", 20)]
+
+        # Top-level entries are numbered sequentially: "1", "2", "3".
+        assert convert_toc_levels_to_structure(flat) == [
+            {"structure": "1", "title": "Chapter 1", "physical_index": 1},
+            {"structure": "2", "title": "Chapter 2", "physical_index": 10},
+            {"structure": "3", "title": "Chapter 3", "physical_index": 20},
+        ]
+
+    def test_nested_levels(self):
+        nested = [
+            (1, "Chapter 1", 1),
+            (2, "Section 1.1", 2),
+            (2, "Section 1.2", 5),
+            (1, "Chapter 2", 10),
+            (2, "Section 2.1", 11),
+        ]
+
+        # Whole-list comparison also catches missing or extra entries.
+        assert convert_toc_levels_to_structure(nested) == [
+            {"structure": "1", "title": "Chapter 1", "physical_index": 1},
+            {"structure": "1.1", "title": "Section 1.1", "physical_index": 2},
+            {"structure": "1.2", "title": "Section 1.2", "physical_index": 5},
+            {"structure": "2", "title": "Chapter 2", "physical_index": 10},
+            {"structure": "2.1", "title": "Section 2.1", "physical_index": 11},
+        ]
+
+    def test_three_levels_deep(self):
+        deep = [(1, "Part 1", 1), (2, "Chapter 1", 2), (3, "Section 1.1.1", 3)]
+
+        assert convert_toc_levels_to_structure(deep) == [
+            {"structure": "1", "title": "Part 1", "physical_index": 1},
+            {"structure": "1.1", "title": "Chapter 1", "physical_index": 2},
+            {"structure": "1.1.1", "title": "Section 1.1.1", "physical_index": 3},
+        ]
+
+
+class TestTryNativeToc:
+    """try_native_toc returns None when extraction cannot succeed."""
+
+    def test_returns_none_for_nonexistent_file(self):
+        # Missing file: nothing can be extracted.
+        assert try_native_toc("/nonexistent/path.pdf", total_pages=100) is None
+
+    def test_returns_none_for_invalid_file(self, tmp_path):
+        # Existing file whose content is not a PDF.
+        bogus = tmp_path / "fake.pdf"
+        bogus.write_text("not a pdf")
+        assert try_native_toc(str(bogus), total_pages=100) is None
+
+
+class TestIntegrationWithRealPdf:
+    """Integration tests using actual test PDFs."""
+
+    @pytest.fixture
+    def test_pdf_path(self):
+        """Path to a test PDF that should have native TOC."""
+        # Anchor to this file so the tests do not depend on the working directory.
+        tests_dir = os.path.dirname(os.path.abspath(__file__))
+        return os.path.join(tests_dir, "pdfs", "2023-annual-report.pdf")
+
+    def test_extract_from_real_pdf(self, test_pdf_path):
+        """Test extraction from a real PDF file."""
+        if not os.path.exists(test_pdf_path):
+            pytest.skip(f"Test PDF not found: {test_pdf_path}")
+
+        result = extract_pdf_native_toc(test_pdf_path)
+        # Result can be None (no TOC) or a list - both are valid
+        assert result is None or isinstance(result, list)
+
+    def test_try_native_toc_with_real_pdf(self, test_pdf_path):
+        """Test full pipeline with a real PDF."""
+        if not os.path.exists(test_pdf_path):
+            pytest.skip(f"Test PDF not found: {test_pdf_path}")
+
+        # Context manager closes the document even if counting pages fails.
+        import pymupdf
+        with pymupdf.open(test_pdf_path) as doc:
+            total_pages = len(doc)
+
+        result = try_native_toc(test_pdf_path, total_pages)
+        # Result can be None (no/bad TOC) or properly formatted list
+        if result is not None:
+            assert isinstance(result, list)
+            assert len(result) > 0
+            # Check structure format
+            for item in result:
+                assert {'structure', 'title', 'physical_index'} <= item.keys()