From e2dca4493b949f30415689f6672238caaf9435b6 Mon Sep 17 00:00:00 2001 From: Ray Tien Date: Tue, 20 Jan 2026 10:14:39 +0800 Subject: [PATCH] Add native TOC optimization to skip LLM calls. --- pageindex/__init__.py | 8 ++- pageindex/page_index.py | 44 +++++++++--- pageindex/utils.py | 132 +++++++++++++++++++++++++++++++++++ requirements-dev.txt | 2 + tests/test_native_toc.py | 147 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 321 insertions(+), 12 deletions(-) create mode 100644 requirements-dev.txt create mode 100644 tests/test_native_toc.py diff --git a/pageindex/__init__.py b/pageindex/__init__.py index 4606eb396..e72e0da40 100644 --- a/pageindex/__init__.py +++ b/pageindex/__init__.py @@ -1,2 +1,8 @@ from .page_index import * -from .page_index_md import md_to_tree \ No newline at end of file +from .page_index_md import md_to_tree +from .utils import ( + try_native_toc, + extract_pdf_native_toc, + validate_native_toc_quality, + convert_toc_levels_to_structure, +) \ No newline at end of file diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 882fb5dea..6090c3a43 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -1019,39 +1019,61 @@ async def process_large_node_recursively(node, page_list, opt=None, logger=None) return node async def tree_parser(page_list, opt, doc=None, logger=None): + # === Fast path: try native TOC first === + if isinstance(doc, str): + native_toc = try_native_toc(doc, len(page_list)) + if native_toc: + logger.info({'source': 'native_toc', 'items': len(native_toc)}) + print(f'Using native TOC with {len(native_toc)} items') + + # Add preface if needed and process + native_toc = add_preface_if_needed(native_toc) + valid_toc_items = [item for item in native_toc if item.get('physical_index') is not None] + toc_tree = post_processing(valid_toc_items, len(page_list)) + + # Still process large nodes if needed + tasks = [ + process_large_node_recursively(node, page_list, opt, logger=logger) + for node in toc_tree + ] + await asyncio.gather(*tasks) + + return toc_tree + + # === Original flow (fallback) === check_toc_result = check_toc(page_list, opt) logger.info(check_toc_result) if check_toc_result.get("toc_content") and check_toc_result["toc_content"].strip() and check_toc_result["page_index_given_in_toc"] == "yes": toc_with_page_number = await meta_processor( - page_list, - mode='process_toc_with_page_numbers', - start_index=1, - toc_content=check_toc_result['toc_content'], - toc_page_list=check_toc_result['toc_page_list'], + page_list, + mode='process_toc_with_page_numbers', + start_index=1, + toc_content=check_toc_result['toc_content'], + toc_page_list=check_toc_result['toc_page_list'], opt=opt, logger=logger) else: toc_with_page_number = await meta_processor( - page_list, - mode='process_no_toc', - start_index=1, + page_list, + mode='process_no_toc', + start_index=1, opt=opt, logger=logger) toc_with_page_number = add_preface_if_needed(toc_with_page_number) toc_with_page_number = await check_title_appearance_in_start_concurrent(toc_with_page_number, page_list, model=opt.model, logger=logger) - + # Filter out items with None physical_index before post_processings valid_toc_items = [item for item in toc_with_page_number if item.get('physical_index') is not None] - + toc_tree = post_processing(valid_toc_items, len(page_list)) tasks = [ process_large_node_recursively(node, page_list, opt, logger=logger) for node in toc_tree ] await asyncio.gather(*tasks) - + return toc_tree diff --git a/pageindex/utils.py b/pageindex/utils.py index dc7acd888..7f85ef3da 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -678,6 +678,138 @@ def format_structure(structure, order=None): return structure +def extract_pdf_native_toc(pdf_path): + """ + Extract native TOC/bookmarks from PDF if available. + + Args: + pdf_path: Path to PDF file (string) + + Returns: + List of tuples [(level, title, page), ...] or None if extraction fails + """ + try: + if not isinstance(pdf_path, str): + return None + if not os.path.isfile(pdf_path): + return None + + doc = pymupdf.open(pdf_path) + toc = doc.get_toc() + doc.close() + + if not toc: + return None + + return toc + except Exception: + return None + + +def validate_native_toc_quality(toc, total_pages): + """ + Validate that native TOC is high quality and usable. + + Conservative validation - only returns True if TOC is clearly reliable. + + Args: + toc: List of tuples [(level, title, page), ...] + total_pages: Total number of pages in the document + + Returns: + True if TOC passes all quality checks, False otherwise + """ + if not toc: + return False + + # Must have at least 5 items to be considered complete + if len(toc) < 5: + return False + + # All titles must be non-empty + for level, title, page in toc: + if not title or not title.strip(): + return False + + # All pages must be within document range + for level, title, page in toc: + if page < 1 or page > total_pages: + return False + + # Check for too many duplicate titles (suggests auto-generated junk) + titles = [title for _, title, _ in toc] + unique_titles = set(titles) + if len(unique_titles) < len(titles) * 0.8: + return False + + return True + + +def convert_toc_levels_to_structure(toc): + """ + Convert pymupdf TOC format to PageIndex structure format. + + Transforms [(level, title, page), ...] into the format expected by + post_processing and list_to_tree functions. + + Args: + toc: List of tuples [(level, title, page), ...] + + Returns: + List of dicts [{"structure": "1.2.3", "title": "...", "physical_index": N}, ...] + """ + result = [] + counters = {} # {level: current_count} + + for level, title, page in toc: + # Reset counters for deeper levels when we go back up + for l in list(counters.keys()): + if l > level: + del counters[l] + + # Increment counter for this level + counters[level] = counters.get(level, 0) + 1 + + # Build structure string: "1.2.3" + structure_parts = [] + for l in range(1, level + 1): + structure_parts.append(str(counters.get(l, 1))) + structure = '.'.join(structure_parts) + + result.append({ + 'structure': structure, + 'title': title, + 'physical_index': page + }) + + return result + + +def try_native_toc(pdf_path, total_pages): + """ + Attempt to extract and validate native TOC from PDF. + + Only returns a result if the native TOC exists AND passes quality validation. + This is the main entry point for the native TOC optimization. + + Args: + pdf_path: Path to PDF file (string) + total_pages: Total number of pages in the document + + Returns: + List of dicts in PageIndex format if successful, None otherwise + """ + toc = extract_pdf_native_toc(pdf_path) + + if toc is None: + return None + + if not validate_native_toc_quality(toc, total_pages): + return None + + return convert_toc_levels_to_structure(toc) + + class ConfigLoader: def __init__(self, default_path: str = None): if default_path is None: diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..7330f8b31 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,2 @@ +pytest==8.0.0 +pytest-asyncio==0.23.0 diff --git a/tests/test_native_toc.py b/tests/test_native_toc.py new file mode 100644 index 000000000..f53b237c7 --- /dev/null +++ b/tests/test_native_toc.py @@ -0,0 +1,147 @@ +import pytest +import os +import sys + +# Add parent directory to path for imports +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from pageindex.utils import extract_pdf_native_toc, validate_native_toc_quality, convert_toc_levels_to_structure, try_native_toc + + +class TestExtractPdfNativeToc: + """Tests for extracting native TOC from PDF bookmarks.""" + + def test_returns_none_for_nonexistent_file(self): + result = extract_pdf_native_toc("/nonexistent/path.pdf") + assert result is None + + def test_returns_none_for_invalid_file(self, tmp_path): + fake_pdf = tmp_path / "fake.pdf" + fake_pdf.write_text("not a pdf") + result = extract_pdf_native_toc(str(fake_pdf)) + assert result is None + + +class TestValidateNativeTocQuality: + """Tests for TOC quality validation.""" + + def test_rejects_empty_toc(self): + result = validate_native_toc_quality([], total_pages=100) + assert result is False + + def test_rejects_too_few_items(self): + toc = [(1, "Chapter 1", 1), (1, "Chapter 2", 10)] + result = validate_native_toc_quality(toc, total_pages=100) + assert result is False + + def test_rejects_empty_titles(self): + toc = [(1, "", 1), (1, "Chapter 2", 10), (1, "Chapter 3", 20), + (1, "Chapter 4", 30), (1, "Chapter 5", 40)] + result = validate_native_toc_quality(toc, total_pages=100) + assert result is False + + def test_rejects_out_of_range_pages(self): + toc = [(1, "Chapter 1", 1), (1, "Chapter 2", 10), (1, "Chapter 3", 200), + (1, "Chapter 4", 30), (1, "Chapter 5", 40)] + result = validate_native_toc_quality(toc, total_pages=100) + assert result is False + + def test_accepts_valid_toc(self): + toc = [(1, "Chapter 1", 1), (1, "Chapter 2", 10), (1, "Chapter 3", 20), + (1, "Chapter 4", 30), (1, "Chapter 5", 40)] + result = validate_native_toc_quality(toc, total_pages=100) + assert result is True + + +class TestConvertTocLevelsToStructure: + """Tests for converting pymupdf TOC format to PageIndex structure format.""" + + def test_single_level(self): + toc = [(1, "Chapter 1", 1), (1, "Chapter 2", 10), (1, "Chapter 3", 20)] + result = convert_toc_levels_to_structure(toc) + + assert len(result) == 3 + assert result[0] == {"structure": "1", "title": "Chapter 1", "physical_index": 1} + assert result[1] == {"structure": "2", "title": "Chapter 2", "physical_index": 10} + assert result[2] == {"structure": "3", "title": "Chapter 3", "physical_index": 20} + + def test_nested_levels(self): + toc = [ + (1, "Chapter 1", 1), + (2, "Section 1.1", 2), + (2, "Section 1.2", 5), + (1, "Chapter 2", 10), + (2, "Section 2.1", 11), + ] + result = convert_toc_levels_to_structure(toc) + + assert result[0] == {"structure": "1", "title": "Chapter 1", "physical_index": 1} + assert result[1] == {"structure": "1.1", "title": "Section 1.1", "physical_index": 2} + assert result[2] == {"structure": "1.2", "title": "Section 1.2", "physical_index": 5} + assert result[3] == {"structure": "2", "title": "Chapter 2", "physical_index": 10} + assert result[4] == {"structure": "2.1", "title": "Section 2.1", "physical_index": 11} + + def test_three_levels_deep(self): + toc = [ + (1, "Part 1", 1), + (2, "Chapter 1", 2), + (3, "Section 1.1.1", 3), + ] + result = convert_toc_levels_to_structure(toc) + + assert result[0] == {"structure": "1", "title": "Part 1", "physical_index": 1} + assert result[1] == {"structure": "1.1", "title": "Chapter 1", "physical_index": 2} + assert result[2] == {"structure": "1.1.1", "title": "Section 1.1.1", "physical_index": 3} + + +class TestTryNativeToc: + """Tests for the combined try_native_toc function.""" + + def test_returns_none_for_nonexistent_file(self): + result = try_native_toc("/nonexistent/path.pdf", total_pages=100) + assert result is None + + def test_returns_none_for_invalid_file(self, tmp_path): + fake_pdf = tmp_path / "fake.pdf" + fake_pdf.write_text("not a pdf") + result = try_native_toc(str(fake_pdf), total_pages=100) + assert result is None + + +class TestIntegrationWithRealPdf: + """Integration tests using actual test PDFs.""" + + @pytest.fixture + def test_pdf_path(self): + """Path to a test PDF that should have native TOC.""" + return "tests/pdfs/2023-annual-report.pdf" + + def test_extract_from_real_pdf(self, test_pdf_path): + """Test extraction from a real PDF file.""" + if not os.path.exists(test_pdf_path): + pytest.skip(f"Test PDF not found: {test_pdf_path}") + + result = extract_pdf_native_toc(test_pdf_path) + # Result can be None (no TOC) or a list - both are valid + assert result is None or isinstance(result, list) + + def test_try_native_toc_with_real_pdf(self, test_pdf_path): + """Test full pipeline with a real PDF.""" + if not os.path.exists(test_pdf_path): + pytest.skip(f"Test PDF not found: {test_pdf_path}") + + import pymupdf + doc = pymupdf.open(test_pdf_path) + total_pages = len(doc) + doc.close() + + result = try_native_toc(test_pdf_path, total_pages) + # Result can be None (no/bad TOC) or properly formatted list + if result is not None: + assert isinstance(result, list) + assert len(result) > 0 + # Check structure format + for item in result: + assert 'structure' in item + assert 'title' in item + assert 'physical_index' in item