Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion pageindex/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
from .page_index import *
from .page_index_md import md_to_tree
from .page_index_md import md_to_tree
from .utils import (
try_native_toc,
extract_pdf_native_toc,
validate_native_toc_quality,
convert_toc_levels_to_structure,
)
44 changes: 33 additions & 11 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1019,39 +1019,61 @@ async def process_large_node_recursively(node, page_list, opt=None, logger=None)
return node

async def _finalize_toc_tree(toc_items, page_list, opt, logger=None):
    """Turn a flat list of TOC items into the final tree.

    Entries whose ``physical_index`` is missing or ``None`` are dropped, the
    remainder is handed to ``post_processing`` together with the page count,
    and ``process_large_node_recursively`` is then awaited (via
    ``asyncio.gather``) for every node obtained by iterating the tree.

    Args:
        toc_items: List of dicts describing TOC entries.
        page_list: Page collection of the document; only its length is used here.
        opt: Options object forwarded to ``process_large_node_recursively``.
        logger: Logger forwarded to ``process_large_node_recursively``.

    Returns:
        The tree returned by ``post_processing``.
    """
    # Filter out items with None physical_index before post_processing
    valid_toc_items = [item for item in toc_items if item.get('physical_index') is not None]

    toc_tree = post_processing(valid_toc_items, len(page_list))
    tasks = [
        process_large_node_recursively(node, page_list, opt, logger=logger)
        for node in toc_tree
    ]
    await asyncio.gather(*tasks)

    return toc_tree


async def tree_parser(page_list, opt, doc=None, logger=None):
    """Build the document tree, preferring the PDF's native TOC when usable.

    Args:
        page_list: Page collection of the document; its length is used as the
            total page count.
        opt: Options object; ``opt.model`` is read in the fallback flow and the
            object itself is forwarded to the helpers.
        doc: When this is a ``str`` it is passed to ``try_native_toc`` as the
            PDF path. Any other value skips the fast path.
        logger: Logger. ``logger.info`` is called unconditionally, so a usable
            logger must be supplied despite the ``None`` default.

    Returns:
        The TOC tree built from either the native TOC or the fallback flow.
    """
    # === Fast path: try native TOC first ===
    if isinstance(doc, str):
        native_toc = try_native_toc(doc, len(page_list))
        if native_toc:
            logger.info({'source': 'native_toc', 'items': len(native_toc)})
            print(f'Using native TOC with {len(native_toc)} items')

            # Add preface if needed; large nodes are still processed afterwards
            native_toc = add_preface_if_needed(native_toc)
            return await _finalize_toc_tree(native_toc, page_list, opt, logger=logger)

    # === Original flow (fallback) ===
    check_toc_result = check_toc(page_list, opt)
    logger.info(check_toc_result)

    toc_content = check_toc_result.get("toc_content")
    if toc_content and toc_content.strip() and check_toc_result["page_index_given_in_toc"] == "yes":
        toc_with_page_number = await meta_processor(
            page_list,
            mode='process_toc_with_page_numbers',
            start_index=1,
            toc_content=check_toc_result['toc_content'],
            toc_page_list=check_toc_result['toc_page_list'],
            opt=opt,
            logger=logger)
    else:
        toc_with_page_number = await meta_processor(
            page_list,
            mode='process_no_toc',
            start_index=1,
            opt=opt,
            logger=logger)

    toc_with_page_number = add_preface_if_needed(toc_with_page_number)
    toc_with_page_number = await check_title_appearance_in_start_concurrent(
        toc_with_page_number, page_list, model=opt.model, logger=logger)

    return await _finalize_toc_tree(toc_with_page_number, page_list, opt, logger=logger)


Expand Down
132 changes: 132 additions & 0 deletions pageindex/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,138 @@ def format_structure(structure, order=None):
return structure


def extract_pdf_native_toc(pdf_path):
    """
    Extract native TOC/bookmarks from PDF if available.

    This is a best-effort helper: any failure while opening or reading the
    file is reported as "no TOC" rather than raised.

    Args:
        pdf_path: Path to PDF file (string)

    Returns:
        The non-empty result of ``doc.get_toc()`` (documented by pymupdf as a
        list of [level, title, page] entries), or None if the path is not a
        string, is not an existing file, the TOC is empty, or extraction fails
    """
    try:
        if not isinstance(pdf_path, str) or not os.path.isfile(pdf_path):
            return None

        # The context manager closes the document even if get_toc() raises,
        # so the file handle is never leaked on the error path.
        with pymupdf.open(pdf_path) as doc:
            toc = doc.get_toc()
    except Exception:
        # Deliberately broad: callers treat None as "fall back to the
        # non-native flow", so no extraction error may propagate from here.
        return None

    return toc or None


def validate_native_toc_quality(toc, total_pages):
    """
    Validate that native TOC is high quality and usable.

    Conservative validation - only returns True if TOC is clearly reliable.
    Malformed entries make the function return False instead of raising.

    Args:
        toc: Sequence of entries whose first three fields are
            (level, title, page); extra trailing fields are ignored
        total_pages: Total number of pages in the document

    Returns:
        True if TOC passes all quality checks, False otherwise
    """
    # Must exist and have at least 5 items to be considered complete
    if not toc or len(toc) < 5:
        return False

    titles = []
    for entry in toc:
        try:
            _level, title, page = entry[:3]
        except (TypeError, ValueError):
            # Not a sequence, or fewer than three fields
            return False

        # All titles must be non-empty strings
        if not isinstance(title, str) or not title.strip():
            return False

        # All pages must be within document range; a page that cannot be
        # compared with numbers (e.g. None) is treated as out of range
        try:
            in_range = 1 <= page <= total_pages
        except TypeError:
            return False
        if not in_range:
            return False

        titles.append(title)

    # Too many duplicate titles suggests auto-generated junk: require at
    # least 80% of the titles to be unique
    return len(set(titles)) >= len(titles) * 0.8


def convert_toc_levels_to_structure(toc):
    """
    Convert pymupdf TOC format to PageIndex structure format.

    Transforms [(level, title, page), ...] into a flat list of dicts with a
    dotted ``structure`` number derived from the nesting levels.

    Args:
        toc: Sequence of entries whose first three fields are
            (level, title, page); extra trailing fields are ignored

    Returns:
        List of dicts [{"structure": "1.2.3", "title": "...", "physical_index": N}, ...]
    """
    result = []
    counters = {}  # {level: current_count}

    for entry in toc:
        level, title, page = entry[:3]

        # Reset counters for deeper levels when we go back up
        for deeper in [lvl for lvl in counters if lvl > level]:
            del counters[deeper]

        # If the TOC skips levels (e.g. 1 -> 3, or starts at level 2), record
        # the implied ancestors as "1" so that a later real entry at that
        # level continues counting from there instead of reusing the number.
        for ancestor in range(1, level):
            counters.setdefault(ancestor, 1)

        # Increment counter for this level
        counters[level] = counters.get(level, 0) + 1

        # Build structure string: "1.2.3"
        structure = '.'.join(str(counters[lvl]) for lvl in range(1, level + 1))

        result.append({
            'structure': structure,
            'title': title,
            'physical_index': page
        })

    return result


def try_native_toc(pdf_path, total_pages):
    """
    Extract, validate and convert the PDF's native TOC in one step.

    Main entry point for the native TOC optimization: a result is produced
    only when a native TOC could be extracted AND it passes the quality
    validation.

    Args:
        pdf_path: Path to PDF file (string)
        total_pages: Total number of pages in the document

    Returns:
        List of dicts in PageIndex format if successful, None otherwise
    """
    raw_toc = extract_pdf_native_toc(pdf_path)

    # Validation is only attempted when extraction produced something.
    usable = raw_toc is not None and validate_native_toc_quality(raw_toc, total_pages)

    return convert_toc_levels_to_structure(raw_toc) if usable else None


class ConfigLoader:
def __init__(self, default_path: str = None):
if default_path is None:
Expand Down
2 changes: 2 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pytest==8.0.0
pytest-asyncio==0.23.0
147 changes: 147 additions & 0 deletions tests/test_native_toc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import pytest
import os
import sys

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from pageindex.utils import extract_pdf_native_toc, validate_native_toc_quality, convert_toc_levels_to_structure, try_native_toc


class TestExtractPdfNativeToc:
    """Native TOC extraction must return None for unusable inputs."""

    def test_returns_none_for_nonexistent_file(self):
        # The path does not exist on disk.
        assert extract_pdf_native_toc("/nonexistent/path.pdf") is None

    def test_returns_none_for_invalid_file(self, tmp_path):
        # The file exists but its content is not PDF data.
        bogus_pdf = tmp_path / "fake.pdf"
        bogus_pdf.write_text("not a pdf")

        assert extract_pdf_native_toc(str(bogus_pdf)) is None


class TestValidateNativeTocQuality:
    """Tests for TOC quality validation."""

    @staticmethod
    def _five_chapters():
        """Return a fresh, well-formed five-entry TOC that fits in 100 pages."""
        return [(1, "Chapter 1", 1), (1, "Chapter 2", 10), (1, "Chapter 3", 20),
                (1, "Chapter 4", 30), (1, "Chapter 5", 40)]

    def test_rejects_empty_toc(self):
        result = validate_native_toc_quality([], total_pages=100)
        assert result is False

    def test_rejects_too_few_items(self):
        toc = [(1, "Chapter 1", 1), (1, "Chapter 2", 10)]
        result = validate_native_toc_quality(toc, total_pages=100)
        assert result is False

    def test_rejects_empty_titles(self):
        toc = self._five_chapters()
        toc[0] = (1, "", 1)
        result = validate_native_toc_quality(toc, total_pages=100)
        assert result is False

    def test_rejects_out_of_range_pages(self):
        toc = self._five_chapters()
        toc[2] = (1, "Chapter 3", 200)
        result = validate_native_toc_quality(toc, total_pages=100)
        assert result is False

    def test_rejects_page_below_one(self):
        # Lower bound of the page range check: pages are 1-based.
        toc = self._five_chapters()
        toc[0] = (1, "Chapter 1", 0)
        result = validate_native_toc_quality(toc, total_pages=100)
        assert result is False

    def test_rejects_mostly_duplicate_titles(self):
        # Five identical titles: far fewer than 80% are unique.
        toc = [(1, "Untitled", page) for page in (1, 10, 20, 30, 40)]
        result = validate_native_toc_quality(toc, total_pages=100)
        assert result is False

    def test_accepts_valid_toc(self):
        result = validate_native_toc_quality(self._five_chapters(), total_pages=100)
        assert result is True


class TestConvertTocLevelsToStructure:
    """Conversion of (level, title, page) tuples into PageIndex structure dicts."""

    def test_single_level(self):
        # Flat TOC: every entry gets its own top-level number.
        converted = convert_toc_levels_to_structure(
            [(1, "Chapter 1", 1), (1, "Chapter 2", 10), (1, "Chapter 3", 20)]
        )
        expected = [
            {"structure": "1", "title": "Chapter 1", "physical_index": 1},
            {"structure": "2", "title": "Chapter 2", "physical_index": 10},
            {"structure": "3", "title": "Chapter 3", "physical_index": 20},
        ]

        assert len(converted) == 3
        for position, wanted in enumerate(expected):
            assert converted[position] == wanted

    def test_nested_levels(self):
        # Section numbering restarts under each new chapter.
        converted = convert_toc_levels_to_structure([
            (1, "Chapter 1", 1),
            (2, "Section 1.1", 2),
            (2, "Section 1.2", 5),
            (1, "Chapter 2", 10),
            (2, "Section 2.1", 11),
        ])
        expected = [
            {"structure": "1", "title": "Chapter 1", "physical_index": 1},
            {"structure": "1.1", "title": "Section 1.1", "physical_index": 2},
            {"structure": "1.2", "title": "Section 1.2", "physical_index": 5},
            {"structure": "2", "title": "Chapter 2", "physical_index": 10},
            {"structure": "2.1", "title": "Section 2.1", "physical_index": 11},
        ]

        for position, wanted in enumerate(expected):
            assert converted[position] == wanted

    def test_three_levels_deep(self):
        # Each additional level appends one more dotted component.
        converted = convert_toc_levels_to_structure([
            (1, "Part 1", 1),
            (2, "Chapter 1", 2),
            (3, "Section 1.1.1", 3),
        ])
        expected = [
            {"structure": "1", "title": "Part 1", "physical_index": 1},
            {"structure": "1.1", "title": "Chapter 1", "physical_index": 2},
            {"structure": "1.1.1", "title": "Section 1.1.1", "physical_index": 3},
        ]

        for position, wanted in enumerate(expected):
            assert converted[position] == wanted


class TestTryNativeToc:
    """The combined try_native_toc entry point must return None for unusable files."""

    def test_returns_none_for_nonexistent_file(self):
        # The path does not exist on disk.
        assert try_native_toc("/nonexistent/path.pdf", total_pages=100) is None

    def test_returns_none_for_invalid_file(self, tmp_path):
        # The file exists but its content is not PDF data.
        bogus_pdf = tmp_path / "fake.pdf"
        bogus_pdf.write_text("not a pdf")

        assert try_native_toc(str(bogus_pdf), total_pages=100) is None


class TestIntegrationWithRealPdf:
    """Integration tests using actual test PDFs.

    Both tests are skipped when the sample PDF is not present.
    """

    @pytest.fixture
    def test_pdf_path(self):
        """Path to a test PDF that should have native TOC.

        Resolved relative to this test file so the tests locate the PDF
        regardless of the directory pytest is started from.
        """
        tests_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(tests_dir, "pdfs", "2023-annual-report.pdf")

    def test_extract_from_real_pdf(self, test_pdf_path):
        """Test extraction from a real PDF file."""
        if not os.path.exists(test_pdf_path):
            pytest.skip(f"Test PDF not found: {test_pdf_path}")

        result = extract_pdf_native_toc(test_pdf_path)
        # Result can be None (no TOC) or a list - both are valid
        assert result is None or isinstance(result, list)

    def test_try_native_toc_with_real_pdf(self, test_pdf_path):
        """Test full pipeline with a real PDF."""
        if not os.path.exists(test_pdf_path):
            pytest.skip(f"Test PDF not found: {test_pdf_path}")

        import pymupdf
        # Context manager guarantees the document is closed even on failure
        with pymupdf.open(test_pdf_path) as doc:
            total_pages = len(doc)

        result = try_native_toc(test_pdf_path, total_pages)
        # Result can be None (no/bad TOC) or properly formatted list
        if result is not None:
            assert isinstance(result, list)
            assert len(result) > 0
            # Check structure format
            for item in result:
                assert 'structure' in item
                assert 'title' in item
                assert 'physical_index' in item