From 72e113ba2ab0d0c51fb726f18e1b9e1a7f8d8fa3 Mon Sep 17 00:00:00 2001 From: luojiyin Date: Mon, 19 Jan 2026 12:37:25 +0800 Subject: [PATCH 1/2] fix: prevent list_index variable shadowing in fix_incorrect_toc The list_index variable (original TOC index) was being overwritten by the page loop iterator, causing results to be written back to wrong index positions when updating toc_with_page_number. Rename the loop variable to page_list_idx to avoid shadowing. --- pageindex/page_index.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 882fb5dea..74277d1ba 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -804,9 +804,9 @@ async def process_and_check_item(incorrect_item): page_contents=[] for page_index in range(prev_correct, next_correct+1): # Add bounds checking to prevent IndexError - list_index = page_index - start_index - if list_index >= 0 and list_index < len(page_list): - page_text = f"\n{page_list[list_index][0]}\n\n\n" + page_list_idx = page_index - start_index + if page_list_idx >= 0 and page_list_idx < len(page_list): + page_text = f"\n{page_list[page_list_idx][0]}\n\n\n" page_contents.append(page_text) else: continue From 1c448f10cc86b5eaac3529eed8bd5c1ff2cfd5dc Mon Sep 17 00:00:00 2001 From: luojiyin Date: Mon, 19 Jan 2026 12:52:29 +0800 Subject: [PATCH 2/2] feat: add validation, logging, and docstring to fix_incorrect_toc - Add docstring documenting parameters, returns, and constraints - Validate start_index >= 1 (raise ValueError if invalid) - Log page_list alignment assumptions for debugging - Track and warn when pages are skipped due to bounds issues --- pageindex/page_index.py | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 74277d1ba..80b57f415 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -750,7 +750,33 @@ def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20 async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_index=1, model=None, logger=None): + """ + Fix incorrect physical_index values in TOC items by searching content ranges. + + Args: + toc_with_page_number: List of TOC items with physical_index to validate/fix + page_list: List of (text, page_num) tuples aligned to start_index + incorrect_results: List of items that failed validation + start_index: Physical page number of page_list[0] (must be >= 1) + model: OpenAI model name for API calls + logger: Optional logger for structured logging + + Returns: + Updated toc_with_page_number with corrected physical_index values + + Raises: + ValueError: If start_index < 1 + """ print(f'start fix_incorrect_toc with {len(incorrect_results)} incorrect results') + if start_index < 1: + raise ValueError(f"start_index must be >= 1, got {start_index}") + if logger: + logger.info({ + 'message': 'fix_incorrect_toc assumes page_list aligns to start_index', + 'start_index': start_index, + 'page_list_length': len(page_list), + 'expected_physical_end': start_index + len(page_list) - 1 + }) incorrect_indices = {result['list_index'] for result in incorrect_results} end_index = len(page_list) + start_index - 1 @@ -802,6 +828,7 @@ async def process_and_check_item(incorrect_item): }) page_contents=[] + skipped_pages = 0 for page_index in range(prev_correct, next_correct+1): # Add bounds checking to prevent IndexError page_list_idx = page_index - start_index @@ -809,8 +836,18 @@ async def process_and_check_item(incorrect_item): page_text = f"\n{page_list[page_list_idx][0]}\n\n\n" page_contents.append(page_text) else: + skipped_pages += 1 continue content_range = ''.join(page_contents) + if skipped_pages and logger: + logger.warning({ + 'message': 'Skipped pages while building content range; check start_index/page_list alignment', + 'list_index': list_index, + 'start_index': start_index, + 'prev_correct': prev_correct, + 'next_correct': next_correct, + 'skipped_pages': skipped_pages + }) physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model) @@ -1141,4 +1178,4 @@ def validate_and_truncate_physical_indices(toc_with_page_number, page_list_lengt if truncated_items: print(f"Truncated {len(truncated_items)} TOC items that exceeded document length") - return toc_with_page_number \ No newline at end of file + return toc_with_page_number