3 changes: 3 additions & 0 deletions .gitignore
@@ -6,10 +6,13 @@ temp/*
chroma-collections.parquet
chroma-embeddings.parquet
.DS_Store
.venv/
.env
.env*
notebook
SDK/*
log/*
logs/
results/
parts/*
json_results/*
12 changes: 10 additions & 2 deletions README.md
@@ -149,17 +149,24 @@ pip3 install --upgrade -r requirements.txt

### 2. Set your OpenAI API key

Create a `.env` file in the root directory and add your API key:
Create a `.env` file in the root directory and add your API key (supports `CHATGPT_API_KEY` or `OPENAI_API_KEY`).
If you use an OpenAI-compatible endpoint/proxy, also set a base URL (include `/v1`):

```bash
CHATGPT_API_KEY=your_openai_key_here
CHATGPT_BASE_URL=https://your-base-url/v1
CHATGPT_MODEL=gpt-4o-2024-11-20
```
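
If you want to sanity-check these values before running the pipeline, a minimal sketch like the following can help (this assumes `python-dotenv` and the official `openai` Python client are installed; it is not a script shipped with the repository):

```python
import os

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()  # read .env from the current working directory

# Either key variable works; CHATGPT_BASE_URL is only needed for proxies or compatible endpoints.
api_key = os.getenv("CHATGPT_API_KEY") or os.getenv("OPENAI_API_KEY")
base_url = os.getenv("CHATGPT_BASE_URL")  # e.g. https://your-base-url/v1
model = os.getenv("CHATGPT_MODEL", "gpt-4o-2024-11-20")

client = OpenAI(api_key=api_key, base_url=base_url)
reply = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "ping"}],
)
print(reply.choices[0].message.content)
```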

### 3. Run PageIndex on your PDF

```bash
python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
```
If your file path contains spaces, wrap it in quotes:
```bash
python3 run_pageindex.py --pdf_path "/path/with spaces/document.pdf"
```

<details>
<summary><strong>Optional parameters</strong></summary>
@@ -173,7 +180,8 @@ You can customize the processing with additional optional arguments:
--max-tokens-per-node Max tokens per node (default: 20000)
--if-add-node-id Add node ID (yes/no, default: yes)
--if-add-node-summary Add node summary (yes/no, default: yes)
--if-add-doc-description Add doc description (yes/no, default: yes)
--if-add-doc-description Add doc description (yes/no, default: no)
--if-add-node-text Add node text (yes/no, default: no)
```
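
For example, to keep the raw node text in the output while skipping the document description (flag names as listed above; the path is illustrative):

```bash
python3 run_pageindex.py --pdf_path /path/to/your/document.pdf \
    --if-add-node-text yes \
    --if-add-doc-description no
```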
</details>

172 changes: 116 additions & 56 deletions pageindex/page_index.py
@@ -166,34 +166,35 @@ def extract_toc_content(content, model=None):
Directly return the full table of contents content. Do not output anything else."""

response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)

if finish_reason == "error" and not response:
raise Exception("Failed to extract table of contents (API error)")

if_complete = check_if_toc_transformation_is_complete(content, response, model)
if if_complete == "yes" and finish_reason == "finished":
return response

chat_history = [
{"role": "user", "content": prompt},
{"role": "assistant", "content": response},
]
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
response = response + new_response
if_complete = check_if_toc_transformation_is_complete(content, response, model)


max_attempts = 5
attempt = 0
continue_prompt = "please continue the generation of table of contents, directly output the remaining part of the structure"

while not (if_complete == "yes" and finish_reason == "finished"):
attempt += 1
if attempt > max_attempts:
raise Exception("Failed to complete table of contents after maximum retries")

chat_history = [
{"role": "user", "content": prompt},
{"role": "assistant", "content": response},
{"role": "user", "content": prompt},
{"role": "assistant", "content": response},
]
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
new_response, finish_reason = ChatGPT_API_with_finish_reason(
model=model, prompt=continue_prompt, chat_history=chat_history
)
if finish_reason == "error" and not new_response:
raise Exception("Failed to continue table of contents extraction (API error)")

response = response + new_response
if_complete = check_if_toc_transformation_is_complete(content, response, model)

# Optional: Add a maximum retry limit to prevent infinite loops
if len(chat_history) > 5: # Arbitrary limit of 10 attempts
raise Exception('Failed to complete table of contents after maximum retries')


return response

def detect_page_index(toc_content, model=None):
@@ -414,6 +415,29 @@ def add_page_offset_to_toc_json(data, offset):
return data


def sanitize_toc_page_numbers(toc_items):
"""
Best-effort cleanup for parsed TOC page numbers.

Some PDFs have TOCs with inconsistent numbering (resets/spikes). Since downstream
logic assumes page numbers are monotonic, treat non-monotonic integers as invalid
so they can be re-inferred later (e.g. via `process_none_page_numbers` / verification).
"""
last_page = None
for item in toc_items:
page = item.get("page")
if not isinstance(page, int):
continue
if page <= 0:
item["page"] = None
continue
if last_page is not None and page < last_page:
item["page"] = None
continue
last_page = page
return toc_items
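
# Illustrative example (hypothetical data, for documentation only): given
#   [{"title": "Intro", "page": 2}, {"title": "Methods", "page": 1}, {"title": "Results", "page": 9}]
# the second entry's page (1) breaks monotonicity, so it is reset to None and
# left to be re-inferred later (e.g. by process_none_page_numbers).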



def page_list_to_group_text(page_contents, token_lengths, max_tokens=20000, overlap_page=1):
num_tokens = sum(token_lengths)
@@ -565,14 +589,14 @@ def generate_toc_init(part, model=None):
else:
raise Exception(f'finish reason: {finish_reason}')

def process_no_toc(page_list, start_index=1, model=None, logger=None):
def process_no_toc(page_list, start_index=1, model=None, logger=None, max_tokens=20000):
page_contents=[]
token_lengths=[]
for page_index in range(start_index, start_index+len(page_list)):
page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
page_contents.append(page_text)
token_lengths.append(count_tokens(page_text, model))
group_texts = page_list_to_group_text(page_contents, token_lengths)
group_texts = page_list_to_group_text(page_contents, token_lengths, max_tokens=max_tokens)
logger.info(f'len(group_texts): {len(group_texts)}')

toc_with_page_number= generate_toc_init(group_texts[0], model)
@@ -586,7 +610,7 @@ def process_no_toc(page_list, start_index=1, model=None, logger=None):

return toc_with_page_number

def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None):
def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None, max_tokens=20000):
page_contents=[]
token_lengths=[]
toc_content = toc_transformer(toc_content, model)
@@ -596,7 +620,7 @@ def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_in
page_contents.append(page_text)
token_lengths.append(count_tokens(page_text, model))

group_texts = page_list_to_group_text(page_contents, token_lengths)
group_texts = page_list_to_group_text(page_contents, token_lengths, max_tokens=max_tokens)
logger.info(f'len(group_texts): {len(group_texts)}')

toc_with_page_number=copy.deepcopy(toc_content)
@@ -613,6 +637,7 @@ def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_in

def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=None, model=None, logger=None):
toc_with_page_number = toc_transformer(toc_content, model)
toc_with_page_number = sanitize_toc_page_numbers(toc_with_page_number)
logger.info(f'toc_with_page_number: {toc_with_page_number}')

toc_no_page_number = remove_page_number(copy.deepcopy(toc_with_page_number))
@@ -674,11 +699,11 @@ def process_none_page_numbers(toc_items, page_list, start_index=1, model=None):
continue

item_copy = copy.deepcopy(item)
del item_copy['page']
item_copy.pop('page', None)
result = add_page_number_to_toc(page_contents, item_copy, model)
if isinstance(result[0]['physical_index'], str) and result[0]['physical_index'].startswith('<physical_index'):
item['physical_index'] = int(result[0]['physical_index'].split('_')[-1].rstrip('>').strip())
del item['page']
item.pop('page', None)

return toc_items

@@ -758,21 +783,21 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
incorrect_results_and_range_logs = []
# Helper function to process and check a single incorrect item
async def process_and_check_item(incorrect_item):
list_index = incorrect_item['list_index']
toc_list_index = incorrect_item['list_index']

# Check if list_index is valid
if list_index < 0 or list_index >= len(toc_with_page_number):
if toc_list_index < 0 or toc_list_index >= len(toc_with_page_number):
# Return an invalid result for out-of-bounds indices
return {
'list_index': list_index,
'list_index': toc_list_index,
'title': incorrect_item['title'],
'physical_index': incorrect_item.get('physical_index'),
'is_valid': False
}

# Find the previous correct item
prev_correct = None
for i in range(list_index-1, -1, -1):
for i in range(toc_list_index - 1, -1, -1):
if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number):
physical_index = toc_with_page_number[i].get('physical_index')
if physical_index is not None:
@@ -784,7 +809,7 @@ async def process_and_check_item(incorrect_item):

# Find the next correct item
next_correct = None
for i in range(list_index+1, len(toc_with_page_number)):
for i in range(toc_list_index + 1, len(toc_with_page_number)):
if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number):
physical_index = toc_with_page_number[i].get('physical_index')
if physical_index is not None:
@@ -795,7 +820,7 @@ async def process_and_check_item(incorrect_item):
next_correct = end_index

incorrect_results_and_range_logs.append({
'list_index': list_index,
'list_index': toc_list_index,
'title': incorrect_item['title'],
'prev_correct': prev_correct,
'next_correct': next_correct
@@ -804,9 +829,9 @@ async def process_and_check_item(incorrect_item):
page_contents=[]
for page_index in range(prev_correct, next_correct+1):
# Add bounds checking to prevent IndexError
list_index = page_index - start_index
if list_index >= 0 and list_index < len(page_list):
page_text = f"<physical_index_{page_index}>\n{page_list[list_index][0]}\n<physical_index_{page_index}>\n\n"
page_list_index = page_index - start_index
if 0 <= page_list_index < len(page_list):
page_text = f"<physical_index_{page_index}>\n{page_list[page_list_index][0]}\n<physical_index_{page_index}>\n\n"
page_contents.append(page_text)
else:
continue
@@ -820,7 +845,7 @@ async def process_and_check_item(incorrect_item):
check_result = await check_title_appearance(check_item, page_list, start_index, model)

return {
'list_index': list_index,
'list_index': toc_list_index,
'title': incorrect_item['title'],
'physical_index': physical_index_int,
'is_valid': check_result['answer'] == 'yes'
@@ -951,13 +976,36 @@ async def verify_toc(page_list, list_result, start_index=1, N=None, model=None):
async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=None, start_index=1, opt=None, logger=None):
print(mode)
print(f'start_index: {start_index}')

if mode == 'process_toc_with_page_numbers':
toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=opt.model, logger=logger)
elif mode == 'process_toc_no_page_numbers':
toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger)
else:
toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger)
max_tokens = getattr(opt, "max_token_num_each_node", 20000)
try:
if mode == 'process_toc_with_page_numbers':
toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=opt.model, logger=logger)
elif mode == 'process_toc_no_page_numbers':
toc_with_page_number = process_toc_no_page_numbers(
toc_content,
toc_page_list,
page_list,
start_index=start_index,
model=opt.model,
logger=logger,
max_tokens=max_tokens,
)
else:
toc_with_page_number = process_no_toc(
page_list,
start_index=start_index,
model=opt.model,
logger=logger,
max_tokens=max_tokens,
)
except Exception as e:
if logger:
logger.error({"mode": mode, "start_index": start_index, "error": str(e)})
if mode == 'process_toc_with_page_numbers':
return await meta_processor(page_list, mode='process_toc_no_page_numbers', toc_content=toc_content, toc_page_list=toc_page_list, start_index=start_index, opt=opt, logger=logger)
if mode == 'process_toc_no_page_numbers':
return await meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger)
return [{"structure": "1", "title": "Document", "physical_index": start_index}]

toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None]

@@ -971,7 +1019,7 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model)

logger.info({
'mode': 'process_toc_with_page_numbers',
'mode': mode,
'accuracy': accuracy,
'incorrect_results': incorrect_results
})
@@ -986,7 +1034,9 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
elif mode == 'process_toc_no_page_numbers':
return await meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger)
else:
raise Exception('Processing failed')
if logger:
logger.error({"mode": mode, "start_index": start_index, "accuracy": accuracy, "incorrect_results_count": len(incorrect_results)})
return [{"structure": "1", "title": "Document", "physical_index": start_index}]


async def process_large_node_recursively(node, page_list, opt=None, logger=None):
Expand Down Expand Up @@ -1022,15 +1072,25 @@ async def tree_parser(page_list, opt, doc=None, logger=None):
check_toc_result = check_toc(page_list, opt)
logger.info(check_toc_result)

if check_toc_result.get("toc_content") and check_toc_result["toc_content"].strip() and check_toc_result["page_index_given_in_toc"] == "yes":
toc_with_page_number = await meta_processor(
page_list,
mode='process_toc_with_page_numbers',
start_index=1,
toc_content=check_toc_result['toc_content'],
toc_page_list=check_toc_result['toc_page_list'],
opt=opt,
logger=logger)
if check_toc_result.get("toc_content") and check_toc_result["toc_content"].strip():
if check_toc_result["page_index_given_in_toc"] == "yes":
toc_with_page_number = await meta_processor(
page_list,
mode='process_toc_with_page_numbers',
start_index=1,
toc_content=check_toc_result['toc_content'],
toc_page_list=check_toc_result['toc_page_list'],
opt=opt,
logger=logger)
else:
toc_with_page_number = await meta_processor(
page_list,
mode='process_toc_no_page_numbers',
start_index=1,
toc_content=check_toc_result['toc_content'],
toc_page_list=check_toc_result['toc_page_list'],
opt=opt,
logger=logger)
else:
toc_with_page_number = await meta_processor(
page_list,
@@ -1066,7 +1126,7 @@ def page_index_main(doc, opt=None):
raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")

print('Parsing PDF...')
page_list = get_page_tokens(doc)
page_list = get_page_tokens(doc, model=getattr(opt, "model", None))

logger.info({'total_page_number': len(page_list)})
logger.info({'total_token': sum([page[1] for page in page_list])})
@@ -1141,4 +1201,4 @@ def validate_and_truncate_physical_indices(toc_with_page_number, page_list_lengt
if truncated_items:
print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")

return toc_with_page_number
return toc_with_page_number