diff --git a/.gitignore b/.gitignore
index 47d38baef..2ec4f07d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,10 +6,13 @@ temp/*
 chroma-collections.parquet
 chroma-embeddings.parquet
 .DS_Store
+.venv/
+.env
 .env*
 notebook
 SDK/*
 log/*
 logs/
+results/
 parts/*
 json_results/*
diff --git a/README.md b/README.md
index 879a67efc..60e059386 100644
--- a/README.md
+++ b/README.md
@@ -149,10 +149,13 @@ pip3 install --upgrade -r requirements.txt
 
 ### 2. Set your OpenAI API key
 
-Create a `.env` file in the root directory and add your API key:
+Create a `.env` file in the root directory and add your API key (supports `CHATGPT_API_KEY` or `OPENAI_API_KEY`).
+If you use an OpenAI-compatible endpoint/proxy, also set a base URL (include `/v1`):
 
 ```bash
 CHATGPT_API_KEY=your_openai_key_here
+CHATGPT_BASE_URL=https://your-base-url/v1
+CHATGPT_MODEL=gpt-4o-2024-11-20
 ```
 
 ### 3. Run PageIndex on your PDF
@@ -160,6 +163,10 @@ CHATGPT_API_KEY=your_openai_key_here
 ```bash
 python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
 ```
+If your file path contains spaces, wrap it in quotes:
+```bash
+python3 run_pageindex.py --pdf_path "/path/with spaces/document.pdf"
+```
 
 Optional parameters
 
@@ -173,7 +180,8 @@ You can customize the processing with additional optional arguments:
 --max-tokens-per-node      Max tokens per node (default: 20000)
 --if-add-node-id           Add node ID (yes/no, default: yes)
 --if-add-node-summary      Add node summary (yes/no, default: yes)
---if-add-doc-description   Add doc description (yes/no, default: yes)
+--if-add-doc-description   Add doc description (yes/no, default: no)
+--if-add-node-text         Add node text (yes/no, default: no)
 ```
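Since the key/base-URL lookup is the part most likely to trip users, here is the resolution order in one self-contained sketch. It mirrors the `or`-fallbacks this PR adds to `pageindex/utils.py` further down; nothing here is new API:

```python
import os

# Resolution order added in pageindex/utils.py below:
# CHATGPT_* takes precedence, OPENAI_* is the fallback.
api_key = os.getenv("CHATGPT_API_KEY") or os.getenv("OPENAI_API_KEY")
base_url = os.getenv("CHATGPT_BASE_URL") or os.getenv("OPENAI_BASE_URL")  # include /v1 for proxies
model = os.getenv("CHATGPT_MODEL") or os.getenv("OPENAI_MODEL") or "gpt-4o-2024-11-20"
```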
diff --git a/pageindex/page_index.py b/pageindex/page_index.py
index 882fb5dea..1ffbdb448 100644
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@@ -166,34 +166,35 @@ def extract_toc_content(content, model=None):
     Directly return the full table of contents content. Do not output anything else."""
 
     response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
-
+    if finish_reason == "error" and not response:
+        raise Exception("Failed to extract table of contents (API error)")
+
     if_complete = check_if_toc_transformation_is_complete(content, response, model)
     if if_complete == "yes" and finish_reason == "finished":
         return response
-
-    chat_history = [
-        {"role": "user", "content": prompt},
-        {"role": "assistant", "content": response},
-    ]
-    prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
-    new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
-    response = response + new_response
-    if_complete = check_if_toc_transformation_is_complete(content, response, model)
-
+
+    max_attempts = 5
+    attempt = 0
+    continue_prompt = "please continue the generation of table of contents, directly output the remaining part of the structure"
+    while not (if_complete == "yes" and finish_reason == "finished"):
+        attempt += 1
+        if attempt > max_attempts:
+            raise Exception("Failed to complete table of contents after maximum retries")
+
         chat_history = [
-        {"role": "user", "content": prompt},
-        {"role": "assistant", "content": response},
+            {"role": "user", "content": prompt},
+            {"role": "assistant", "content": response},
         ]
-        prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
-        new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
+        new_response, finish_reason = ChatGPT_API_with_finish_reason(
+            model=model, prompt=continue_prompt, chat_history=chat_history
+        )
+        if finish_reason == "error" and not new_response:
+            raise Exception("Failed to continue table of contents extraction (API error)")
+
         response = response + new_response
         if_complete = check_if_toc_transformation_is_complete(content, response, model)
-
-        # Optional: Add a maximum retry limit to prevent infinite loops
-        if len(chat_history) > 5:  # Arbitrary limit of 10 attempts
-            raise Exception('Failed to complete table of contents after maximum retries')
-
+
     return response
 
 def detect_page_index(toc_content, model=None):
@@ -414,6 +415,29 @@ def add_page_offset_to_toc_json(data, offset):
 
     return data
 
+
+def sanitize_toc_page_numbers(toc_items):
+    """
+    Best-effort cleanup for parsed TOC page numbers.
+
+    Some PDFs have TOCs with inconsistent numbering (resets/spikes). Since downstream
+    logic assumes page numbers are monotonic, treat non-monotonic integers as invalid
+    so they can be re-inferred later (e.g. via `process_none_page_numbers` / verification).
+ """ + last_page = None + for item in toc_items: + page = item.get("page") + if not isinstance(page, int): + continue + if page <= 0: + item["page"] = None + continue + if last_page is not None and page < last_page: + item["page"] = None + continue + last_page = page + return toc_items + + def page_list_to_group_text(page_contents, token_lengths, max_tokens=20000, overlap_page=1): num_tokens = sum(token_lengths) @@ -565,14 +589,14 @@ def generate_toc_init(part, model=None): else: raise Exception(f'finish reason: {finish_reason}') -def process_no_toc(page_list, start_index=1, model=None, logger=None): +def process_no_toc(page_list, start_index=1, model=None, logger=None, max_tokens=20000): page_contents=[] token_lengths=[] for page_index in range(start_index, start_index+len(page_list)): page_text = f"\n{page_list[page_index-start_index][0]}\n\n\n" page_contents.append(page_text) token_lengths.append(count_tokens(page_text, model)) - group_texts = page_list_to_group_text(page_contents, token_lengths) + group_texts = page_list_to_group_text(page_contents, token_lengths, max_tokens=max_tokens) logger.info(f'len(group_texts): {len(group_texts)}') toc_with_page_number= generate_toc_init(group_texts[0], model) @@ -586,7 +610,7 @@ def process_no_toc(page_list, start_index=1, model=None, logger=None): return toc_with_page_number -def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None): +def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None, max_tokens=20000): page_contents=[] token_lengths=[] toc_content = toc_transformer(toc_content, model) @@ -596,7 +620,7 @@ def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_in page_contents.append(page_text) token_lengths.append(count_tokens(page_text, model)) - group_texts = page_list_to_group_text(page_contents, token_lengths) + group_texts = page_list_to_group_text(page_contents, token_lengths, max_tokens=max_tokens) logger.info(f'len(group_texts): {len(group_texts)}') toc_with_page_number=copy.deepcopy(toc_content) @@ -613,6 +637,7 @@ def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_in def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=None, model=None, logger=None): toc_with_page_number = toc_transformer(toc_content, model) + toc_with_page_number = sanitize_toc_page_numbers(toc_with_page_number) logger.info(f'toc_with_page_number: {toc_with_page_number}') toc_no_page_number = remove_page_number(copy.deepcopy(toc_with_page_number)) @@ -674,11 +699,11 @@ def process_none_page_numbers(toc_items, page_list, start_index=1, model=None): continue item_copy = copy.deepcopy(item) - del item_copy['page'] + item_copy.pop('page', None) result = add_page_number_to_toc(page_contents, item_copy, model) if isinstance(result[0]['physical_index'], str) and result[0]['physical_index'].startswith('').strip()) - del item['page'] + item.pop('page', None) return toc_items @@ -758,13 +783,13 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, incorrect_results_and_range_logs = [] # Helper function to process and check a single incorrect item async def process_and_check_item(incorrect_item): - list_index = incorrect_item['list_index'] + toc_list_index = incorrect_item['list_index'] # Check if list_index is valid - if list_index < 0 or list_index >= len(toc_with_page_number): + if toc_list_index < 0 or toc_list_index >= len(toc_with_page_number): 
@@ -758,13 +783,13 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_index=1, model=None):
     incorrect_results_and_range_logs = []
 
     # Helper function to process and check a single incorrect item
     async def process_and_check_item(incorrect_item):
-        list_index = incorrect_item['list_index']
+        toc_list_index = incorrect_item['list_index']
 
         # Check if list_index is valid
-        if list_index < 0 or list_index >= len(toc_with_page_number):
+        if toc_list_index < 0 or toc_list_index >= len(toc_with_page_number):
             # Return an invalid result for out-of-bounds indices
             return {
-                'list_index': list_index,
+                'list_index': toc_list_index,
                 'title': incorrect_item['title'],
                 'physical_index': incorrect_item.get('physical_index'),
                 'is_valid': False
             }
@@ -772,7 +797,7 @@ async def process_and_check_item(incorrect_item):
 
         # Find the previous correct item
         prev_correct = None
-        for i in range(list_index-1, -1, -1):
+        for i in range(toc_list_index - 1, -1, -1):
             if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number):
                 physical_index = toc_with_page_number[i].get('physical_index')
                 if physical_index is not None:
@@ -784,7 +809,7 @@
 
         # Find the next correct item
         next_correct = None
-        for i in range(list_index+1, len(toc_with_page_number)):
+        for i in range(toc_list_index + 1, len(toc_with_page_number)):
             if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number):
                 physical_index = toc_with_page_number[i].get('physical_index')
                 if physical_index is not None:
@@ -795,7 +820,7 @@
             next_correct = end_index
 
         incorrect_results_and_range_logs.append({
-            'list_index': list_index,
+            'list_index': toc_list_index,
             'title': incorrect_item['title'],
             'prev_correct': prev_correct,
             'next_correct': next_correct
@@ -804,9 +829,9 @@
         page_contents=[]
         for page_index in range(prev_correct, next_correct+1):
             # Add bounds checking to prevent IndexError
-            list_index = page_index - start_index
-            if list_index >= 0 and list_index < len(page_list):
-                page_text = f"<physical_index_{page_index}>\n{page_list[list_index][0]}\n<physical_index_{page_index}>\n\n"
+            page_list_index = page_index - start_index
+            if 0 <= page_list_index < len(page_list):
+                page_text = f"<physical_index_{page_index}>\n{page_list[page_list_index][0]}\n<physical_index_{page_index}>\n\n"
                 page_contents.append(page_text)
             else:
                 continue
@@ -820,7 +845,7 @@
         check_result = await check_title_appearance(check_item, page_list, start_index, model)
 
         return {
-            'list_index': list_index,
+            'list_index': toc_list_index,
             'title': incorrect_item['title'],
             'physical_index': physical_index_int,
             'is_valid': check_result['answer'] == 'yes'
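The `list_index` to `toc_list_index` / `page_list_index` renames above fix a real shadowing bug: the page-window loop overwrote the outer `list_index`, so the returned dict reported the wrong index. A minimal repro of the pattern (simplified names, not the project code):

```python
# Minimal repro of the shadowing bug fixed above:
def buggy(list_index, page_list):
    for page_index in range(3):
        list_index = page_index - 1      # inner loop reuses the outer name...
    return {"list_index": list_index}    # ...so the caller's value is lost

print(buggy(7, []))  # {'list_index': 1}, not {'list_index': 7}
```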
@@ -951,13 +976,36 @@ async def verify_toc(page_list, list_result, start_index=1, N=None, model=None):
 
 async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=None, start_index=1, opt=None, logger=None):
     print(mode)
     print(f'start_index: {start_index}')
-
-    if mode == 'process_toc_with_page_numbers':
-        toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=opt.model, logger=logger)
-    elif mode == 'process_toc_no_page_numbers':
-        toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger)
-    else:
-        toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger)
+    max_tokens = getattr(opt, "max_token_num_each_node", 20000)
+    try:
+        if mode == 'process_toc_with_page_numbers':
+            toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=opt.model, logger=logger)
+        elif mode == 'process_toc_no_page_numbers':
+            toc_with_page_number = process_toc_no_page_numbers(
+                toc_content,
+                toc_page_list,
+                page_list,
+                start_index=start_index,
+                model=opt.model,
+                logger=logger,
+                max_tokens=max_tokens,
+            )
+        else:
+            toc_with_page_number = process_no_toc(
+                page_list,
+                start_index=start_index,
+                model=opt.model,
+                logger=logger,
+                max_tokens=max_tokens,
+            )
+    except Exception as e:
+        if logger:
+            logger.error({"mode": mode, "start_index": start_index, "error": str(e)})
+        if mode == 'process_toc_with_page_numbers':
+            return await meta_processor(page_list, mode='process_toc_no_page_numbers', toc_content=toc_content, toc_page_list=toc_page_list, start_index=start_index, opt=opt, logger=logger)
+        if mode == 'process_toc_no_page_numbers':
+            return await meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger)
+        return [{"structure": "1", "title": "Document", "physical_index": start_index}]
 
     toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None]
@@ -971,7 +1019,7 @@
     accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model)
 
     logger.info({
-        'mode': 'process_toc_with_page_numbers',
+        'mode': mode,
         'accuracy': accuracy,
         'incorrect_results': incorrect_results
     })
@@ -986,7 +1034,9 @@
     elif mode == 'process_toc_no_page_numbers':
         return await meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger)
     else:
-        raise Exception('Processing failed')
+        if logger:
+            logger.error({"mode": mode, "start_index": start_index, "accuracy": accuracy, "incorrect_results_count": len(incorrect_results)})
+        return [{"structure": "1", "title": "Document", "physical_index": start_index}]
 
 
 async def process_large_node_recursively(node, page_list, opt=None, logger=None):
@@ -1022,15 +1072,25 @@ async def tree_parser(page_list, opt, doc=None, logger=None):
     check_toc_result = check_toc(page_list, opt)
     logger.info(check_toc_result)
 
-    if check_toc_result.get("toc_content") and check_toc_result["toc_content"].strip() and check_toc_result["page_index_given_in_toc"] == "yes":
-        toc_with_page_number = await meta_processor(
-            page_list,
-            mode='process_toc_with_page_numbers',
-            start_index=1,
-            toc_content=check_toc_result['toc_content'],
-            toc_page_list=check_toc_result['toc_page_list'],
-            opt=opt,
-            logger=logger)
+    if check_toc_result.get("toc_content") and check_toc_result["toc_content"].strip():
+        if check_toc_result["page_index_given_in_toc"] == "yes":
+            toc_with_page_number = await meta_processor(
+                page_list,
+                mode='process_toc_with_page_numbers',
+                start_index=1,
+                toc_content=check_toc_result['toc_content'],
+                toc_page_list=check_toc_result['toc_page_list'],
+                opt=opt,
+                logger=logger)
+        else:
+            toc_with_page_number = await meta_processor(
+                page_list,
+                mode='process_toc_no_page_numbers',
+                start_index=1,
+                toc_content=check_toc_result['toc_content'],
+                toc_page_list=check_toc_result['toc_page_list'],
+                opt=opt,
+                logger=logger)
     else:
         toc_with_page_number = await meta_processor(
             page_list,
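Taken together, the `meta_processor` and `tree_parser` changes replace a hard failure with graceful degradation. A sketch of the chain (data only, not the literal code; the logic is in the hunks above):

```python
# Degradation order now implemented by meta_processor: each exception or
# low verify_toc accuracy steps down to the next mode in the list.
FALLBACK_CHAIN = [
    "process_toc_with_page_numbers",  # TOC found, page numbers printed in it
    "process_toc_no_page_numbers",    # TOC found, but no usable page numbers
    "process_no_toc",                 # no TOC at all: segment pages, generate one
]
LAST_RESORT = [{"structure": "1", "title": "Document", "physical_index": 1}]  # diff uses start_index here
```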
@@ -1066,7 +1126,7 @@ def page_index_main(doc, opt=None):
         raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")
 
     print('Parsing PDF...')
-    page_list = get_page_tokens(doc)
+    page_list = get_page_tokens(doc, model=getattr(opt, "model", None))
 
     logger.info({'total_page_number': len(page_list)})
     logger.info({'total_token': sum([page[1] for page in page_list])})
@@ -1141,4 +1201,4 @@ def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length):
     if truncated_items:
         print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")
 
-    return toc_with_page_number
\ No newline at end of file
+    return toc_with_page_number
diff --git a/pageindex/utils.py b/pageindex/utils.py
index dc7acd888..68a79a4a2 100644
--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@@ -5,7 +5,10 @@ from datetime import datetime
 import time
 import json
-import PyPDF2
+try:
+    import pypdf as PyPDF2
+except ImportError:
+    import PyPDF2
 import copy
 import asyncio
 import pymupdf
@@ -16,24 +19,73 @@ import yaml
 from pathlib import Path
 from types import SimpleNamespace as config
+from typing import Optional
 
-CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
+CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY") or os.getenv("OPENAI_API_KEY")
+CHATGPT_BASE_URL = os.getenv("CHATGPT_BASE_URL") or os.getenv("OPENAI_BASE_URL")
+CHATGPT_MODEL = os.getenv("CHATGPT_MODEL") or os.getenv("OPENAI_MODEL")
+
+DEFAULT_MODEL = "gpt-4o-2024-11-20"
+
+
+def _default_model_from_env() -> Optional[str]:
+    return os.getenv("CHATGPT_MODEL") or os.getenv("OPENAI_MODEL")
+
+
+def _resolve_model_name(model: Optional[str]) -> str:
+    if model is None or model == DEFAULT_MODEL:
+        return _default_model_from_env() or DEFAULT_MODEL
+    return model
+
+
+def _tiktoken_encoder(model: Optional[str]):
+    model_name = _resolve_model_name(model)
+    try:
+        return tiktoken.encoding_for_model(model_name)
+    except Exception:
+        return tiktoken.get_encoding("cl100k_base")
+
+
+_SYNC_OPENAI_CLIENTS = {}
+_ASYNC_OPENAI_CLIENTS = {}
+
+
+def _client_cache_key(api_key: Optional[str], base_url: Optional[str]):
+    return (api_key or "", base_url or "")
+
+
+def _openai_client(api_key: Optional[str] = CHATGPT_API_KEY, base_url: Optional[str] = CHATGPT_BASE_URL):
+    key = _client_cache_key(api_key, base_url)
+    client = _SYNC_OPENAI_CLIENTS.get(key)
+    if client is None:
+        client = openai.OpenAI(api_key=api_key, base_url=base_url) if base_url else openai.OpenAI(api_key=api_key)
+        _SYNC_OPENAI_CLIENTS[key] = client
+    return client
+
+
+def _openai_async_client(api_key: Optional[str] = CHATGPT_API_KEY, base_url: Optional[str] = CHATGPT_BASE_URL):
+    key = _client_cache_key(api_key, base_url)
+    client = _ASYNC_OPENAI_CLIENTS.get(key)
+    if client is None:
+        client = openai.AsyncOpenAI(api_key=api_key, base_url=base_url) if base_url else openai.AsyncOpenAI(api_key=api_key)
+        _ASYNC_OPENAI_CLIENTS[key] = client
+    return client
 
 def count_tokens(text, model=None):
     if not text:
         return 0
-    enc = tiktoken.encoding_for_model(model)
+    enc = _tiktoken_encoder(model)
     tokens = enc.encode(text)
     return len(tokens)
 
 
 def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
     max_retries = 10
-    client = openai.OpenAI(api_key=api_key)
+    model = _resolve_model_name(model)
+    client = _openai_client(api_key=api_key)
 
     for i in range(max_retries):
         try:
             if chat_history:
-                messages = chat_history
-                messages.append({"role": "user", "content": prompt})
+                messages = list(chat_history) + [{"role": "user", "content": prompt}]
             else:
                 messages = [{"role": "user", "content": prompt}]
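To illustrate the new client cache: clients are keyed by `(api_key or "", base_url or "")`, so repeated calls with the same settings reuse one instance. A quick check (values below are made up):

```python
# Same key tuple -> cached instance is reused.
c1 = _openai_client(api_key="sk-test", base_url="https://proxy.example/v1")
c2 = _openai_client(api_key="sk-test", base_url="https://proxy.example/v1")
assert c1 is c2
c3 = _openai_client(api_key="sk-test")   # no base_url -> different key -> new client
assert c3 is not c1
```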
@@ -54,18 +106,18 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
                 time.sleep(1)  # Wait for 1 second before retrying
             else:
                 logging.error('Max retries reached for prompt: ' + prompt)
-                return "Error"
+                return "", "error"
 
 
 def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
     max_retries = 10
-    client = openai.OpenAI(api_key=api_key)
+    model = _resolve_model_name(model)
+    client = _openai_client(api_key=api_key)
 
     for i in range(max_retries):
         try:
             if chat_history:
-                messages = chat_history
-                messages.append({"role": "user", "content": prompt})
+                messages = list(chat_history) + [{"role": "user", "content": prompt}]
             else:
                 messages = [{"role": "user", "content": prompt}]
 
@@ -88,16 +140,17 @@ def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
 
 async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
     max_retries = 10
+    model = _resolve_model_name(model)
     messages = [{"role": "user", "content": prompt}]
 
     for i in range(max_retries):
         try:
-            async with openai.AsyncOpenAI(api_key=api_key) as client:
-                response = await client.chat.completions.create(
-                    model=model,
-                    messages=messages,
-                    temperature=0,
-                )
-                return response.choices[0].message.content
+            client = _openai_async_client(api_key=api_key)
+            response = await client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=0,
+            )
+            return response.choices[0].message.content
         except Exception as e:
             print('************* Retrying *************')
             logging.error(f"Error: {e}")
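The model-resolution rules in `_resolve_model_name`, shown as a hypothetical example (model names are placeholders, not recommendations):

```python
# Hypothetical environment: CHATGPT_MODEL=gpt-4o-mini
_resolve_model_name(None)                 # -> "gpt-4o-mini"  (env default wins)
_resolve_model_name("gpt-4o-2024-11-20")  # -> "gpt-4o-mini"  (explicit DEFAULT_MODEL is
                                          #     treated as unset so env can override it)
_resolve_model_name("o3-mini")            # -> "o3-mini"      (any other explicit model kept)
```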
+ """ + node_map = {} + for node in structure_to_list(tree): + if not isinstance(node, dict): + continue + node_id = node.get("node_id") + if not node_id: + continue + + start_index = node.get("start_index") + end_index = node.get("end_index") + + if include_page_ranges: + if max_page is not None and isinstance(end_index, int): + end_index = min(end_index, max_page) + node_map[node_id] = {"node": node, "start_index": start_index, "end_index": end_index} + else: + node_copy = dict(node) + if start_index is not None and "page_index" not in node_copy: + node_copy["page_index"] = start_index + node_map[node_id] = node_copy + + return node_map + def get_leaf_nodes(structure): if isinstance(structure, dict): @@ -410,8 +495,9 @@ def add_preface_if_needed(data): -def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): - enc = tiktoken.encoding_for_model(model) +def get_page_tokens(pdf_path, model=DEFAULT_MODEL, pdf_parser="PyPDF2"): + model_name = _resolve_model_name(model) + enc = _tiktoken_encoder(model_name) if pdf_parser == "PyPDF2": pdf_reader = PyPDF2.PdfReader(pdf_path) page_list = [] @@ -709,4 +795,7 @@ def load(self, user_opt=None) -> config: self._validate_keys(user_dict) merged = {**self._default_dict, **user_dict} - return config(**merged) \ No newline at end of file + env_default_model = _default_model_from_env() + if env_default_model and "model" not in user_dict: + merged["model"] = env_default_model + return config(**merged) diff --git a/requirements.txt b/requirements.txt index 463db58f1..a695e9880 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ openai==1.101.0 pymupdf==1.26.4 -PyPDF2==3.0.1 +pypdf>=4.0.0 python-dotenv==1.1.0 tiktoken==0.11.0 pyyaml==6.0.2 diff --git a/run_pageindex.py b/run_pageindex.py index 107024505..a5beca266 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -10,7 +10,8 @@ parser.add_argument('--pdf_path', type=str, help='Path to the PDF file') parser.add_argument('--md_path', type=str, help='Path to the Markdown file') - parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use') + default_model = ConfigLoader().load().model + parser.add_argument('--model', type=str, default=default_model, help='Model to use') parser.add_argument('--toc-check-pages', type=int, default=20, help='Number of pages to check for table of contents (PDF only)') @@ -130,4 +131,4 @@ with open(output_file, 'w', encoding='utf-8') as f: json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False) - print(f'Tree structure saved to: {output_file}') \ No newline at end of file + print(f'Tree structure saved to: {output_file}')