3 changes: 3 additions & 0 deletions .gitignore
@@ -6,10 +6,13 @@ temp/*
chroma-collections.parquet
chroma-embeddings.parquet
.DS_Store
.venv/
.env
.env*
notebook
SDK/*
log/*
logs/
results/
parts/*
json_results/*
12 changes: 10 additions & 2 deletions README.md
@@ -149,17 +149,24 @@ pip3 install --upgrade -r requirements.txt

### 2. Set your OpenAI API key

Create a `.env` file in the root directory and add your API key:
Create a `.env` file in the root directory and add your API key (supports `CHATGPT_API_KEY` or `OPENAI_API_KEY`).
If you use an OpenAI-compatible endpoint/proxy, also set a base URL (include `/v1`):

```bash
CHATGPT_API_KEY=your_openai_key_here
CHATGPT_BASE_URL=https://your-base-url/v1
CHATGPT_MODEL=gpt-4o-2024-11-20
```
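
If you want to sanity-check these values before running the pipeline, a minimal sketch like the following can help (this assumes `python-dotenv` and the official `openai` Python client are installed; it is not a script shipped with the repository):

```python
import os

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()  # read .env from the current working directory

# Either key variable works; CHATGPT_BASE_URL is only needed for proxies or compatible endpoints.
api_key = os.getenv("CHATGPT_API_KEY") or os.getenv("OPENAI_API_KEY")
base_url = os.getenv("CHATGPT_BASE_URL")  # e.g. https://your-base-url/v1
model = os.getenv("CHATGPT_MODEL", "gpt-4o-2024-11-20")

client = OpenAI(api_key=api_key, base_url=base_url)
reply = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "ping"}],
)
print(reply.choices[0].message.content)
```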

### 3. Run PageIndex on your PDF

```bash
python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
```
If your file path contains spaces, wrap it in quotes:
```bash
python3 run_pageindex.py --pdf_path "/path/with spaces/document.pdf"
```

<details>
<summary><strong>Optional parameters</strong></summary>
@@ -173,7 +180,8 @@ You can customize the processing with additional optional arguments:
--max-tokens-per-node Max tokens per node (default: 20000)
--if-add-node-id Add node ID (yes/no, default: yes)
--if-add-node-summary Add node summary (yes/no, default: yes)
--if-add-doc-description Add doc description (yes/no, default: yes)
--if-add-doc-description Add doc description (yes/no, default: no)
--if-add-node-text Add node text (yes/no, default: no)
```
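
For example, to keep the raw node text in the output while skipping the document description (flag names as listed above; the path is illustrative):

```bash
python3 run_pageindex.py --pdf_path /path/to/your/document.pdf \
    --if-add-node-text yes \
    --if-add-doc-description no
```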
</details>

172 changes: 116 additions & 56 deletions pageindex/page_index.py
@@ -166,34 +166,35 @@ def extract_toc_content(content, model=None):
Directly return the full table of contents content. Do not output anything else."""

response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)

if finish_reason == "error" and not response:
raise Exception("Failed to extract table of contents (API error)")

if_complete = check_if_toc_transformation_is_complete(content, response, model)
if if_complete == "yes" and finish_reason == "finished":
return response

chat_history = [
{"role": "user", "content": prompt},
{"role": "assistant", "content": response},
]
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
response = response + new_response
if_complete = check_if_toc_transformation_is_complete(content, response, model)


max_attempts = 5
attempt = 0
continue_prompt = "please continue the generation of table of contents, directly output the remaining part of the structure"

while not (if_complete == "yes" and finish_reason == "finished"):
attempt += 1
if attempt > max_attempts:
raise Exception("Failed to complete table of contents after maximum retries")

chat_history = [
{"role": "user", "content": prompt},
{"role": "assistant", "content": response},
{"role": "user", "content": prompt},
{"role": "assistant", "content": response},
]
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
new_response, finish_reason = ChatGPT_API_with_finish_reason(
model=model, prompt=continue_prompt, chat_history=chat_history
)
if finish_reason == "error" and not new_response:
raise Exception("Failed to continue table of contents extraction (API error)")

response = response + new_response
if_complete = check_if_toc_transformation_is_complete(content, response, model)

# Optional: Add a maximum retry limit to prevent infinite loops
if len(chat_history) > 5: # Arbitrary limit of 10 attempts
raise Exception('Failed to complete table of contents after maximum retries')


return response

def detect_page_index(toc_content, model=None):
@@ -414,6 +415,29 @@ def add_page_offset_to_toc_json(data, offset):
return data


def sanitize_toc_page_numbers(toc_items):
"""
Best-effort cleanup for parsed TOC page numbers.

Some PDFs have TOCs with inconsistent numbering (resets/spikes). Since downstream
logic assumes page numbers are monotonic, treat non-monotonic integers as invalid
so they can be re-inferred later (e.g. via `process_none_page_numbers` / verification).
"""
last_page = None
for item in toc_items:
page = item.get("page")
if not isinstance(page, int):
continue
if page <= 0:
item["page"] = None
continue
if last_page is not None and page < last_page:
item["page"] = None
continue
last_page = page
return toc_items
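
# Illustrative example (hypothetical data, for documentation only): given
#   [{"title": "Intro", "page": 2}, {"title": "Methods", "page": 1}, {"title": "Results", "page": 9}]
# the second entry's page (1) breaks monotonicity, so it is reset to None and
# left to be re-inferred later (e.g. by process_none_page_numbers).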



def page_list_to_group_text(page_contents, token_lengths, max_tokens=20000, overlap_page=1):
num_tokens = sum(token_lengths)
@@ -565,14 +589,14 @@ def generate_toc_init(part, model=None):
else:
raise Exception(f'finish reason: {finish_reason}')

def process_no_toc(page_list, start_index=1, model=None, logger=None):
def process_no_toc(page_list, start_index=1, model=None, logger=None, max_tokens=20000):
page_contents=[]
token_lengths=[]
for page_index in range(start_index, start_index+len(page_list)):
page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
page_contents.append(page_text)
token_lengths.append(count_tokens(page_text, model))
group_texts = page_list_to_group_text(page_contents, token_lengths)
group_texts = page_list_to_group_text(page_contents, token_lengths, max_tokens=max_tokens)
logger.info(f'len(group_texts): {len(group_texts)}')

toc_with_page_number= generate_toc_init(group_texts[0], model)
@@ -586,7 +610,7 @@ def process_no_toc(page_list, start_index=1, model=None, logger=None):

return toc_with_page_number

def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None):
def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None, max_tokens=20000):
page_contents=[]
token_lengths=[]
toc_content = toc_transformer(toc_content, model)
@@ -596,7 +620,7 @@ def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_in
page_contents.append(page_text)
token_lengths.append(count_tokens(page_text, model))

group_texts = page_list_to_group_text(page_contents, token_lengths)
group_texts = page_list_to_group_text(page_contents, token_lengths, max_tokens=max_tokens)
logger.info(f'len(group_texts): {len(group_texts)}')

toc_with_page_number=copy.deepcopy(toc_content)
@@ -613,6 +637,7 @@ def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_in

def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=None, model=None, logger=None):
toc_with_page_number = toc_transformer(toc_content, model)
toc_with_page_number = sanitize_toc_page_numbers(toc_with_page_number)
logger.info(f'toc_with_page_number: {toc_with_page_number}')

toc_no_page_number = remove_page_number(copy.deepcopy(toc_with_page_number))
@@ -674,11 +699,11 @@ def process_none_page_numbers(toc_items, page_list, start_index=1, model=None):
continue

item_copy = copy.deepcopy(item)
del item_copy['page']
item_copy.pop('page', None)
result = add_page_number_to_toc(page_contents, item_copy, model)
if isinstance(result[0]['physical_index'], str) and result[0]['physical_index'].startswith('<physical_index'):
item['physical_index'] = int(result[0]['physical_index'].split('_')[-1].rstrip('>').strip())
del item['page']
item.pop('page', None)

return toc_items

@@ -758,21 +783,21 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
incorrect_results_and_range_logs = []
# Helper function to process and check a single incorrect item
async def process_and_check_item(incorrect_item):
list_index = incorrect_item['list_index']
toc_list_index = incorrect_item['list_index']

# Check if list_index is valid
if list_index < 0 or list_index >= len(toc_with_page_number):
if toc_list_index < 0 or toc_list_index >= len(toc_with_page_number):
# Return an invalid result for out-of-bounds indices
return {
'list_index': list_index,
'list_index': toc_list_index,
'title': incorrect_item['title'],
'physical_index': incorrect_item.get('physical_index'),
'is_valid': False
}

# Find the previous correct item
prev_correct = None
for i in range(list_index-1, -1, -1):
for i in range(toc_list_index - 1, -1, -1):
if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number):
physical_index = toc_with_page_number[i].get('physical_index')
if physical_index is not None:
@@ -784,7 +809,7 @@ async def process_and_check_item(incorrect_item):

# Find the next correct item
next_correct = None
for i in range(list_index+1, len(toc_with_page_number)):
for i in range(toc_list_index + 1, len(toc_with_page_number)):
if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number):
physical_index = toc_with_page_number[i].get('physical_index')
if physical_index is not None:
@@ -795,7 +820,7 @@ async def process_and_check_item(incorrect_item):
next_correct = end_index

incorrect_results_and_range_logs.append({
'list_index': list_index,
'list_index': toc_list_index,
'title': incorrect_item['title'],
'prev_correct': prev_correct,
'next_correct': next_correct
@@ -804,9 +829,9 @@ async def process_and_check_item(incorrect_item):
page_contents=[]
for page_index in range(prev_correct, next_correct+1):
# Add bounds checking to prevent IndexError
list_index = page_index - start_index
if list_index >= 0 and list_index < len(page_list):
page_text = f"<physical_index_{page_index}>\n{page_list[list_index][0]}\n<physical_index_{page_index}>\n\n"
page_list_index = page_index - start_index
if 0 <= page_list_index < len(page_list):
page_text = f"<physical_index_{page_index}>\n{page_list[page_list_index][0]}\n<physical_index_{page_index}>\n\n"
page_contents.append(page_text)
else:
continue
@@ -820,7 +845,7 @@ async def process_and_check_item(incorrect_item):
check_result = await check_title_appearance(check_item, page_list, start_index, model)

return {
'list_index': list_index,
'list_index': toc_list_index,
'title': incorrect_item['title'],
'physical_index': physical_index_int,
'is_valid': check_result['answer'] == 'yes'
@@ -951,13 +976,36 @@ async def verify_toc(page_list, list_result, start_index=1, N=None, model=None):
async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=None, start_index=1, opt=None, logger=None):
print(mode)
print(f'start_index: {start_index}')

if mode == 'process_toc_with_page_numbers':
toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=opt.model, logger=logger)
elif mode == 'process_toc_no_page_numbers':
toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger)
else:
toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger)
max_tokens = getattr(opt, "max_token_num_each_node", 20000)
try:
if mode == 'process_toc_with_page_numbers':
toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=opt.model, logger=logger)
elif mode == 'process_toc_no_page_numbers':
toc_with_page_number = process_toc_no_page_numbers(
toc_content,
toc_page_list,
page_list,
start_index=start_index,
model=opt.model,
logger=logger,
max_tokens=max_tokens,
)
else:
toc_with_page_number = process_no_toc(
page_list,
start_index=start_index,
model=opt.model,
logger=logger,
max_tokens=max_tokens,
)
except Exception as e:
if logger:
logger.error({"mode": mode, "start_index": start_index, "error": str(e)})
if mode == 'process_toc_with_page_numbers':
return await meta_processor(page_list, mode='process_toc_no_page_numbers', toc_content=toc_content, toc_page_list=toc_page_list, start_index=start_index, opt=opt, logger=logger)
if mode == 'process_toc_no_page_numbers':
return await meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger)
return [{"structure": "1", "title": "Document", "physical_index": start_index}]

toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None]

@@ -971,7 +1019,7 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model)

logger.info({
'mode': 'process_toc_with_page_numbers',
'mode': mode,
'accuracy': accuracy,
'incorrect_results': incorrect_results
})
@@ -986,7 +1034,9 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
elif mode == 'process_toc_no_page_numbers':
return await meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger)
else:
raise Exception('Processing failed')
if logger:
logger.error({"mode": mode, "start_index": start_index, "accuracy": accuracy, "incorrect_results_count": len(incorrect_results)})
return [{"structure": "1", "title": "Document", "physical_index": start_index}]


async def process_large_node_recursively(node, page_list, opt=None, logger=None):
Expand Down Expand Up @@ -1022,15 +1072,25 @@ async def tree_parser(page_list, opt, doc=None, logger=None):
check_toc_result = check_toc(page_list, opt)
logger.info(check_toc_result)

if check_toc_result.get("toc_content") and check_toc_result["toc_content"].strip() and check_toc_result["page_index_given_in_toc"] == "yes":
toc_with_page_number = await meta_processor(
page_list,
mode='process_toc_with_page_numbers',
start_index=1,
toc_content=check_toc_result['toc_content'],
toc_page_list=check_toc_result['toc_page_list'],
opt=opt,
logger=logger)
if check_toc_result.get("toc_content") and check_toc_result["toc_content"].strip():
if check_toc_result["page_index_given_in_toc"] == "yes":
toc_with_page_number = await meta_processor(
page_list,
mode='process_toc_with_page_numbers',
start_index=1,
toc_content=check_toc_result['toc_content'],
toc_page_list=check_toc_result['toc_page_list'],
opt=opt,
logger=logger)
else:
toc_with_page_number = await meta_processor(
page_list,
mode='process_toc_no_page_numbers',
start_index=1,
toc_content=check_toc_result['toc_content'],
toc_page_list=check_toc_result['toc_page_list'],
opt=opt,
logger=logger)
else:
toc_with_page_number = await meta_processor(
page_list,
@@ -1066,7 +1126,7 @@ def page_index_main(doc, opt=None):
raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")

print('Parsing PDF...')
page_list = get_page_tokens(doc)
page_list = get_page_tokens(doc, model=getattr(opt, "model", None))

logger.info({'total_page_number': len(page_list)})
logger.info({'total_token': sum([page[1] for page in page_list])})
@@ -1141,4 +1201,4 @@ def validate_and_truncate_physical_indices(toc_with_page_number, page_list_lengt
if truncated_items:
print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")

return toc_with_page_number
return toc_with_page_number