diff --git a/README.md b/README.md index 879a67ef..6ce0a1b7 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,10 @@ CHATGPT_API_KEY=your_openai_key_here python3 run_pageindex.py --pdf_path /path/to/your/document.pdf ``` +### Self Hosting + +This repository does not ship a production API server. For a minimal self-hosted HTTP wrapper, see `SELF_HOSTING.md`. +
Optional parameters
diff --git a/SELF_HOSTING.md b/SELF_HOSTING.md
new file mode 100644
index 00000000..7729e345
--- /dev/null
+++ b/SELF_HOSTING.md
@@ -0,0 +1,42 @@
+# Self Hosting (Open-Source)
+
+This repository focuses on **generating PageIndex tree structures** locally (PDF → tree JSON). It does **not** ship the same hosted SaaS/API backend as `api.pageindex.ai`.
+
+If you want to “self host” for internal use, the typical approach is:
+
+1) run the CLI (`run_pageindex.py`) in your own environment, or
+2) wrap the CLI/library with a small HTTP server.
+
+## Minimal FastAPI Server
+
+An example server is provided at `examples/self_host_fastapi.py`. It exposes a single endpoint that accepts a PDF upload and returns the generated tree JSON.
+
+### Install
+
+```bash
+pip3 install -r requirements.txt fastapi uvicorn
+```
+
+### Configure
+
+Create a `.env` in the repo root with your LLM settings (example):
+
+```bash
+CHATGPT_API_KEY=
+CHATGPT_MODEL=gpt-4o-2024-11-20
+CHATGPT_BASE_URL=https://api.openai.com/v1
+```
+
+### Run
+
+```bash
+uvicorn examples.self_host_fastapi:app --reload --port 8000
+```
+
+### Call
+
+```bash
+curl -F "file=@/path/to/document.pdf" "http://localhost:8000/index"
+```
+
+Addresses #82
diff --git a/examples/self_host_fastapi.py b/examples/self_host_fastapi.py
new file mode 100644
index 00000000..538365cb
--- /dev/null
+++ b/examples/self_host_fastapi.py
@@ -0,0 +1,76 @@
+"""Minimal self-hosted HTTP wrapper around PageIndex (see SELF_HOSTING.md).
+
+Exposes a single endpoint that accepts a PDF upload and returns the
+generated PageIndex tree JSON. This is an example, not a production server.
+"""
+
+import os
+import tempfile
+
+from fastapi import FastAPI, File, HTTPException, UploadFile
+
+from pageindex.page_index import page_index_main
+from pageindex.utils import config
+
+
+app = FastAPI(title="PageIndex (Self-Hosted Example)")
+
+# Default model for requests that don't pass ?model=... explicitly. Falls
+# back to the CHATGPT_MODEL value from .env so the server matches the CLI.
+DEFAULT_MODEL = os.getenv("CHATGPT_MODEL", "gpt-4o-2024-11-20")
+
+
+@app.get("/health")
+def health():
+    """Liveness probe: returns a static payload whenever the server is up."""
+    return {"status": "ok"}
+
+
+@app.post("/index")
+def index_pdf(
+    file: UploadFile = File(...),
+    model: str = DEFAULT_MODEL,
+    toc_check_pages: int = 20,
+    max_pages_per_node: int = 10,
+    max_tokens_per_node: int = 20000,
+    if_add_node_id: str = "yes",
+    if_add_node_summary: str = "yes",
+    if_add_doc_description: str = "no",
+    if_add_node_text: str = "no",
+):
+    """Index an uploaded PDF and return the PageIndex tree JSON.
+
+    The upload is written to a temporary file (page_index_main takes a
+    path), processed synchronously, and the temp file is always removed.
+    The "yes"/"no" string flags mirror run_pageindex.py's CLI options.
+    """
+    filename = file.filename or ""
+    if not filename.lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Only .pdf uploads are supported")
+
+    pdf_bytes = file.file.read()
+    if not pdf_bytes:
+        raise HTTPException(status_code=400, detail="Empty upload")
+
+    tmp_path = None
+    try:
+        # delete=False so the file survives the `with` block; removed in finally.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+            tmp.write(pdf_bytes)
+            tmp_path = tmp.name
+
+        opt = config(
+            model=model,
+            toc_check_page_num=toc_check_pages,
+            max_page_num_each_node=max_pages_per_node,
+            max_token_num_each_node=max_tokens_per_node,
+            if_add_node_id=if_add_node_id,
+            if_add_node_summary=if_add_node_summary,
+            if_add_doc_description=if_add_doc_description,
+            if_add_node_text=if_add_node_text,
+        )
+
+        return page_index_main(tmp_path, opt)
+    finally:
+        if tmp_path and os.path.exists(tmp_path):
+            os.remove(tmp_path)