diff --git a/requirements.txt b/requirements.txt
index e147ebfa..130edf32 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 click>=8.0.0
 fastapi-analytics
 fastapi[standard]
+pypdf
 python-dotenv
 slowapi
 starlette
diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py
index d8f57b71..e142838d 100644
--- a/src/gitingest/ingest_from_query.py
+++ b/src/gitingest/ingest_from_query.py
@@ -3,6 +3,7 @@
 from typing import Any
 
 import tiktoken
+from pypdf import PdfReader
 
 MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB
 MAX_DIRECTORY_DEPTH = 20  # Maximum depth of directory traversal
@@ -98,6 +99,23 @@ def _is_safe_symlink(symlink_path: str, base_path: str) -> bool:
         return False
 
 
+def _is_pdf_file(file_path: str) -> bool:
+    """
+    Check if the file is a PDF based on its extension.
+
+    Parameters
+    ----------
+    file_path : str
+        The path to the file to check.
+
+    Returns
+    -------
+    bool
+        `True` if the file is a PDF, `False` otherwise.
+    """
+    return file_path.lower().endswith(".pdf")
+
+
 def _is_text_file(file_path: str) -> bool:
     """
     Determine if a file is likely a text file based on its content.
@@ -124,13 +142,37 @@ def _is_text_file(file_path: str) -> bool:
         return False
 
 
+def _read_pdf_content(file_path: str) -> str:
+    """
+    Extract text from a PDF file.
+
+    Pages that yield no text are skipped; the rest are joined with newlines.
+
+    Parameters
+    ----------
+    file_path : str
+        The path to the PDF file.
+
+    Returns
+    -------
+    str
+        The extracted text from the PDF, or an error message if extraction fails.
+    """
+    try:
+        reader = PdfReader(file_path)
+        # Call `extract_text()` once per page; each call repeats the extraction work.
+        page_texts = (page.extract_text() for page in reader.pages)
+        return "\n".join(text for text in page_texts if text)
+    except Exception as e:
+        return f"Error reading PDF file: {str(e)}"
+
+
 def _read_file_content(file_path: str) -> str:
     """
     Reads the content of a file.
 
-    This function attempts to open a file and read its contents using UTF-8 encoding.
-    If an error occurs during reading (e.g., file is not found or permission error),
-    it returns an error message.
+    This function reads text files using UTF-8 encoding or extracts text from PDF files.
+    If an error occurs during reading, it returns an error message.
 
     Parameters
     ----------
@@ -142,6 +184,8 @@ def _read_file_content(file_path: str) -> str:
     str
         The content of the file, or an error message if the file could not be read.
     """
+    if _is_pdf_file(file_path):
+        return _read_pdf_content(file_path)
     try:
         with open(file_path, encoding="utf-8", errors="ignore") as f:
             return f.read()