From 9b2c1d89f11df3d4d1c751921d1333b9d7a9256d Mon Sep 17 00:00:00 2001 From: Vyaas Valsaraj Date: Mon, 30 Dec 2024 20:52:18 +0530 Subject: [PATCH 1/5] Update ingest_from_query.py Modified to read PDF files --- src/gitingest/ingest_from_query.py | 42 +++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index d8f57b71..6b112838 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -1,6 +1,7 @@ import os from fnmatch import fnmatch from typing import Any +from PyPDF2 import PdfReader import tiktoken @@ -97,6 +98,21 @@ def _is_safe_symlink(symlink_path: str, base_path: str) -> bool: # If there's any error resolving the paths, consider it unsafe return False +def _is_pdf_file(file_path: str) -> bool: + """ + Check if the file is a PDF based on its extension. + + Parameters + ---------- + file_path : str + The path to the file to check. + + Returns + ------- + bool + `True` if the file is a PDF, `False` otherwise. + """ + return file_path.lower().endswith(".pdf") def _is_text_file(file_path: str) -> bool: """ @@ -123,14 +139,32 @@ def _is_text_file(file_path: str) -> bool: except OSError: return False +def _read_pdf_content(file_path: str) -> str: + """ + Extract text from a PDF file. + + Parameters + ---------- + file_path : str + The path to the PDF file. + + Returns + ------- + str + The extracted text from the PDF, or an error message if extraction fails. + """ + try: + reader = PdfReader(file_path) + return "\n".join(page.extract_text() for page in reader.pages if page.extract_text()) + except Exception as e: + return f"Error reading PDF file: {str(e)}" def _read_file_content(file_path: str) -> str: """ Reads the content of a file. - This function attempts to open a file and read its contents using UTF-8 encoding. - If an error occurs during reading (e.g., file is not found or permission error), - it returns an error message. + This function reads text files using UTF-8 encoding or extracts text from PDF files. + If an error occurs during reading, it returns an error message. Parameters ---------- @@ -142,6 +176,8 @@ def _read_file_content(file_path: str) -> str: str The content of the file, or an error message if the file could not be read. """ + if _is_pdf_file(file_path): + return _read_pdf_content(file_path) try: with open(file_path, encoding="utf-8", errors="ignore") as f: return f.read() From 6f63da6c8345884f4dd1344ef82b6c04d84c5845 Mon Sep 17 00:00:00 2001 From: Vyaas Valsaraj Date: Mon, 30 Dec 2024 20:57:05 +0530 Subject: [PATCH 2/5] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index e147ebfa..2004da0c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ slowapi starlette tiktoken uvicorn +PyPDF2 From 70eb824d3d16f842ba12ac29da336abb62e48f35 Mon Sep 17 00:00:00 2001 From: Vyaas Valsaraj Date: Tue, 31 Dec 2024 22:23:33 +0530 Subject: [PATCH 3/5] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2004da0c..6eb28b5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,4 @@ slowapi starlette tiktoken uvicorn -PyPDF2 +pypdf From 5250f805be47c2997aa39eca6303c6d87e403e76 Mon Sep 17 00:00:00 2001 From: Vyaas Valsaraj Date: Tue, 31 Dec 2024 22:23:56 +0530 Subject: [PATCH 4/5] Update ingest_from_query.py --- src/gitingest/ingest_from_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 6b112838..49f6e6e9 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -1,7 +1,7 @@ import os from fnmatch import fnmatch from typing import Any -from PyPDF2 import PdfReader +from pypdf import PdfReader import tiktoken From 7beef71bb7fcbab98575400fd485e5c631a08bfe Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Tue, 31 Dec 2024 18:50:22 +0100 Subject: [PATCH 5/5] Ran pre-commit on Vyaas99's branch for CI to pass --- requirements.txt | 2 +- src/gitingest/ingest_from_query.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6eb28b5e..130edf32 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ click>=8.0.0 fastapi-analytics fastapi[standard] +pypdf python-dotenv slowapi starlette tiktoken uvicorn -pypdf diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 49f6e6e9..e142838d 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -1,9 +1,9 @@ import os from fnmatch import fnmatch from typing import Any -from pypdf import PdfReader import tiktoken +from pypdf import PdfReader MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal @@ -98,6 +98,7 @@ def _is_safe_symlink(symlink_path: str, base_path: str) -> bool: # If there's any error resolving the paths, consider it unsafe return False + def _is_pdf_file(file_path: str) -> bool: """ Check if the file is a PDF based on its extension. @@ -114,6 +115,7 @@ def _is_pdf_file(file_path: str) -> bool: """ return file_path.lower().endswith(".pdf") + def _is_text_file(file_path: str) -> bool: """ Determine if a file is likely a text file based on its content. @@ -139,6 +141,7 @@ def _is_text_file(file_path: str) -> bool: except OSError: return False + def _read_pdf_content(file_path: str) -> str: """ Extract text from a PDF file. @@ -159,6 +162,7 @@ def _read_pdf_content(file_path: str) -> str: except Exception as e: return f"Error reading PDF file: {str(e)}" + def _read_file_content(file_path: str) -> str: """ Reads the content of a file.