From 9b2c1d89f11df3d4d1c751921d1333b9d7a9256d Mon Sep 17 00:00:00 2001
From: Vyaas Valsaraj <vyaas.valsaraj99@gmail.com>
Date: Mon, 30 Dec 2024 20:52:18 +0530
Subject: [PATCH 1/5] Update ingest_from_query.py

Modified to read PDF files
---
 src/gitingest/ingest_from_query.py | 42 +++++++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py
index d8f57b71..6b112838 100644
--- a/src/gitingest/ingest_from_query.py
+++ b/src/gitingest/ingest_from_query.py
@@ -1,6 +1,7 @@
 import os
 from fnmatch import fnmatch
 from typing import Any
+from PyPDF2 import PdfReader
 
 import tiktoken
 
@@ -97,6 +98,21 @@ def _is_safe_symlink(symlink_path: str, base_path: str) -> bool:
         # If there's any error resolving the paths, consider it unsafe
         return False
 
+def _is_pdf_file(file_path: str) -> bool:
+    """
+    Check if the file is a PDF based on its extension.
+
+    Parameters
+    ----------
+    file_path : str
+        The path to check; only the string is examined, the file is never opened.
+
+    Returns
+    -------
+    bool
+        `True` if the path ends with `.pdf` (compared case-insensitively), `False` otherwise.
+    """
+    return file_path.lower().endswith(".pdf")
 
 def _is_text_file(file_path: str) -> bool:
     """
@@ -123,14 +139,32 @@ def _is_text_file(file_path: str) -> bool:
     except OSError:
         return False
 
+def _read_pdf_content(file_path: str) -> str:
+    """
+    Extract text from a PDF file.
+
+    Parameters
+    ----------
+    file_path : str
+        The path to the PDF file.
+
+    Returns
+    -------
+    str
+        Newline-joined text of the pages that yield any, or an error message on failure.
+    """
+    try:
+        reader = PdfReader(file_path)
+        return "\n".join(filter(None, (page.extract_text() for page in reader.pages)))
+    except Exception as e:
+        return f"Error reading PDF file: {str(e)}"
 
 def _read_file_content(file_path: str) -> str:
     """
     Reads the content of a file.
 
-    This function attempts to open a file and read its contents using UTF-8 encoding.
-    If an error occurs during reading (e.g., file is not found or permission error),
-    it returns an error message.
+    This function reads text files using UTF-8 encoding or extracts text from PDF files.
+    If an error occurs during reading, it returns an error message.
 
     Parameters
     ----------
@@ -142,6 +176,8 @@ def _read_file_content(file_path: str) -> str:
     str
         The content of the file, or an error message if the file could not be read.
     """
+    if _is_pdf_file(file_path):
+        return _read_pdf_content(file_path)
     try:
         with open(file_path, encoding="utf-8", errors="ignore") as f:
             return f.read()

From 6f63da6c8345884f4dd1344ef82b6c04d84c5845 Mon Sep 17 00:00:00 2001
From: Vyaas Valsaraj <vyaas.valsaraj99@gmail.com>
Date: Mon, 30 Dec 2024 20:57:05 +0530
Subject: [PATCH 2/5] Update requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index e147ebfa..2004da0c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ slowapi
 starlette
 tiktoken
 uvicorn
+PyPDF2

From 70eb824d3d16f842ba12ac29da336abb62e48f35 Mon Sep 17 00:00:00 2001
From: Vyaas Valsaraj <vyaas.valsaraj99@gmail.com>
Date: Tue, 31 Dec 2024 22:23:33 +0530
Subject: [PATCH 3/5] Update requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 2004da0c..6eb28b5e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,4 @@ slowapi
 starlette
 tiktoken
 uvicorn
-PyPDF2
+pypdf

From 5250f805be47c2997aa39eca6303c6d87e403e76 Mon Sep 17 00:00:00 2001
From: Vyaas Valsaraj <vyaas.valsaraj99@gmail.com>
Date: Tue, 31 Dec 2024 22:23:56 +0530
Subject: [PATCH 4/5] Update ingest_from_query.py

---
 src/gitingest/ingest_from_query.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py
index 6b112838..49f6e6e9 100644
--- a/src/gitingest/ingest_from_query.py
+++ b/src/gitingest/ingest_from_query.py
@@ -1,7 +1,7 @@
 import os
 from fnmatch import fnmatch
 from typing import Any
-from PyPDF2 import PdfReader
+from pypdf import PdfReader
 
 import tiktoken
 

From 7beef71bb7fcbab98575400fd485e5c631a08bfe Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Tue, 31 Dec 2024 18:50:22 +0100
Subject: [PATCH 5/5] Ran pre-commit on Vyaas99's branch for CI to pass

---
 requirements.txt                   | 2 +-
 src/gitingest/ingest_from_query.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 6eb28b5e..130edf32 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
 click>=8.0.0
 fastapi-analytics
 fastapi[standard]
+pypdf
 python-dotenv
 slowapi
 starlette
 tiktoken
 uvicorn
-pypdf
diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py
index 49f6e6e9..e142838d 100644
--- a/src/gitingest/ingest_from_query.py
+++ b/src/gitingest/ingest_from_query.py
@@ -1,9 +1,9 @@
 import os
 from fnmatch import fnmatch
 from typing import Any
-from pypdf import PdfReader
 
 import tiktoken
+from pypdf import PdfReader
 
 MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB
 MAX_DIRECTORY_DEPTH = 20  # Maximum depth of directory traversal
@@ -98,6 +98,7 @@ def _is_safe_symlink(symlink_path: str, base_path: str) -> bool:
         # If there's any error resolving the paths, consider it unsafe
         return False
 
+
 def _is_pdf_file(file_path: str) -> bool:
     """
     Check if the file is a PDF based on its extension.
@@ -114,6 +115,7 @@ def _is_pdf_file(file_path: str) -> bool:
     """
     return file_path.lower().endswith(".pdf")
 
+
 def _is_text_file(file_path: str) -> bool:
     """
     Determine if a file is likely a text file based on its content.
@@ -139,6 +141,7 @@ def _is_text_file(file_path: str) -> bool:
     except OSError:
         return False
 
+
 def _read_pdf_content(file_path: str) -> str:
     """
     Extract text from a PDF file.
@@ -159,6 +162,7 @@ def _read_pdf_content(file_path: str) -> str:
     except Exception as e:
         return f"Error reading PDF file: {str(e)}"
 
+
 def _read_file_content(file_path: str) -> str:
     """
     Reads the content of a file.