Skip to content

Commit e6d0dc2

Browse files
Refactor ingestion logic to unify single-file and directory output, remove unused exceptions, and fix partial clone subpath handling.
- Consolidate `format_directory` and `format_single_file` into a single `format_node` function
- Remove unused exceptions (`MaxFilesReachedError`, `MaxFileSizeReachedError`, `AlreadyVisitedError`)
- Update partial clone logic to correctly handle single-file paths by stripping the filename from subpath when `blob` is True
- Improve docstrings and clean up code for better readability
1 parent ee8a351 commit e6d0dc2

File tree

5 files changed

+151
-198
lines changed

5 files changed

+151
-198
lines changed

src/gitingest/cloning.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -100,11 +100,12 @@ async def clone_repo(config: CloneConfig) -> None:
100100
checkout_cmd = ["git", "-C", local_path]
101101

102102
if partial_clone:
103+
subpath = config.subpath.lstrip("/")
103104
if config.blob:
104-
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name
105-
checkout_cmd += ["sparse-checkout", "set", Path(config.subpath.lstrip("/")).parent]
106-
else:
107-
checkout_cmd += ["sparse-checkout", "set", config.subpath.lstrip("/")]
105+
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
106+
subpath = str(Path(subpath).parent.as_posix())
107+
108+
checkout_cmd += ["sparse-checkout", "set", subpath]
108109

109110
if commit:
110111
checkout_cmd += ["checkout", commit]

src/gitingest/exceptions.py

Lines changed: 0 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -30,27 +30,6 @@ class AsyncTimeoutError(Exception):
3030
"""
3131

3232

33-
class MaxFilesReachedError(Exception):
34-
"""Exception raised when the maximum number of files is reached."""
35-
36-
def __init__(self, max_files: int) -> None:
37-
super().__init__(f"Maximum number of files ({max_files}) reached.")
38-
39-
40-
class MaxFileSizeReachedError(Exception):
41-
"""Exception raised when the maximum file size is reached."""
42-
43-
def __init__(self, max_size: int):
44-
super().__init__(f"Maximum file size limit ({max_size/1024/1024:.1f}MB) reached.")
45-
46-
47-
class AlreadyVisitedError(Exception):
48-
"""Exception raised when a symlink target has already been visited."""
49-
50-
def __init__(self, path: str) -> None:
51-
super().__init__(f"Symlink target already visited: {path}")
52-
53-
5433
class InvalidNotebookError(Exception):
5534
"""Exception raised when a Jupyter notebook is invalid or cannot be processed."""
5635

src/gitingest/filesystem_schema.py

Lines changed: 60 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -7,12 +7,11 @@
77
from enum import Enum, auto
88
from pathlib import Path
99

10-
from gitingest.exceptions import InvalidNotebookError
1110
from gitingest.utils.ingestion_utils import _get_encoding_list
1211
from gitingest.utils.notebook_utils import process_notebook
1312
from gitingest.utils.textfile_checker_utils import is_textfile
1413

15-
SEPARATOR = "=" * 48 + "\n"
14+
SEPARATOR = "=" * 48
1615

1716

1817
class FileSystemNodeType(Enum):
@@ -36,108 +35,104 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes
3635
"""
3736
Class representing a node in the file system (either a file or directory).
3837
39-
This class has more than the recommended number of attributes because it needs to
40-
track various properties of files and directories for comprehensive analysis.
38+
Tracks properties of files/directories for comprehensive analysis.
4139
"""
4240

4341
name: str
44-
type: FileSystemNodeType # e.g., "directory" or "file"
42+
type: FileSystemNodeType
4543
path_str: str
4644
path: Path
4745
size: int = 0
4846
file_count: int = 0
4947
dir_count: int = 0
5048
depth: int = 0
51-
children: list[FileSystemNode] = field(default_factory=list) # Using default_factory instead of empty list
49+
children: list[FileSystemNode] = field(default_factory=list)
5250

5351
def sort_children(self) -> None:
5452
"""
5553
Sort the children nodes of a directory according to a specific order.
5654
5755
Order of sorting:
58-
1. README.md first
59-
2. Regular files (not starting with dot)
60-
3. Hidden files (starting with dot)
61-
4. Regular directories (not starting with dot)
62-
5. Hidden directories (starting with dot)
63-
All groups are sorted alphanumerically within themselves.
64-
"""
65-
# Separate files and directories
66-
files = [child for child in self.children if child.type == FileSystemNodeType.FILE]
67-
directories = [child for child in self.children if child.type == FileSystemNodeType.DIRECTORY]
56+
2. Regular files (not starting with dot)
57+
3. Hidden files (starting with dot)
58+
4. Regular directories (not starting with dot)
59+
5. Hidden directories (starting with dot)
6860
69-
# Find README.md
70-
readme_files = [f for f in files if f.name.lower() == "readme.md"]
71-
other_files = [f for f in files if f.name.lower() != "readme.md"]
61+
All groups are sorted alphanumerically within themselves.
7262
73-
# Separate hidden and regular files/directories
74-
regular_files = [f for f in other_files if not f.name.startswith(".")]
75-
hidden_files = [f for f in other_files if f.name.startswith(".")]
76-
regular_dirs = [d for d in directories if not d.name.startswith(".")]
77-
hidden_dirs = [d for d in directories if d.name.startswith(".")]
63+
Raises
64+
------
65+
ValueError
66+
If the node is not a directory.
67+
"""
68+
if self.type != FileSystemNodeType.DIRECTORY:
69+
raise ValueError("Cannot sort children of a non-directory node")
7870

79-
# Sort each group alphanumerically
80-
regular_files.sort(key=lambda x: x.name)
81-
hidden_files.sort(key=lambda x: x.name)
82-
regular_dirs.sort(key=lambda x: x.name)
83-
hidden_dirs.sort(key=lambda x: x.name)
71+
def _sort_key(child: FileSystemNode) -> tuple[int, str]:
72+
# Groups: 0=README, 1=regular file, 2=hidden file, 3=regular dir, 4=hidden dir
73+
name = child.name.lower()
74+
if child.type == FileSystemNodeType.FILE:
75+
if name == "readme.md":
76+
return (0, name)
77+
return (1 if not name.startswith(".") else 2, name)
78+
return (3 if not name.startswith(".") else 4, name)
8479

85-
self.children = readme_files + regular_files + hidden_files + regular_dirs + hidden_dirs
80+
self.children.sort(key=_sort_key)
8681

8782
@property
8883
def content_string(self) -> str:
8984
"""
90-
Return the content of the node as a string.
91-
92-
This property returns the content of the node as a string, including the path and content.
85+
Return the content of the node as a string, including path and content.
9386
9487
Returns
9588
-------
9689
str
9790
A string representation of the node's content.
9891
"""
99-
content_repr = SEPARATOR
92+
parts = [
93+
SEPARATOR,
94+
f"File: {str(self.path_str).replace(os.sep, '/')}",
95+
SEPARATOR,
96+
f"{self.content}",
97+
]
10098

101-
# Use forward slashes in output paths
102-
content_repr += f"File: {str(self.path_str).replace(os.sep, '/')}\n"
103-
content_repr += SEPARATOR
104-
content_repr += f"{self.content}\n\n"
105-
return content_repr
99+
return "\n".join(parts) + "\n\n"
106100

107101
@property
108102
def content(self) -> str: # pylint: disable=too-many-return-statements
109103
"""
110-
Read the content of a file.
111-
112-
This function attempts to open a file and read its contents using UTF-8 encoding.
113-
If an error occurs during reading (e.g., file is not found or permission error),
114-
it returns an error message.
104+
Read the content of a file if it's text (or a notebook). Return an error message otherwise.
115105
116106
Returns
117107
-------
118108
str
119109
The content of the file, or an error message if the file could not be read.
110+
111+
Raises
112+
------
113+
ValueError
114+
If the node is a directory.
120115
"""
121-
if self.type == FileSystemNodeType.FILE and not is_textfile(self.path):
116+
if self.type == FileSystemNodeType.DIRECTORY:
117+
raise ValueError("Cannot read content of a directory node")
118+
119+
if not is_textfile(self.path):
122120
return "[Non-text file]"
123121

124-
try:
125-
if self.path.suffix == ".ipynb":
126-
try:
127-
return process_notebook(self.path)
128-
except Exception as exc:
129-
return f"Error processing notebook: {exc}"
130-
131-
for encoding in _get_encoding_list():
132-
try:
133-
with self.path.open(encoding=encoding) as f:
134-
return f.read()
135-
except UnicodeDecodeError:
136-
continue
137-
except OSError as exc:
138-
return f"Error reading file: {exc}"
139-
140-
return "Error: Unable to decode file with available encodings"
141-
142-
except (OSError, InvalidNotebookError) as exc:
143-
return f"Error reading file: {exc}"
122+
if self.path.suffix == ".ipynb":
123+
try:
124+
return process_notebook(self.path)
125+
except Exception as exc:
126+
return f"Error processing notebook: {exc}"
127+
128+
# Try multiple encodings
129+
for encoding in _get_encoding_list():
130+
try:
131+
with self.path.open(encoding=encoding) as f:
132+
return f.read()
133+
except UnicodeDecodeError:
134+
continue
135+
except OSError as exc:
136+
return f"Error reading file: {exc}"
137+
138+
return "Error: Unable to decode file with available encodings"

src/gitingest/ingestion.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66

77
from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES
88
from gitingest.filesystem_schema import FileSystemNode, FileSystemNodeType, FileSystemStats
9-
from gitingest.output_formatters import format_directory, format_single_file
9+
from gitingest.output_formatters import format_node
1010
from gitingest.query_parsing import ParsedQuery
1111
from gitingest.utils.ingestion_utils import _should_exclude, _should_include
1212
from gitingest.utils.path_utils import _is_safe_symlink
@@ -38,7 +38,7 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]:
3838
Raises
3939
------
4040
ValueError
41-
If the specified path cannot be found or if the file is not a text file.
41+
If the path cannot be found, is not a file, or the file has no content.
4242
"""
4343
subpath = Path(query.subpath.strip("/")).as_posix()
4444
path = query.local_path / subpath
@@ -63,7 +63,11 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]:
6363
path_str=str(relative_path),
6464
path=path,
6565
)
66-
return format_single_file(file_node, query)
66+
67+
if not file_node.content:
68+
raise ValueError(f"File {file_node.name} has no content")
69+
70+
return format_node(file_node, query)
6771

6872
root_node = FileSystemNode(
6973
name=path.name,
@@ -80,7 +84,7 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]:
8084
stats=stats,
8185
)
8286

83-
return format_directory(root_node, query)
87+
return format_node(root_node, query)
8488

8589

8690
def apply_gitingest_file(path: Path, query: ParsedQuery) -> None:

0 commit comments

Comments
 (0)