Skip to content

Commit 5fa0f41

Browse files
authored
Merge branch 'main' into feat/api
2 parents 24af730 + b098bb4 commit 5fa0f41

16 files changed

+431
-470
lines changed

src/gitingest/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
""" Gitingest: A package for ingesting data from Git repositories. """
22

3-
from gitingest.cloning import clone_repo
3+
from gitingest.cloning import clone
4+
from gitingest.entrypoint import ingest, ingest_async
45
from gitingest.ingestion import ingest_query
56
from gitingest.query_parsing import parse_query
6-
from gitingest.repository_ingest import ingest, ingest_async
77

8-
__all__ = ["ingest_query", "clone_repo", "parse_query", "ingest", "ingest_async"]
8+
__all__ = ["ingest_query", "clone", "parse_query", "ingest", "ingest_async"]

src/gitingest/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import click
99

1010
from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME
11-
from gitingest.repository_ingest import ingest_async
11+
from gitingest.entrypoint import ingest_async
1212

1313

1414
@click.command()

src/gitingest/cloning.py

Lines changed: 7 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,17 @@
22

33
import asyncio
44
import os
5-
from dataclasses import dataclass
65
from pathlib import Path
76
from typing import List, Optional, Tuple
87

8+
from gitingest.ingestion_schema import CloneConfig
99
from gitingest.utils.timeout_wrapper import async_timeout
1010

1111
TIMEOUT: int = 60
1212

1313

14-
@dataclass
15-
class CloneConfig:
16-
"""
17-
Configuration for cloning a Git repository.
18-
19-
This class holds the necessary parameters for cloning a repository to a local path, including
20-
the repository's URL, the target local path, and optional parameters for a specific commit or branch.
21-
22-
Attributes
23-
----------
24-
url : str
25-
The URL of the Git repository to clone.
26-
local_path : str
27-
The local directory where the repository will be cloned.
28-
commit : str, optional
29-
The specific commit hash to check out after cloning (default is None).
30-
branch : str, optional
31-
The branch to clone (default is None).
32-
subpath : str
33-
The subpath to clone from the repository (default is "/").
34-
"""
35-
36-
url: str
37-
local_path: str
38-
commit: Optional[str] = None
39-
branch: Optional[str] = None
40-
subpath: str = "/"
41-
blob: bool = False
42-
43-
4414
@async_timeout(TIMEOUT)
45-
async def clone_repo(config: CloneConfig) -> None:
15+
async def clone(config: CloneConfig) -> None:
4616
"""
4717
Clone a repository to a local path based on the provided configuration.
4818
@@ -100,11 +70,12 @@ async def clone_repo(config: CloneConfig) -> None:
10070
checkout_cmd = ["git", "-C", local_path]
10171

10272
if partial_clone:
73+
subpath = config.subpath.lstrip("/")
10374
if config.blob:
104-
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name
105-
checkout_cmd += ["sparse-checkout", "set", Path(config.subpath.lstrip("/")).parent]
106-
else:
107-
checkout_cmd += ["sparse-checkout", "set", config.subpath.lstrip("/")]
75+
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
76+
subpath = str(Path(subpath).parent.as_posix())
77+
78+
checkout_cmd += ["sparse-checkout", "set", subpath]
10879

10980
if commit:
11081
checkout_cmd += ["checkout", commit]

src/gitingest/entrypoint.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
import shutil
66
from typing import Optional, Set, Tuple, Union
77

8-
from gitingest.cloning import clone_repo
8+
from gitingest.cloning import clone
99
from gitingest.config import TMP_BASE_PATH
1010
from gitingest.ingestion import ingest_query
11-
from gitingest.query_parsing import ParsedQuery, parse_query
11+
from gitingest.query_parsing import IngestionQuery, parse_query
1212

1313

1414
async def ingest_async(
@@ -53,37 +53,37 @@ async def ingest_async(
5353
Raises
5454
------
5555
TypeError
56-
If `clone_repo` does not return a coroutine, or if the `source` is of an unsupported type.
56+
If `clone` does not return a coroutine, or if the `source` is of an unsupported type.
5757
"""
5858
repo_cloned = False
5959

6060
try:
61-
parsed_query: ParsedQuery = await parse_query(
61+
query: IngestionQuery = await parse_query(
6262
source=source,
6363
max_file_size=max_file_size,
6464
from_web=False,
6565
include_patterns=include_patterns,
6666
ignore_patterns=exclude_patterns,
6767
)
6868

69-
if parsed_query.url:
70-
selected_branch = branch if branch else parsed_query.branch # prioritize branch argument
71-
parsed_query.branch = selected_branch
69+
if query.url:
70+
selected_branch = branch if branch else query.branch # prioritize branch argument
71+
query.branch = selected_branch
7272

73-
clone_config = parsed_query.extact_clone_config()
74-
clone_coroutine = clone_repo(clone_config)
73+
clone_config = query.extract_clone_config()
74+
clone_coroutine = clone(clone_config)
7575

7676
if inspect.iscoroutine(clone_coroutine):
7777
if asyncio.get_event_loop().is_running():
7878
await clone_coroutine
7979
else:
8080
asyncio.run(clone_coroutine)
8181
else:
82-
raise TypeError("clone_repo did not return a coroutine as expected.")
82+
raise TypeError("clone did not return a coroutine as expected.")
8383

8484
repo_cloned = True
8585

86-
summary, tree, content = ingest_query(parsed_query)
86+
summary, tree, content = ingest_query(query)
8787

8888
if output is not None:
8989
with open(output, "w", encoding="utf-8") as f:

src/gitingest/exceptions.py

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -30,27 +30,6 @@ class AsyncTimeoutError(Exception):
3030
"""
3131

3232

33-
class MaxFilesReachedError(Exception):
34-
"""Exception raised when the maximum number of files is reached."""
35-
36-
def __init__(self, max_files: int) -> None:
37-
super().__init__(f"Maximum number of files ({max_files}) reached.")
38-
39-
40-
class MaxFileSizeReachedError(Exception):
41-
"""Exception raised when the maximum file size is reached."""
42-
43-
def __init__(self, max_size: int):
44-
super().__init__(f"Maximum file size limit ({max_size/1024/1024:.1f}MB) reached.")
45-
46-
47-
class AlreadyVisitedError(Exception):
48-
"""Exception raised when a symlink target has already been visited."""
49-
50-
def __init__(self, path: str) -> None:
51-
super().__init__(f"Symlink target already visited: {path}")
52-
53-
5433
class InvalidNotebookError(Exception):
5534
"""Exception raised when a Jupyter notebook is invalid or cannot be processed."""
5635

src/gitingest/filesystem_schema.py

Lines changed: 61 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,11 @@
77
from enum import Enum, auto
88
from pathlib import Path
99

10-
from gitingest.exceptions import InvalidNotebookError
1110
from gitingest.utils.ingestion_utils import _get_encoding_list
1211
from gitingest.utils.notebook_utils import process_notebook
1312
from gitingest.utils.textfile_checker_utils import is_textfile
1413

15-
SEPARATOR = "=" * 48 + "\n"
14+
SEPARATOR = "=" * 48  # Tiktoken, the tokenizer OpenAI uses, counts 2 tokens if the run of "=" is longer than 48 characters
1615

1716

1817
class FileSystemNodeType(Enum):
@@ -36,108 +35,105 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes
3635
"""
3736
Class representing a node in the file system (either a file or directory).
3837
39-
This class has more than the recommended number of attributes because it needs to
40-
track various properties of files and directories for comprehensive analysis.
38+
Tracks properties of files/directories for comprehensive analysis.
4139
"""
4240

4341
name: str
44-
type: FileSystemNodeType # e.g., "directory" or "file"
42+
type: FileSystemNodeType
4543
path_str: str
4644
path: Path
4745
size: int = 0
4846
file_count: int = 0
4947
dir_count: int = 0
5048
depth: int = 0
51-
children: list[FileSystemNode] = field(default_factory=list) # Using default_factory instead of empty list
49+
children: list[FileSystemNode] = field(default_factory=list)
5250

5351
def sort_children(self) -> None:
5452
"""
5553
Sort the children nodes of a directory according to a specific order.
5654
5755
Order of sorting:
58-
1. README.md first
59-
2. Regular files (not starting with dot)
60-
3. Hidden files (starting with dot)
61-
4. Regular directories (not starting with dot)
62-
5. Hidden directories (starting with dot)
63-
All groups are sorted alphanumerically within themselves.
64-
"""
65-
# Separate files and directories
66-
files = [child for child in self.children if child.type == FileSystemNodeType.FILE]
67-
directories = [child for child in self.children if child.type == FileSystemNodeType.DIRECTORY]
56+
1. README.md first
2. Regular files (not starting with dot)
57+
3. Hidden files (starting with dot)
58+
4. Regular directories (not starting with dot)
59+
5. Hidden directories (starting with dot)
6860
69-
# Find README.md
70-
readme_files = [f for f in files if f.name.lower() == "readme.md"]
71-
other_files = [f for f in files if f.name.lower() != "readme.md"]
61+
All groups are sorted alphanumerically within themselves.
7262
73-
# Separate hidden and regular files/directories
74-
regular_files = [f for f in other_files if not f.name.startswith(".")]
75-
hidden_files = [f for f in other_files if f.name.startswith(".")]
76-
regular_dirs = [d for d in directories if not d.name.startswith(".")]
77-
hidden_dirs = [d for d in directories if d.name.startswith(".")]
63+
Raises
64+
------
65+
ValueError
66+
If the node is not a directory.
67+
"""
68+
if self.type != FileSystemNodeType.DIRECTORY:
69+
raise ValueError("Cannot sort children of a non-directory node")
7870

79-
# Sort each group alphanumerically
80-
regular_files.sort(key=lambda x: x.name)
81-
hidden_files.sort(key=lambda x: x.name)
82-
regular_dirs.sort(key=lambda x: x.name)
83-
hidden_dirs.sort(key=lambda x: x.name)
71+
def _sort_key(child: FileSystemNode) -> tuple[int, str]:
72+
# Return the (group, name) sort key for a child; lower group numbers sort first.
73+
# Groups: 0=README, 1=regular file, 2=hidden file, 3=regular dir, 4=hidden dir
74+
name = child.name.lower()
75+
if child.type == FileSystemNodeType.FILE:
76+
if name == "readme.md":
77+
return (0, name)
78+
return (1 if not name.startswith(".") else 2, name)
79+
return (3 if not name.startswith(".") else 4, name)
8480

85-
self.children = readme_files + regular_files + hidden_files + regular_dirs + hidden_dirs
81+
self.children.sort(key=_sort_key)
8682

8783
@property
8884
def content_string(self) -> str:
8985
"""
90-
Return the content of the node as a string.
91-
92-
This property returns the content of the node as a string, including the path and content.
86+
Return the content of the node as a string, including path and content.
9387
9488
Returns
9589
-------
9690
str
9791
A string representation of the node's content.
9892
"""
99-
content_repr = SEPARATOR
93+
parts = [
94+
SEPARATOR,
95+
f"File: {str(self.path_str).replace(os.sep, '/')}",
96+
SEPARATOR,
97+
f"{self.content}",
98+
]
10099

101-
# Use forward slashes in output paths
102-
content_repr += f"File: {str(self.path_str).replace(os.sep, '/')}\n"
103-
content_repr += SEPARATOR
104-
content_repr += f"{self.content}\n\n"
105-
return content_repr
100+
return "\n".join(parts) + "\n\n"
106101

107102
@property
108103
def content(self) -> str: # pylint: disable=too-many-return-statements
109104
"""
110-
Read the content of a file.
111-
112-
This function attempts to open a file and read its contents using UTF-8 encoding.
113-
If an error occurs during reading (e.g., file is not found or permission error),
114-
it returns an error message.
105+
Read the content of a file if it's text (or a notebook). Return an error message otherwise.
115106
116107
Returns
117108
-------
118109
str
119110
The content of the file, or an error message if the file could not be read.
111+
112+
Raises
113+
------
114+
ValueError
115+
If the node is a directory.
120116
"""
121-
if self.type == FileSystemNodeType.FILE and not is_textfile(self.path):
117+
if self.type == FileSystemNodeType.DIRECTORY:
118+
raise ValueError("Cannot read content of a directory node")
119+
120+
if not is_textfile(self.path):
122121
return "[Non-text file]"
123122

124-
try:
125-
if self.path.suffix == ".ipynb":
126-
try:
127-
return process_notebook(self.path)
128-
except Exception as exc:
129-
return f"Error processing notebook: {exc}"
130-
131-
for encoding in _get_encoding_list():
132-
try:
133-
with self.path.open(encoding=encoding) as f:
134-
return f.read()
135-
except UnicodeDecodeError:
136-
continue
137-
except OSError as exc:
138-
return f"Error reading file: {exc}"
139-
140-
return "Error: Unable to decode file with available encodings"
141-
142-
except (OSError, InvalidNotebookError) as exc:
143-
return f"Error reading file: {exc}"
123+
if self.path.suffix == ".ipynb":
124+
try:
125+
return process_notebook(self.path)
126+
except Exception as exc:
127+
return f"Error processing notebook: {exc}"
128+
129+
# Try multiple encodings
130+
for encoding in _get_encoding_list():
131+
try:
132+
with self.path.open(encoding=encoding) as f:
133+
return f.read()
134+
except UnicodeDecodeError:
135+
continue
136+
except OSError as exc:
137+
return f"Error reading file: {exc}"
138+
139+
return "Error: Unable to decode file with available encodings"

0 commit comments

Comments
 (0)