Skip to content

Commit 1bda9f6

Browse files
fix: split sparse-checkout & commit checkout when cloning; refresh docs/CLI
* Run `git sparse-checkout set …` and `git checkout <sha>` as two calls—matches Git’s CLI rules and fixes failures.
* Tidy clone path creation via `_ensure_directory`; use `DEFAULT_TIMEOUT`.
* Clarify CLI/help strings and schema docstrings.
* Update tests for the new two-step checkout flow.
1 parent 789be9b commit 1bda9f6

File tree

5 files changed

+89
-51
lines changed

5 files changed

+89
-51
lines changed

src/gitingest/cli.py

Lines changed: 45 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -29,17 +29,19 @@
2929
"--exclude-pattern",
3030
"-e",
3131
multiple=True,
32-
help="""Patterns to exclude. Handles python's arbitrary subset of Unix
33-
shell-style wildcards. See:
34-
https://docs.python.org/3/library/fnmatch.html""",
32+
help=(
33+
"Patterns to exclude. Handles Python's arbitrary subset of Unix shell-style "
34+
"wildcards. See: https://docs.python.org/3/library/fnmatch.html"
35+
),
3536
)
3637
@click.option(
3738
"--include-pattern",
3839
"-i",
3940
multiple=True,
40-
help="""Patterns to include. Handles python's arbitrary subset of Unix
41-
shell-style wildcards. See:
42-
https://docs.python.org/3/library/fnmatch.html""",
41+
help=(
42+
"Patterns to include. Handles Python's arbitrary subset of Unix shell-style "
43+
"wildcards. See: https://docs.python.org/3/library/fnmatch.html"
44+
),
4345
)
4446
@click.option("--branch", "-b", default=None, help="Branch to clone and ingest")
4547
def main(
@@ -58,21 +60,29 @@ def main(
5860
Parameters
5961
----------
6062
source : str
61-
The source directory or repository to analyze.
63+
A directory path or a Git repository URL.
6264
output : str, optional
63-
The path where the output file will be written. If not specified, the output will be written
64-
to a file named `<repo_name>.txt` in the current directory.
65+
Output file path. Defaults to `<repo_name>.txt`.
6566
max_size : int
66-
The maximum file size to process, in bytes. Files larger than this size will be ignored.
67+
Maximum file size (in bytes) to consider.
6768
exclude_pattern : Tuple[str, ...]
68-
A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored.
69+
Glob patterns for pruning the file set.
6970
include_pattern : Tuple[str, ...]
70-
A tuple of patterns to include during the analysis. Only files matching these patterns will be processed.
71+
Glob patterns for including files in the output.
7172
branch : str, optional
72-
The branch to clone (optional).
73+
Specific branch to ingest (defaults to the repository's default).
7374
"""
74-
# Main entry point for the CLI. This function is called when the CLI is run as a script.
75-
asyncio.run(_async_main(source, output, max_size, exclude_pattern, include_pattern, branch))
75+
76+
asyncio.run(
77+
_async_main(
78+
source=source,
79+
output=output,
80+
max_size=max_size,
81+
exclude_pattern=exclude_pattern,
82+
include_pattern=include_pattern,
83+
branch=branch,
84+
)
85+
)
7686

7787

7888
async def _async_main(
@@ -92,40 +102,49 @@ async def _async_main(
92102
Parameters
93103
----------
94104
source : str
95-
The source directory or repository to analyze.
105+
A directory path or a Git repository URL.
96106
output : str, optional
97-
The path where the output file will be written. If not specified, the output will be written
98-
to a file named `<repo_name>.txt` in the current directory.
107+
Output file path. Defaults to `<repo_name>.txt`.
99108
max_size : int
100-
The maximum file size to process, in bytes. Files larger than this size will be ignored.
109+
Maximum file size (in bytes) to consider.
101110
exclude_pattern : Tuple[str, ...]
102-
A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored.
111+
Glob patterns for pruning the file set.
103112
include_pattern : Tuple[str, ...]
104-
A tuple of patterns to include during the analysis. Only files matching these patterns will be processed.
113+
Glob patterns for including files in the output.
105114
branch : str, optional
106-
The branch to clone (optional).
115+
Specific branch to ingest (defaults to the repository's default).
107116
108117
Raises
109118
------
110119
Abort
111120
If there is an error during the execution of the command, this exception is raised to abort the process.
112121
"""
113122
try:
114-
# Combine default and custom ignore patterns
123+
# Normalise pattern containers (the ingest layer expects sets)
115124
exclude_patterns = set(exclude_pattern)
116125
include_patterns = set(include_pattern)
117126

118-
if not output:
127+
# Choose a default output path if none provided
128+
if output is None:
119129
output = OUTPUT_FILE_NAME
120-
summary, _, _ = await ingest_async(source, max_size, include_patterns, exclude_patterns, branch, output=output)
130+
131+
summary, _, _ = await ingest_async(
132+
source=source,
133+
max_file_size=max_size,
134+
include_patterns=include_patterns,
135+
exclude_patterns=exclude_patterns,
136+
branch=branch,
137+
output=output,
138+
)
121139

122140
click.echo(f"Analysis complete! Output written to: {output}")
123141
click.echo("\nSummary:")
124142
click.echo(summary)
125143

126144
except Exception as exc:
145+
# Convert any exception into Click.Abort so that exit status is non-zero
127146
click.echo(f"Error: {exc}", err=True)
128-
raise click.Abort()
147+
raise click.Abort() from exc
129148

130149

131150
if __name__ == "__main__":

src/gitingest/cloning.py

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,13 @@
44
from pathlib import Path
55
from typing import Optional
66

7+
from gitingest.config import DEFAULT_TIMEOUT
78
from gitingest.schemas import CloneConfig
89
from gitingest.utils.git_utils import check_repo_exists, ensure_git_installed, run_command
910
from gitingest.utils.timeout_wrapper import async_timeout
1011

11-
TIMEOUT: int = 60
1212

13-
14-
@async_timeout(TIMEOUT)
13+
@async_timeout(DEFAULT_TIMEOUT)
1514
async def clone_repo(config: CloneConfig) -> None:
1615
"""
1716
Clone a repository to a local path based on the provided configuration.
@@ -29,8 +28,6 @@ async def clone_repo(config: CloneConfig) -> None:
2928
------
3029
ValueError
3130
If the repository is not found or if the provided URL is invalid.
32-
OSError
33-
If an error occurs while creating the parent directory for the repository.
3431
"""
3532
# Extract and validate query parameters
3633
url: str = config.url
@@ -40,18 +37,14 @@ async def clone_repo(config: CloneConfig) -> None:
4037
partial_clone: bool = config.subpath != "/"
4138

4239
# Create parent directory if it doesn't exist
43-
parent_dir = Path(local_path).parent
44-
try:
45-
os.makedirs(parent_dir, exist_ok=True)
46-
except OSError as exc:
47-
raise OSError(f"Failed to create parent directory {parent_dir}: {exc}") from exc
40+
await _ensure_directory(Path(local_path).parent)
4841

4942
# Check if the repository exists
5043
if not await check_repo_exists(url):
5144
raise ValueError("Repository not found, make sure it is public")
5245

5346
clone_cmd = ["git", "clone", "--single-branch"]
54-
# TODO re-enable --recurse-submodules
47+
# TODO: Re-enable --recurse-submodules when submodule support is needed
5548

5649
if partial_clone:
5750
clone_cmd += ["--filter=blob:none", "--sparse"]
@@ -67,19 +60,35 @@ async def clone_repo(config: CloneConfig) -> None:
6760
await ensure_git_installed()
6861
await run_command(*clone_cmd)
6962

70-
if commit or partial_clone:
71-
checkout_cmd = ["git", "-C", local_path]
63+
# Checkout the subpath if it is a partial clone
64+
if partial_clone:
65+
subpath = config.subpath.lstrip("/")
66+
if config.blob:
67+
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
68+
subpath = str(Path(subpath).parent.as_posix())
69+
70+
await run_command("git", "-C", local_path, "sparse-checkout", "set", subpath)
7271

73-
if partial_clone:
74-
subpath = config.subpath.lstrip("/")
75-
if config.blob:
76-
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
77-
subpath = str(Path(subpath).parent.as_posix())
72+
# Checkout the commit if it is provided
73+
if commit:
74+
await run_command("git", "-C", local_path, "checkout", commit)
7875

79-
checkout_cmd += ["sparse-checkout", "set", subpath]
8076

81-
if commit:
82-
checkout_cmd += ["checkout", commit]
77+
async def _ensure_directory(path: Path) -> None:
78+
"""
79+
Ensure the directory exists, creating it if necessary.
80+
81+
Parameters
82+
----------
83+
path : Path
84+
The path to ensure exists
8385
84-
# Check out the specific commit and/or subpath
85-
await run_command(*checkout_cmd)
86+
Raises
87+
------
88+
OSError
89+
If the directory cannot be created
90+
"""
91+
try:
92+
os.makedirs(path, exist_ok=True)
93+
except OSError as exc:
94+
raise OSError(f"Failed to create directory {path}: {exc}") from exc

src/gitingest/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal
88
MAX_FILES = 10_000 # Maximum number of files to process
99
MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB
10+
DEFAULT_TIMEOUT = 60 # seconds
1011

1112
OUTPUT_FILE_NAME = "digest.txt"
1213

src/gitingest/schemas/ingestion_schema.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ class CloneConfig:
2929
The branch to clone (default is None).
3030
subpath : str
3131
The subpath to clone from the repository (default is "/").
32+
blob: bool
33+
Whether the repository is a blob (default is False).
3234
"""
3335

3436
url: str

tests/test_repository_clone.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -435,16 +435,23 @@ async def test_clone_with_commit_and_subpath() -> None:
435435
clone_config.local_path,
436436
)
437437

438-
# Verify the sparse-checkout command sets the correct path
438+
# Verify sparse-checkout set
439439
mock_exec.assert_any_call(
440440
"git",
441441
"-C",
442442
clone_config.local_path,
443443
"sparse-checkout",
444444
"set",
445445
"src/docs",
446+
)
447+
448+
# Verify checkout commit
449+
mock_exec.assert_any_call(
450+
"git",
451+
"-C",
452+
clone_config.local_path,
446453
"checkout",
447454
clone_config.commit,
448455
)
449456

450-
assert mock_exec.call_count == 2
457+
assert mock_exec.call_count == 3

0 commit comments

Comments
 (0)