Skip to content

Commit 9231552

Browse files
committed
Changes from background agent bc-f5518da9-7338-4c81-bea6-10cf1734d81b
1 parent 38e52cd commit 9231552

File tree

5 files changed

+208
-193
lines changed

5 files changed

+208
-193
lines changed

=3.1.0

Whitespace-only changes.

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
boto3>=1.28.0 # AWS SDK for S3 support
22
click>=8.0.0
33
fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38
4+
gitpython>=3.1.0 # Git operations via Python instead of system calls
45
httpx
56
loguru>=0.7.0
67
pathspec>=0.12.1

src/gitingest/clone.py

Lines changed: 64 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,20 @@
22

33
from __future__ import annotations
44

5+
import asyncio
56
from pathlib import Path
67
from typing import TYPE_CHECKING
78

9+
from git import Repo
810
from gitingest.config import DEFAULT_TIMEOUT
911
from gitingest.utils.git_utils import (
1012
check_repo_exists,
1113
checkout_partial_clone,
1214
create_git_auth_header,
13-
create_git_command,
15+
create_git_command_with_auth,
1416
ensure_git_installed,
1517
is_github_host,
1618
resolve_commit,
17-
run_command,
1819
)
1920
from gitingest.utils.logging_config import get_logger
2021
from gitingest.utils.os_utils import ensure_directory_exists_or_create
@@ -83,41 +84,73 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
8384
commit = await resolve_commit(config, token=token)
8485
logger.debug("Resolved commit", extra={"commit": commit})
8586

86-
clone_cmd = ["git"]
87-
if token and is_github_host(url):
88-
clone_cmd += ["-c", create_git_auth_header(token, url=url)]
89-
90-
clone_cmd += ["clone", "--single-branch", "--no-checkout", "--depth=1"]
91-
if partial_clone:
92-
clone_cmd += ["--filter=blob:none", "--sparse"]
93-
94-
clone_cmd += [url, local_path]
95-
96-
# Clone the repository
97-
logger.info("Executing git clone command", extra={"command": " ".join([*clone_cmd[:-1], "<url>", local_path])})
98-
await run_command(*clone_cmd)
99-
logger.info("Git clone completed successfully")
87+
def perform_clone() -> Repo:
    """Clone ``url`` into ``local_path`` with GitPython and return the resulting ``Repo``.

    The clone is shallow (``depth=1``), single-branch and performed with
    ``no_checkout`` so the work tree is left unpopulated; for partial clones
    the ``--filter=blob:none`` and ``--sparse`` options are added as well.

    Returns
    -------
    Repo
        The GitPython repository object for the fresh clone.

    Raises
    ------
    RuntimeError
        If building the options or running the clone fails for any reason.

    """
    try:
        # Set up clone options
        clone_kwargs = {
            "single_branch": True,
            "depth": 1,
            "no_checkout": True,
        }

        # Add filter and sparse options for partial clones
        if partial_clone:
            clone_kwargs["multi_options"] = ["--filter=blob:none", "--sparse"]

        # Add authentication for GitHub repositories.
        # git only accepts single-quoted 'key=value' entries in GIT_CONFIG_PARAMETERS,
        # so the raw "-c"-style header must be quoted (embedded quotes escaped shell-style).
        # `clone_from` overlays `env` on top of the process environment, so only the
        # extra variable has to be supplied.
        env = None
        if token and is_github_host(url):
            auth_header = create_git_auth_header(token, url=url)
            env = {"GIT_CONFIG_PARAMETERS": "'" + auth_header.replace("'", "'\\''") + "'"}

        # Clone the repository
        logger.info("Executing git clone command")
        repo = Repo.clone_from(url, local_path, env=env, **clone_kwargs)
        logger.info("Git clone completed successfully")
        return repo

    except Exception as e:
        raise RuntimeError(f"Failed to clone repository: {str(e)}") from e
116+
117+
# Perform the clone operation
118+
repo = await asyncio.get_event_loop().run_in_executor(None, perform_clone)
100119

101120
# Checkout the subpath if it is a partial clone
102121
if partial_clone:
103122
logger.info("Setting up partial clone for subpath", extra={"subpath": config.subpath})
104123
await checkout_partial_clone(config, token=token)
105124
logger.debug("Partial clone setup completed")
106125

107-
git = create_git_command(["git"], local_path, url, token)
108-
109-
# Ensure the commit is locally available
110-
logger.debug("Fetching specific commit", extra={"commit": commit})
111-
await run_command(*git, "fetch", "--depth=1", "origin", commit)
112-
113-
# Write the work-tree at that commit
114-
logger.info("Checking out commit", extra={"commit": commit})
115-
await run_command(*git, "checkout", commit)
116-
117-
# Update submodules
118-
if config.include_submodules:
119-
logger.info("Updating submodules")
120-
await run_command(*git, "submodule", "update", "--init", "--recursive", "--depth=1")
121-
logger.debug("Submodules updated successfully")
126+
def perform_checkout() -> None:
    """Fetch ``commit``, check it out and optionally update submodules using GitPython.

    When a token is supplied for a GitHub host, the authentication header is
    applied to every git call made here (fetch, checkout and submodule update),
    not only to the fetch.

    Raises
    ------
    RuntimeError
        If any of the git operations fails.

    """
    try:
        # Set up authentication for all git operations below.
        # git only accepts single-quoted 'key=value' entries in GIT_CONFIG_PARAMETERS.
        auth_env = {}
        if token and is_github_host(url):
            auth_header = create_git_auth_header(token, url=url)
            auth_env["GIT_CONFIG_PARAMETERS"] = "'" + auth_header.replace("'", "'\\''") + "'"

        # `custom_environment` is GitPython's context manager for temporarily
        # extending the environment of a Git command object.
        with repo.git.custom_environment(**auth_env):
            # Fetch the specific commit
            logger.debug("Fetching specific commit", extra={"commit": commit})
            repo.git.fetch("--depth=1", "origin", commit)

            # Checkout the specific commit
            logger.info("Checking out commit", extra={"commit": commit})
            repo.git.checkout(commit)

            # Update submodules if requested
            if config.include_submodules:
                logger.info("Updating submodules")
                repo.git.submodule("update", "--init", "--recursive", "--depth=1")
                logger.debug("Submodules updated successfully")

    except Exception as e:
        raise RuntimeError(f"Failed during checkout operations: {str(e)}") from e
152+
153+
# Perform checkout operations
154+
await asyncio.get_event_loop().run_in_executor(None, perform_checkout)
122155

123156
logger.info("Git clone operation completed successfully", extra={"local_path": local_path})

src/gitingest/utils/git_utils.py

Lines changed: 95 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
from urllib.parse import urlparse
1212

1313
import httpx
14+
from git import Repo, Remote, GitCommandError, InvalidGitRepositoryError
15+
from git.cmd import Git
1416
from starlette.status import HTTP_200_OK, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND
1517

1618
from gitingest.utils.compat_func import removesuffix
@@ -47,17 +49,19 @@ def is_github_host(url: str) -> bool:
4749
return hostname.startswith("github.")
4850

4951

50-
async def run_command(*args: str) -> tuple[bytes, bytes]:
51-
"""Execute a shell command asynchronously and return (stdout, stderr) bytes.
52+
async def run_git_command(*args: str, cwd: str | None = None) -> tuple[str, str]:
53+
"""Execute a git command using GitPython and return (stdout, stderr) strings.
5254
5355
Parameters
5456
----------
5557
*args : str
56-
The command and its arguments to execute.
58+
The git command arguments to execute (without the 'git' prefix).
59+
cwd : str | None
60+
The working directory to execute the command in.
5761
5862
Returns
5963
-------
60-
tuple[bytes, bytes]
64+
tuple[str, str]
6165
A tuple containing the stdout and stderr of the command.
6266
6367
Raises
@@ -66,18 +70,32 @@ async def run_command(*args: str) -> tuple[bytes, bytes]:
6670
If command exits with a non-zero status.
6771
6872
"""
69-
# Execute the requested command
70-
proc = await asyncio.create_subprocess_exec(
71-
*args,
72-
stdout=asyncio.subprocess.PIPE,
73-
stderr=asyncio.subprocess.PIPE,
74-
)
75-
stdout, stderr = await proc.communicate()
76-
if proc.returncode != 0:
77-
msg = f"Command failed: {' '.join(args)}\nError: {stderr.decode().strip()}"
78-
raise RuntimeError(msg)
79-
80-
return stdout, stderr
73+
try:
74+
def run_sync() -> tuple[str, str]:
    """Run the requested git command synchronously and return ``(stdout, stderr)``.

    ``git config <key> ...`` lookups are best-effort: if git exits with an
    error, the result is an empty stdout and the error text as stderr.
    Every other command propagates ``GitCommandError`` to the caller.
    """
    git_cmd = Git(cwd or ".")

    if len(args) >= 2 and args[0] == "config":
        try:
            # Forward every argument after "config", not only the first one
            return git_cmd.config(*args[1:]), ""
        except GitCommandError as e:
            return "", str(e)

    # `Git.execute` runs the argument list verbatim, so the git executable
    # itself has to be the first element of the command.
    return git_cmd.execute([Git.GIT_PYTHON_GIT_EXECUTABLE, *args]), ""
89+
90+
# Run the synchronous git operation in a thread pool
91+
stdout, stderr = await asyncio.get_event_loop().run_in_executor(None, run_sync)
92+
return stdout, stderr
93+
except GitCommandError as exc:
94+
msg = f"Git command failed: git {' '.join(args)}\nError: {exc.stderr or str(exc)}"
95+
raise RuntimeError(msg) from exc
96+
except Exception as exc:
97+
msg = f"Git command failed: git {' '.join(args)}\nError: {str(exc)}"
98+
raise RuntimeError(msg) from exc
8199

82100

83101
async def ensure_git_installed() -> None:
@@ -92,14 +110,14 @@ async def ensure_git_installed() -> None:
92110
93111
"""
94112
try:
95-
await run_command("git", "--version")
113+
await run_git_command("--version")
96114
except RuntimeError as exc:
97115
msg = "Git is not installed or not accessible. Please install Git first."
98116
raise RuntimeError(msg) from exc
99117
if sys.platform == "win32":
100118
try:
101-
stdout, _ = await run_command("git", "config", "core.longpaths")
102-
if stdout.decode().strip().lower() != "true":
119+
stdout, _ = await run_git_command("config", "core.longpaths")
120+
if stdout.strip().lower() != "true":
103121
logger.warning(
104122
"Git clone may fail on Windows due to long file paths. "
105123
"Consider enabling long path support with: 'git config --global core.longpaths true'. "
@@ -222,61 +240,65 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str |
222240
msg = f"Invalid fetch type: {ref_type}"
223241
raise ValueError(msg)
224242

225-
cmd = ["git"]
226-
227-
# Add authentication if needed
228-
if token and is_github_host(url):
229-
cmd += ["-c", create_git_auth_header(token, url=url)]
230-
231-
cmd += ["ls-remote"]
243+
await ensure_git_installed()
232244

233-
fetch_tags = ref_type == "tags"
234-
to_fetch = "tags" if fetch_tags else "heads"
245+
def fetch_refs() -> str:
    """Return the raw ``git ls-remote`` output listing the remote's branches or tags.

    Raises
    ------
    RuntimeError
        If the ``ls-remote`` call fails.

    """
    git_cmd = Git()

    # Set up authentication if needed.
    # git only accepts single-quoted 'key=value' entries in GIT_CONFIG_PARAMETERS.
    if token and is_github_host(url):
        auth_header = create_git_auth_header(token, url=url)
        git_cmd.update_environment(
            GIT_CONFIG_PARAMETERS="'" + auth_header.replace("'", "'\\''") + "'",
        )

    # NOTE(review): the caller's result parsing also refers to `to_fetch`;
    # confirm that name is bound in the enclosing scope as well.
    fetch_tags = ref_type == "tags"
    to_fetch = "tags" if fetch_tags else "heads"

    ls_remote_args = [f"--{to_fetch}"]

    # `--refs` filters out the peeled tag objects (those ending with "^{}") (for tags)
    if fetch_tags:
        ls_remote_args.append("--refs")

    ls_remote_args.append(url)

    try:
        # Attribute access maps to the `git ls-remote` subcommand; `Git.execute`
        # would need the git executable as the first list element.
        return git_cmd.ls_remote(*ls_remote_args)
    except GitCommandError as e:
        raise RuntimeError(f"Failed to fetch {ref_type}: {e.stderr or str(e)}") from e
243268

244-
await ensure_git_installed()
245-
stdout, _ = await run_command(*cmd)
269+
stdout = await asyncio.get_event_loop().run_in_executor(None, fetch_refs)
270+
246271
# For each line in the output:
247272
# - Skip empty lines and lines that don't contain "refs/{to_fetch}/"
248273
# - Extract the branch or tag name after "refs/{to_fetch}/"
249274
return [
250275
line.split(f"refs/{to_fetch}/", 1)[1]
251-
for line in stdout.decode().splitlines()
276+
for line in stdout.splitlines()
252277
if line.strip() and f"refs/{to_fetch}/" in line
253278
]
254279

255280

256-
def create_git_command(base_cmd: list[str], local_path: str, url: str, token: str | None = None) -> list[str]:
257-
"""Create a git command with authentication if needed.
281+
def create_git_command_with_auth(token: str | None, url: str) -> Git:
    """Create a Git command object with authentication if needed.

    Parameters
    ----------
    token : str | None
        GitHub personal access token (PAT) for accessing private repositories.
    url : str
        The repository URL to check if it's a GitHub repository.

    Returns
    -------
    Git
        A Git command object with authentication configured if needed.

    """
    git_cmd = Git()
    if token and is_github_host(url):
        # Set authentication through the environment of this command object.
        # git only accepts single-quoted 'key=value' entries in GIT_CONFIG_PARAMETERS,
        # so the raw header is quoted (embedded quotes escaped shell-style).
        auth_config = create_git_auth_header(token, url=url)
        git_cmd.update_environment(
            GIT_CONFIG_PARAMETERS="'" + auth_config.replace("'", "'\\''") + "'",
        )
    return git_cmd
280302

281303

282304
def create_git_auth_header(token: str, url: str = "https://github.com") -> str:
@@ -343,8 +365,21 @@ async def checkout_partial_clone(config: CloneConfig, token: str | None) -> None
343365
if config.blob:
344366
# Remove the file name from the subpath when ingesting from a file url (e.g. blob/branch/path/file.txt)
345367
subpath = str(Path(subpath).parent.as_posix())
346-
checkout_cmd = create_git_command(["git"], config.local_path, config.url, token)
347-
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
368+
369+
def setup_sparse_checkout() -> None:
    """Run ``git sparse-checkout set <subpath>`` inside the cloned repository.

    Raises
    ------
    RuntimeError
        If the repository cannot be opened or the git command fails.

    """
    try:
        repo = Repo(config.local_path)

        # Set up authentication if needed.
        # git only accepts single-quoted 'key=value' entries in GIT_CONFIG_PARAMETERS.
        auth_env = {}
        if token and is_github_host(config.url):
            auth_header = create_git_auth_header(token, url=config.url)
            auth_env["GIT_CONFIG_PARAMETERS"] = "'" + auth_header.replace("'", "'\\''") + "'"

        with repo.git.custom_environment(**auth_env):
            # Attribute access maps `sparse_checkout` to the `sparse-checkout` subcommand
            repo.git.sparse_checkout("set", subpath)
    except Exception as e:
        raise RuntimeError(f"Failed to setup sparse checkout: {str(e)}") from e
381+
382+
await asyncio.get_event_loop().run_in_executor(None, setup_sparse_checkout)
348383

349384

350385
async def resolve_commit(config: CloneConfig, token: str | None) -> str:
@@ -400,14 +435,16 @@ async def _resolve_ref_to_sha(url: str, pattern: str, token: str | None = None)
400435
If the ref does not exist in the remote repository.
401436
402437
"""
403-
# Build: git [-c http.<host>/.extraheader=Auth...] ls-remote <url> <pattern>
404-
cmd: list[str] = ["git"]
405-
if token and is_github_host(url):
406-
cmd += ["-c", create_git_auth_header(token, url=url)]
438+
def resolve_ref() -> str:
    """Return the raw ``git ls-remote <url> <pattern>`` output for the requested ref.

    Raises
    ------
    RuntimeError
        If the ``ls-remote`` call fails.

    """
    git_cmd = create_git_command_with_auth(token, url)
    try:
        # Attribute access maps to the `git ls-remote` subcommand; `Git.execute`
        # would need the git executable as the first list element.
        return git_cmd.ls_remote(url, pattern)
    except GitCommandError as e:
        raise RuntimeError(f"Failed to resolve ref {pattern}: {e.stderr or str(e)}") from e
407445

408-
cmd += ["ls-remote", url, pattern]
409-
stdout, _ = await run_command(*cmd)
410-
lines = stdout.decode().splitlines()
446+
stdout = await asyncio.get_event_loop().run_in_executor(None, resolve_ref)
447+
lines = stdout.splitlines()
411448
sha = _pick_commit_sha(lines)
412449
if not sha:
413450
msg = f"{pattern!r} not found in {url}"

0 commit comments

Comments
 (0)