1111from urllib .parse import urlparse
1212
1313import httpx
14+ from git import Repo , Remote , GitCommandError , InvalidGitRepositoryError
15+ from git .cmd import Git
1416from starlette .status import HTTP_200_OK , HTTP_401_UNAUTHORIZED , HTTP_403_FORBIDDEN , HTTP_404_NOT_FOUND
1517
1618from gitingest .utils .compat_func import removesuffix
@@ -47,17 +49,19 @@ def is_github_host(url: str) -> bool:
4749 return hostname .startswith ("github." )
4850
4951
async def run_git_command(*args: str, cwd: str | None = None) -> tuple[str, str]:
    """Execute a git command using GitPython and return (stdout, stderr) strings.

    The blocking GitPython call is dispatched to the running event loop's default executor so
    the loop itself is never stalled while git runs.

    Parameters
    ----------
    *args : str
        The git command arguments to execute (without the 'git' prefix).
    cwd : str | None
        The working directory to execute the command in. The current directory is used when omitted.

    Returns
    -------
    tuple[str, str]
        A tuple containing the stdout and stderr of the command. A failing ``git config ...`` call
        is reported as ``("", <error text>)`` instead of raising, because git exits non-zero when
        the requested key is simply unset.

    Raises
    ------
    RuntimeError
        If no arguments are given, if git cannot be launched, or if the command exits with a
        non-zero status.

    """
    if not args:
        msg = "Git command failed: no git arguments were provided."
        raise RuntimeError(msg)

    def _invoke() -> tuple[int, str, str]:
        # ``Git.execute`` runs the argument vector verbatim, so the git executable has to be the
        # first element; passing only the sub-command would try to launch e.g. ``ls-remote``.
        argv = [Git.GIT_PYTHON_GIT_EXECUTABLE or "git", *args]
        # Extended output exposes the exit status and the real stderr; the status is checked
        # below so that every failure is reported through the same RuntimeError message.
        return Git(cwd or ".").execute(argv, with_extended_output=True, with_exceptions=False)

    try:
        status, stdout, stderr = await asyncio.get_running_loop().run_in_executor(None, _invoke)
    except Exception as exc:
        # Boundary handler: callers only catch RuntimeError (e.g. when git is not installed),
        # so any launch failure is translated while keeping the original cause chained.
        msg = f"Git command failed: git {' '.join(args)}\nError: {exc}"
        raise RuntimeError(msg) from exc

    if status == 0:
        return stdout, stderr

    if args[0] == "config" and len(args) >= 2:
        # Best-effort lookup: hand the failure back as stderr text rather than raising.
        return "", stderr or f"git config exited with status {status}"

    detail = stderr.strip() or f"exit status {status}"
    msg = f"Git command failed: git {' '.join(args)}\nError: {detail}"
    raise RuntimeError(msg)
8199
82100
83101async def ensure_git_installed () -> None :
@@ -92,14 +110,14 @@ async def ensure_git_installed() -> None:
92110
93111 """
94112 try :
95- await run_command ( "git" , "--version" )
113+ await run_git_command ( "--version" )
96114 except RuntimeError as exc :
97115 msg = "Git is not installed or not accessible. Please install Git first."
98116 raise RuntimeError (msg ) from exc
99117 if sys .platform == "win32" :
100118 try :
101- stdout , _ = await run_command ( "git" , "config" , "core.longpaths" )
102- if stdout .decode (). strip ().lower () != "true" :
119+ stdout , _ = await run_git_command ( "config" , "core.longpaths" )
120+ if stdout .strip ().lower () != "true" :
103121 logger .warning (
104122 "Git clone may fail on Windows due to long file paths. "
105123 "Consider enabling long path support with: 'git config --global core.longpaths true'. "
@@ -222,61 +240,65 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str |
222240 msg = f"Invalid fetch type: { ref_type } "
223241 raise ValueError (msg )
224242
225- cmd = ["git" ]
226-
227- # Add authentication if needed
228- if token and is_github_host (url ):
229- cmd += ["-c" , create_git_auth_header (token , url = url )]
230-
231- cmd += ["ls-remote" ]
243+ await ensure_git_installed ()
232244
233- fetch_tags = ref_type == "tags"
234- to_fetch = "tags" if fetch_tags else "heads"
245+ def fetch_refs ():
246+ git_cmd = Git ()
247+
248+ # Set up authentication if needed
249+ if token and is_github_host (url ):
250+ git_cmd = git_cmd .with_custom_environment (GIT_CONFIG_PARAMETERS = create_git_auth_header (token , url = url ))
235251
236- cmd += [f"--{ to_fetch } " ]
252+ fetch_tags = ref_type == "tags"
253+ to_fetch = "tags" if fetch_tags else "heads"
237254
238- # `--refs` filters out the peeled tag objects (those ending with "^{}") (for tags)
239- if fetch_tags :
240- cmd += ["--refs" ]
255+ cmd = ["ls-remote" , f"--{ to_fetch } " ]
256+
257+ # `--refs` filters out the peeled tag objects (those ending with "^{}") (for tags)
258+ if fetch_tags :
259+ cmd .append ("--refs" )
241260
242- cmd += [url ]
261+ cmd .append (url )
262+
263+ try :
264+ result = git_cmd .execute (cmd )
265+ return result
266+ except GitCommandError as e :
267+ raise RuntimeError (f"Failed to fetch { ref_type } : { e .stderr or str (e )} " ) from e
243268
244- await ensure_git_installed ( )
245- stdout , _ = await run_command ( * cmd )
269+ stdout = await asyncio . get_event_loop (). run_in_executor ( None , fetch_refs )
270+
246271 # For each line in the output:
247272 # - Skip empty lines and lines that don't contain "refs/{to_fetch}/"
248273 # - Extract the branch or tag name after "refs/{to_fetch}/"
249274 return [
250275 line .split (f"refs/{ to_fetch } /" , 1 )[1 ]
251- for line in stdout .decode (). splitlines ()
276+ for line in stdout .splitlines ()
252277 if line .strip () and f"refs/{ to_fetch } /" in line
253278 ]
254279
255280
def create_git_command_with_auth(token: str | None, url: str) -> Git:
    """Create a Git command object with authentication if needed.

    Parameters
    ----------
    token : str | None
        GitHub personal access token (PAT) for accessing private repositories.
    url : str
        The repository URL to check if it's a GitHub repository.

    Returns
    -------
    Git
        A Git command object. When a token is supplied and ``url`` points at a GitHub host, the
        object's environment carries the auth header as a per-process git config entry, so every
        command executed through it is authenticated.

    """
    git_cmd = Git()
    if not (token and is_github_host(url)):
        return git_cmd

    # ``create_git_auth_header`` yields the ``key=value`` form used with ``git -c``; split on the
    # first '=' only, since the value part may itself contain '=' characters.
    config_key, _, config_value = create_git_auth_header(token, url=url).partition("=")

    # ``Git`` has no ``with_custom_environment`` method (unknown attributes are dispatched as git
    # sub-commands), so the environment is set with ``update_environment``. The documented
    # GIT_CONFIG_COUNT / GIT_CONFIG_KEY_<n> / GIT_CONFIG_VALUE_<n> variables (git >= 2.31) are the
    # environment equivalent of ``-c key=value``.
    git_cmd.update_environment(
        GIT_CONFIG_COUNT="1",
        GIT_CONFIG_KEY_0=config_key,
        GIT_CONFIG_VALUE_0=config_value,
    )
    return git_cmd
280302
281303
282304def create_git_auth_header (token : str , url : str = "https://github.com" ) -> str :
@@ -343,8 +365,21 @@ async def checkout_partial_clone(config: CloneConfig, token: str | None) -> None
343365 if config .blob :
344366 # Remove the file name from the subpath when ingesting from a file url (e.g. blob/branch/path/file.txt)
345367 subpath = str (Path (subpath ).parent .as_posix ())
346- checkout_cmd = create_git_command (["git" ], config .local_path , config .url , token )
347- await run_command (* checkout_cmd , "sparse-checkout" , "set" , subpath )
368+
369+ def setup_sparse_checkout ():
370+ try :
371+ repo = Repo (config .local_path )
372+ git_cmd = repo .git
373+
374+ # Set up authentication if needed
375+ if token and is_github_host (config .url ):
376+ git_cmd = git_cmd .with_custom_environment (GIT_CONFIG_PARAMETERS = create_git_auth_header (token , url = config .url ))
377+
378+ git_cmd .execute (["sparse-checkout" , "set" , subpath ])
379+ except Exception as e :
380+ raise RuntimeError (f"Failed to setup sparse checkout: { str (e )} " ) from e
381+
382+ await asyncio .get_event_loop ().run_in_executor (None , setup_sparse_checkout )
348383
349384
350385async def resolve_commit (config : CloneConfig , token : str | None ) -> str :
@@ -400,14 +435,16 @@ async def _resolve_ref_to_sha(url: str, pattern: str, token: str | None = None)
400435 If the ref does not exist in the remote repository.
401436
402437 """
403- # Build: git [-c http.<host>/.extraheader=Auth...] ls-remote <url> <pattern>
404- cmd : list [str ] = ["git" ]
405- if token and is_github_host (url ):
406- cmd += ["-c" , create_git_auth_header (token , url = url )]
438+ def resolve_ref ():
439+ git_cmd = create_git_command_with_auth (token , url )
440+ try :
441+ result = git_cmd .execute (["ls-remote" , url , pattern ])
442+ return result
443+ except GitCommandError as e :
444+ raise RuntimeError (f"Failed to resolve ref { pattern } : { e .stderr or str (e )} " ) from e
407445
408- cmd += ["ls-remote" , url , pattern ]
409- stdout , _ = await run_command (* cmd )
410- lines = stdout .decode ().splitlines ()
446+ stdout = await asyncio .get_event_loop ().run_in_executor (None , resolve_ref )
447+ lines = stdout .splitlines ()
411448 sha = _pick_commit_sha (lines )
412449 if not sha :
413450 msg = f"{ pattern !r} not found in { url } "
0 commit comments