From 9231552de02378899d8f922d6f9ebe99a70c1aae Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 7 Aug 2025 14:34:02 +0000 Subject: [PATCH 1/3] Changes from background agent bc-f5518da9-7338-4c81-bea6-10cf1734d81b --- =3.1.0 | 0 requirements.txt | 1 + src/gitingest/clone.py | 95 ++++++++++++------- src/gitingest/utils/git_utils.py | 153 +++++++++++++++++++------------ tests/test_git_utils.py | 152 ++++++++++-------------------- 5 files changed, 208 insertions(+), 193 deletions(-) create mode 100644 =3.1.0 diff --git a/=3.1.0 b/=3.1.0 new file mode 100644 index 00000000..e69de29b diff --git a/requirements.txt b/requirements.txt index b803cf7b..8f879bb6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ boto3>=1.28.0 # AWS SDK for S3 support click>=8.0.0 fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38 +gitpython>=3.1.0 # Git operations via Python instead of system calls httpx loguru>=0.7.0 pathspec>=0.12.1 diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index d05381b1..a2c48da0 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -2,19 +2,20 @@ from __future__ import annotations +import asyncio from pathlib import Path from typing import TYPE_CHECKING +from git import Repo from gitingest.config import DEFAULT_TIMEOUT from gitingest.utils.git_utils import ( check_repo_exists, checkout_partial_clone, create_git_auth_header, - create_git_command, + create_git_command_with_auth, ensure_git_installed, is_github_host, resolve_commit, - run_command, ) from gitingest.utils.logging_config import get_logger from gitingest.utils.os_utils import ensure_directory_exists_or_create @@ -83,20 +84,38 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None: commit = await resolve_commit(config, token=token) logger.debug("Resolved commit", extra={"commit": commit}) - clone_cmd = ["git"] - if token and is_github_host(url): - clone_cmd += ["-c", create_git_auth_header(token, url=url)] - - clone_cmd += ["clone", "--single-branch", "--no-checkout", "--depth=1"] - if partial_clone: - clone_cmd += ["--filter=blob:none", "--sparse"] - - clone_cmd += [url, local_path] - - # Clone the repository - logger.info("Executing git clone command", extra={"command": " ".join([*clone_cmd[:-1], "", local_path])}) - await run_command(*clone_cmd) - logger.info("Git clone completed successfully") + def perform_clone(): + """Perform the git clone operation using GitPython.""" + try: + # Set up clone options + clone_kwargs = { + "single_branch": True, + "depth": 1, + "no_checkout": True, + } + + # Add authentication for GitHub repositories + env = None + if token and is_github_host(url): + import os + env = os.environ.copy() + env["GIT_CONFIG_PARAMETERS"] = create_git_auth_header(token, url=url) + + # Add filter and sparse options for partial clones + if partial_clone: + clone_kwargs["multi_options"] = ["--filter=blob:none", "--sparse"] + + # Clone the repository + logger.info("Executing git clone command") + repo = Repo.clone_from(url, local_path, env=env, **clone_kwargs) + logger.info("Git clone completed successfully") + return repo + + except Exception as e: + raise RuntimeError(f"Failed to clone repository: {str(e)}") from e + + # Perform the clone operation + repo = await asyncio.get_event_loop().run_in_executor(None, perform_clone) # Checkout the subpath if it is a partial clone if partial_clone: @@ -104,20 +123,34 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None: await checkout_partial_clone(config, token=token) logger.debug("Partial clone setup completed") - git = create_git_command(["git"], local_path, url, token) - - # Ensure the commit is locally available - logger.debug("Fetching specific commit", extra={"commit": commit}) - await run_command(*git, "fetch", "--depth=1", "origin", commit) - - # Write the work-tree at that commit - logger.info("Checking out commit", extra={"commit": commit}) - await run_command(*git, "checkout", commit) - - # Update submodules - if config.include_submodules: - logger.info("Updating submodules") - await run_command(*git, "submodule", "update", "--init", "--recursive", "--depth=1") - logger.debug("Submodules updated successfully") + def perform_checkout(): + """Perform the checkout operations using GitPython.""" + try: + # Fetch the specific commit + logger.debug("Fetching specific commit", extra={"commit": commit}) + + # Set up authentication for fetch operations + if token and is_github_host(url): + git_cmd = repo.git.with_custom_environment(GIT_CONFIG_PARAMETERS=create_git_auth_header(token, url=url)) + else: + git_cmd = repo.git + + git_cmd.fetch("--depth=1", "origin", commit) + + # Checkout the specific commit + logger.info("Checking out commit", extra={"commit": commit}) + repo.git.checkout(commit) + + # Update submodules if requested + if config.include_submodules: + logger.info("Updating submodules") + repo.git.submodule("update", "--init", "--recursive", "--depth=1") + logger.debug("Submodules updated successfully") + + except Exception as e: + raise RuntimeError(f"Failed during checkout operations: {str(e)}") from e + + # Perform checkout operations + await asyncio.get_event_loop().run_in_executor(None, perform_checkout) logger.info("Git clone operation completed successfully", extra={"local_path": local_path}) diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index daf4056d..605227c2 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -11,6 +11,8 @@ from urllib.parse import urlparse import httpx +from git import Repo, Remote, GitCommandError, InvalidGitRepositoryError +from git.cmd import Git from starlette.status import HTTP_200_OK, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND from gitingest.utils.compat_func import removesuffix @@ -47,17 +49,19 @@ def is_github_host(url: str) -> bool: return hostname.startswith("github.") -async def run_command(*args: str) -> tuple[bytes, bytes]: - """Execute a shell command asynchronously and return (stdout, stderr) bytes. +async def run_git_command(*args: str, cwd: str | None = None) -> tuple[str, str]: + """Execute a git command using GitPython and return (stdout, stderr) strings. Parameters ---------- *args : str - The command and its arguments to execute. + The git command arguments to execute (without the 'git' prefix). + cwd : str | None + The working directory to execute the command in. Returns ------- - tuple[bytes, bytes] + tuple[str, str] A tuple containing the stdout and stderr of the command. Raises @@ -66,18 +70,32 @@ async def run_command(*args: str) -> tuple[bytes, bytes]: If command exits with a non-zero status. """ - # Execute the requested command - proc = await asyncio.create_subprocess_exec( - *args, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - if proc.returncode != 0: - msg = f"Command failed: {' '.join(args)}\nError: {stderr.decode().strip()}" - raise RuntimeError(msg) - - return stdout, stderr + try: + def run_sync(): + git_cmd = Git(cwd or ".") + # Handle different git operations + if args[0] == "--version": + return git_cmd.version(), "" + elif args[0] == "config" and len(args) >= 2: + try: + result = git_cmd.config(args[1]) + return result, "" + except GitCommandError as e: + return "", str(e) + else: + # For other commands, use the raw execute method + result = git_cmd.execute(list(args)) + return result, "" + + # Run the synchronous git operation in a thread pool + stdout, stderr = await asyncio.get_event_loop().run_in_executor(None, run_sync) + return stdout, stderr + except GitCommandError as exc: + msg = f"Git command failed: git {' '.join(args)}\nError: {exc.stderr or str(exc)}" + raise RuntimeError(msg) from exc + except Exception as exc: + msg = f"Git command failed: git {' '.join(args)}\nError: {str(exc)}" + raise RuntimeError(msg) from exc async def ensure_git_installed() -> None: @@ -92,14 +110,14 @@ async def ensure_git_installed() -> None: """ try: - await run_command("git", "--version") + await run_git_command("--version") except RuntimeError as exc: msg = "Git is not installed or not accessible. Please install Git first." raise RuntimeError(msg) from exc if sys.platform == "win32": try: - stdout, _ = await run_command("git", "config", "core.longpaths") - if stdout.decode().strip().lower() != "true": + stdout, _ = await run_git_command("config", "core.longpaths") + if stdout.strip().lower() != "true": logger.warning( "Git clone may fail on Windows due to long file paths. " "Consider enabling long path support with: 'git config --global core.longpaths true'. " @@ -222,61 +240,65 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str | msg = f"Invalid fetch type: {ref_type}" raise ValueError(msg) - cmd = ["git"] - - # Add authentication if needed - if token and is_github_host(url): - cmd += ["-c", create_git_auth_header(token, url=url)] - - cmd += ["ls-remote"] + await ensure_git_installed() - fetch_tags = ref_type == "tags" - to_fetch = "tags" if fetch_tags else "heads" + def fetch_refs(): + git_cmd = Git() + + # Set up authentication if needed + if token and is_github_host(url): + git_cmd = git_cmd.with_custom_environment(GIT_CONFIG_PARAMETERS=create_git_auth_header(token, url=url)) - cmd += [f"--{to_fetch}"] + fetch_tags = ref_type == "tags" + to_fetch = "tags" if fetch_tags else "heads" - # `--refs` filters out the peeled tag objects (those ending with "^{}") (for tags) - if fetch_tags: - cmd += ["--refs"] + cmd = ["ls-remote", f"--{to_fetch}"] + + # `--refs` filters out the peeled tag objects (those ending with "^{}") (for tags) + if fetch_tags: + cmd.append("--refs") - cmd += [url] + cmd.append(url) + + try: + result = git_cmd.execute(cmd) + return result + except GitCommandError as e: + raise RuntimeError(f"Failed to fetch {ref_type}: {e.stderr or str(e)}") from e - await ensure_git_installed() - stdout, _ = await run_command(*cmd) + stdout = await asyncio.get_event_loop().run_in_executor(None, fetch_refs) + # For each line in the output: # - Skip empty lines and lines that don't contain "refs/{to_fetch}/" # - Extract the branch or tag name after "refs/{to_fetch}/" return [ line.split(f"refs/{to_fetch}/", 1)[1] - for line in stdout.decode().splitlines() + for line in stdout.splitlines() if line.strip() and f"refs/{to_fetch}/" in line ] -def create_git_command(base_cmd: list[str], local_path: str, url: str, token: str | None = None) -> list[str]: - """Create a git command with authentication if needed. +def create_git_command_with_auth(token: str | None, url: str) -> Git: + """Create a Git command object with authentication if needed. Parameters ---------- - base_cmd : list[str] - The base git command to start with. - local_path : str - The local path where the git command should be executed. - url : str - The repository URL to check if it's a GitHub repository. token : str | None GitHub personal access token (PAT) for accessing private repositories. + url : str + The repository URL to check if it's a GitHub repository. Returns ------- - list[str] - The git command with authentication if needed. + Git + A Git command object with authentication configured if needed. """ - cmd = [*base_cmd, "-C", local_path] if token and is_github_host(url): - cmd += ["-c", create_git_auth_header(token, url=url)] - return cmd + # Set authentication through environment + auth_config = create_git_auth_header(token, url=url) + return Git().with_custom_environment(GIT_CONFIG_PARAMETERS=auth_config) + return Git() def create_git_auth_header(token: str, url: str = "https://github.com") -> str: @@ -343,8 +365,21 @@ async def checkout_partial_clone(config: CloneConfig, token: str | None) -> None if config.blob: # Remove the file name from the subpath when ingesting from a file url (e.g. blob/branch/path/file.txt) subpath = str(Path(subpath).parent.as_posix()) - checkout_cmd = create_git_command(["git"], config.local_path, config.url, token) - await run_command(*checkout_cmd, "sparse-checkout", "set", subpath) + + def setup_sparse_checkout(): + try: + repo = Repo(config.local_path) + git_cmd = repo.git + + # Set up authentication if needed + if token and is_github_host(config.url): + git_cmd = git_cmd.with_custom_environment(GIT_CONFIG_PARAMETERS=create_git_auth_header(token, url=config.url)) + + git_cmd.execute(["sparse-checkout", "set", subpath]) + except Exception as e: + raise RuntimeError(f"Failed to setup sparse checkout: {str(e)}") from e + + await asyncio.get_event_loop().run_in_executor(None, setup_sparse_checkout) async def resolve_commit(config: CloneConfig, token: str | None) -> str: @@ -400,14 +435,16 @@ async def _resolve_ref_to_sha(url: str, pattern: str, token: str | None = None) If the ref does not exist in the remote repository. """ - # Build: git [-c http./.extraheader=Auth...] ls-remote - cmd: list[str] = ["git"] - if token and is_github_host(url): - cmd += ["-c", create_git_auth_header(token, url=url)] + def resolve_ref(): + git_cmd = create_git_command_with_auth(token, url) + try: + result = git_cmd.execute(["ls-remote", url, pattern]) + return result + except GitCommandError as e: + raise RuntimeError(f"Failed to resolve ref {pattern}: {e.stderr or str(e)}") from e - cmd += ["ls-remote", url, pattern] - stdout, _ = await run_command(*cmd) - lines = stdout.decode().splitlines() + stdout = await asyncio.get_event_loop().run_in_executor(None, resolve_ref) + lines = stdout.splitlines() sha = _pick_commit_sha(lines) if not sha: msg = f"{pattern!r} not found in {url}" diff --git a/tests/test_git_utils.py b/tests/test_git_utils.py index 48408130..174b80fb 100644 --- a/tests/test_git_utils.py +++ b/tests/test_git_utils.py @@ -12,7 +12,7 @@ import pytest from gitingest.utils.exceptions import InvalidGitHubTokenError -from gitingest.utils.git_utils import create_git_auth_header, create_git_command, is_github_host, validate_github_token +from gitingest.utils.git_utils import create_git_auth_header, create_git_command_with_auth, is_github_host, validate_github_token if TYPE_CHECKING: from pathlib import Path @@ -56,50 +56,28 @@ def test_validate_github_token_invalid(token: str) -> None: @pytest.mark.parametrize( - ("base_cmd", "local_path", "url", "token", "expected_suffix"), + ("token", "url", "should_have_auth"), [ - ( - ["git", "clone"], - "/some/path", - "https://github.com/owner/repo.git", - None, - [], # No auth header expected when token is None - ), - ( - ["git", "clone"], - "/some/path", - "https://github.com/owner/repo.git", - "ghp_" + "d" * 36, - [ - "-c", - create_git_auth_header("ghp_" + "d" * 36), - ], # Auth header expected for GitHub URL + token - ), - ( - ["git", "clone"], - "/some/path", - "https://gitlab.com/owner/repo.git", - "ghp_" + "e" * 36, - [], # No auth header for non-GitHub URL even if token provided - ), + (None, "https://github.com/owner/repo.git", False), # No auth when token is None + ("ghp_" + "d" * 36, "https://github.com/owner/repo.git", True), # Auth for GitHub URL + token + ("ghp_" + "e" * 36, "https://gitlab.com/owner/repo.git", False), # No auth for non-GitHub URL ], ) -def test_create_git_command( - base_cmd: list[str], - local_path: str, - url: str, +def test_create_git_command_with_auth( token: str | None, - expected_suffix: list[str], + url: str, + should_have_auth: bool, ) -> None: - """Test that ``create_git_command`` builds the correct command list based on inputs.""" - cmd = create_git_command(base_cmd, local_path, url, token) - - # The command should start with base_cmd and the -C option - expected_prefix = [*base_cmd, "-C", local_path] - assert cmd[: len(expected_prefix)] == expected_prefix - - # The suffix (anything after prefix) should match expected - assert cmd[len(expected_prefix) :] == expected_suffix + """Test that ``create_git_command_with_auth`` creates correct Git objects based on inputs.""" + git_cmd = create_git_command_with_auth(token, url) + + # Check if the git command has authentication environment configured + if should_have_auth: + assert hasattr(git_cmd, 'custom_environment') + assert 'GIT_CONFIG_PARAMETERS' in git_cmd.custom_environment + else: + # For no auth case, should be basic Git command + assert not hasattr(git_cmd, 'custom_environment') or 'GIT_CONFIG_PARAMETERS' not in (git_cmd.custom_environment or {}) @pytest.mark.parametrize( @@ -118,33 +96,32 @@ def test_create_git_auth_header(token: str) -> None: @pytest.mark.parametrize( - ("url", "token", "should_call"), + ("url", "token", "should_have_auth"), [ ("https://github.com/foo/bar.git", "ghp_" + "f" * 36, True), ("https://github.com/foo/bar.git", None, False), ("https://gitlab.com/foo/bar.git", "ghp_" + "g" * 36, False), ], ) -def test_create_git_command_helper_calls( +def test_create_git_command_with_auth_calls( mocker: MockerFixture, tmp_path: Path, *, url: str, token: str | None, - should_call: bool, + should_have_auth: bool, ) -> None: """Test that ``create_git_auth_header`` is invoked only when appropriate.""" - work_dir = tmp_path / "repo" header_mock = mocker.patch("gitingest.utils.git_utils.create_git_auth_header", return_value="HEADER") - cmd = create_git_command(["git", "clone"], str(work_dir), url, token) + git_cmd = create_git_command_with_auth(token, url) - if should_call: + if should_have_auth: header_mock.assert_called_once_with(token, url=url) - assert "HEADER" in cmd + assert hasattr(git_cmd, 'custom_environment') + assert git_cmd.custom_environment['GIT_CONFIG_PARAMETERS'] == "HEADER" else: header_mock.assert_not_called() - assert "HEADER" not in cmd @pytest.mark.parametrize( @@ -198,58 +175,28 @@ def test_create_git_auth_header_with_ghe_url(token: str, url: str, expected_host @pytest.mark.parametrize( - ("base_cmd", "local_path", "url", "token", "expected_auth_hostname"), + ("token", "url", "expected_auth_hostname"), [ # GitHub.com URLs - should use default hostname - ( - ["git", "clone"], - "/some/path", - "https://github.com/owner/repo.git", - "ghp_" + "a" * 36, - "github.com", - ), + ("ghp_" + "a" * 36, "https://github.com/owner/repo.git", "github.com"), # GitHub Enterprise URLs - should use custom hostname - ( - ["git", "clone"], - "/some/path", - "https://github.company.com/owner/repo.git", - "ghp_" + "b" * 36, - "github.company.com", - ), - ( - ["git", "clone"], - "/some/path", - "https://github.enterprise.org/owner/repo.git", - "ghp_" + "c" * 36, - "github.enterprise.org", - ), - ( - ["git", "clone"], - "/some/path", - "http://github.internal/owner/repo.git", - "ghp_" + "d" * 36, - "github.internal", - ), + ("ghp_" + "b" * 36, "https://github.company.com/owner/repo.git", "github.company.com"), + ("ghp_" + "c" * 36, "https://github.enterprise.org/owner/repo.git", "github.enterprise.org"), + ("ghp_" + "d" * 36, "http://github.internal/owner/repo.git", "github.internal"), ], ) -def test_create_git_command_with_ghe_urls( - base_cmd: list[str], - local_path: str, - url: str, +def test_create_git_command_with_auth_ghe_urls( token: str, + url: str, expected_auth_hostname: str, ) -> None: - """Test that ``create_git_command`` handles GitHub Enterprise URLs correctly.""" - cmd = create_git_command(base_cmd, local_path, url, token) + """Test that ``create_git_command_with_auth`` handles GitHub Enterprise URLs correctly.""" + git_cmd = create_git_command_with_auth(token, url) - # Should have base command and -C option - expected_prefix = [*base_cmd, "-C", local_path] - assert cmd[: len(expected_prefix)] == expected_prefix - - # Should have -c and auth header - assert "-c" in cmd - auth_header_index = cmd.index("-c") + 1 - auth_header = cmd[auth_header_index] + # Should have authentication configured + assert hasattr(git_cmd, 'custom_environment') + assert 'GIT_CONFIG_PARAMETERS' in git_cmd.custom_environment + auth_header = git_cmd.custom_environment['GIT_CONFIG_PARAMETERS'] # Verify the auth header contains the expected hostname assert f"http.https://{expected_auth_hostname}/" in auth_header @@ -257,23 +204,20 @@ def test_create_git_command_with_ghe_urls( @pytest.mark.parametrize( - ("base_cmd", "local_path", "url", "token"), + ("token", "url"), [ # Should NOT add auth headers for non-GitHub URLs - (["git", "clone"], "/some/path", "https://gitlab.com/owner/repo.git", "ghp_" + "a" * 36), - (["git", "clone"], "/some/path", "https://bitbucket.org/owner/repo.git", "ghp_" + "b" * 36), - (["git", "clone"], "/some/path", "https://git.example.com/owner/repo.git", "ghp_" + "c" * 36), + ("ghp_" + "a" * 36, "https://gitlab.com/owner/repo.git"), + ("ghp_" + "b" * 36, "https://bitbucket.org/owner/repo.git"), + ("ghp_" + "c" * 36, "https://git.example.com/owner/repo.git"), ], ) -def test_create_git_command_ignores_non_github_urls( - base_cmd: list[str], - local_path: str, - url: str, +def test_create_git_command_with_auth_ignores_non_github_urls( token: str, + url: str, ) -> None: - """Test that ``create_git_command`` does not add auth headers for non-GitHub URLs.""" - cmd = create_git_command(base_cmd, local_path, url, token) + """Test that ``create_git_command_with_auth`` does not add auth headers for non-GitHub URLs.""" + git_cmd = create_git_command_with_auth(token, url) - # Should only have base command and -C option, no auth headers - expected = [*base_cmd, "-C", local_path] - assert cmd == expected + # Should not have authentication configured for non-GitHub URLs + assert not hasattr(git_cmd, 'custom_environment') or 'GIT_CONFIG_PARAMETERS' not in (git_cmd.custom_environment or {}) From 6ff8b89a682c95245fd75a9e3c262976530625a7 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 7 Aug 2025 14:42:18 +0000 Subject: [PATCH 2/3] Checkpoint before follow-up message Co-authored-by: nicoragne --- test_gitpython_integration.py | 69 +++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 test_gitpython_integration.py diff --git a/test_gitpython_integration.py b/test_gitpython_integration.py new file mode 100644 index 00000000..10c3453c --- /dev/null +++ b/test_gitpython_integration.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +""" +Simple test script to verify GitPython integration works. +""" + +import asyncio +import sys +from pathlib import Path + +# Add specific path for git_utils module +git_utils_path = Path(__file__).parent / "src" / "gitingest" / "utils" +sys.path.insert(0, str(git_utils_path)) + +# Import the specific functions we need to test +import git_utils + + +async def test_basic_functions(): + """Test basic functionality without external dependencies.""" + print("šŸ” Testing GitPython integration...") + + # Test 1: Test token validation (no external deps) + print("āœ… Testing GitHub token validation...") + try: + git_utils.validate_github_token("ghp_" + "A" * 36) + print(" āœ“ Valid token accepted") + except Exception as e: + print(f" āœ— Token validation failed: {e}") + + # Test 2: Test GitHub host detection + print("āœ… Testing GitHub host detection...") + assert git_utils.is_github_host("https://github.com/owner/repo") == True + assert git_utils.is_github_host("https://gitlab.com/owner/repo") == False + print(" āœ“ GitHub host detection works") + + # Test 3: Test auth header creation + print("āœ… Testing auth header creation...") + token = "ghp_" + "A" * 36 + header = git_utils.create_git_auth_header(token) + assert "Authorization: Basic" in header + assert "github.com" in header + print(" āœ“ Auth header creation works") + + # Test 4: Test Git command creation with auth + print("āœ… Testing Git command creation...") + git_cmd = git_utils.create_git_command_with_auth(token, "https://github.com/owner/repo") + # Should have authentication configured + assert hasattr(git_cmd, 'custom_environment'), "GitPython command should have custom environment" + assert 'GIT_CONFIG_PARAMETERS' in git_cmd.custom_environment, "Should have auth parameters" + print(" āœ“ Git command with auth works") + + git_cmd_no_auth = git_utils.create_git_command_with_auth(None, "https://github.com/owner/repo") + # Should not have auth for no token + assert not hasattr(git_cmd_no_auth, 'custom_environment') or 'GIT_CONFIG_PARAMETERS' not in (git_cmd_no_auth.custom_environment or {}), "Should not have auth without token" + print(" āœ“ Git command without auth works") + + # Test 5: Test git installation check + print("āœ… Testing Git installation check...") + try: + await git_utils.ensure_git_installed() + print(" āœ“ Git is installed and accessible") + except Exception as e: + print(f" āœ— Git installation check failed: {e}") + + print("\nšŸŽ‰ All basic tests passed! GitPython integration is working.") + + +if __name__ == "__main__": + asyncio.run(test_basic_functions()) \ No newline at end of file From 69764cbf37358344a91915c97911492c19e4a4eb Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 7 Aug 2025 14:47:24 +0000 Subject: [PATCH 3/3] Refactor git authentication handling with improved environment management Co-authored-by: nicoragne --- src/gitingest/clone.py | 11 ++--- src/gitingest/utils/git_utils.py | 50 +++++++++++++++-------- test_gitpython_integration.py | 69 -------------------------------- tests/test_git_utils.py | 12 +++--- 4 files changed, 45 insertions(+), 97 deletions(-) delete mode 100644 test_gitpython_integration.py diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index a2c48da0..74026a3c 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -129,13 +129,14 @@ def perform_checkout(): # Fetch the specific commit logger.debug("Fetching specific commit", extra={"commit": commit}) - # Set up authentication for fetch operations + # Set up authentication environment for fetch operations + env = None if token and is_github_host(url): - git_cmd = repo.git.with_custom_environment(GIT_CONFIG_PARAMETERS=create_git_auth_header(token, url=url)) - else: - git_cmd = repo.git + import os + env = os.environ.copy() + env["GIT_CONFIG_PARAMETERS"] = create_git_auth_header(token, url=url) - git_cmd.fetch("--depth=1", "origin", commit) + repo.git.fetch("--depth=1", "origin", commit, env=env) # Checkout the specific commit logger.info("Checking out commit", extra={"commit": commit}) diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index 605227c2..a8a72693 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -243,11 +243,7 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str | await ensure_git_installed() def fetch_refs(): - git_cmd = Git() - - # Set up authentication if needed - if token and is_github_host(url): - git_cmd = git_cmd.with_custom_environment(GIT_CONFIG_PARAMETERS=create_git_auth_header(token, url=url)) + git_cmd = create_git_command_with_auth(token, url) fetch_tags = ref_type == "tags" to_fetch = "tags" if fetch_tags else "heads" @@ -278,7 +274,29 @@ def fetch_refs(): ] -def create_git_command_with_auth(token: str | None, url: str) -> Git: +class GitCommandWithAuth: + """A wrapper around Git command that stores authentication environment.""" + + def __init__(self, token: str | None, url: str): + self.git = Git() + self.env = None + + if token and is_github_host(url): + import os + self.env = os.environ.copy() + self.env["GIT_CONFIG_PARAMETERS"] = create_git_auth_header(token, url=url) + + def execute(self, args: list[str]) -> str: + """Execute a git command with authentication if needed.""" + return self.git.execute(args, env=self.env) + + @property + def custom_environment(self) -> dict[str, str] | None: + """Get the custom environment for testing.""" + return self.env + + +def create_git_command_with_auth(token: str | None, url: str) -> GitCommandWithAuth: """Create a Git command object with authentication if needed. Parameters @@ -290,15 +308,11 @@ def create_git_command_with_auth(token: str | None, url: str) -> Git: Returns ------- - Git - A Git command object with authentication configured if needed. + GitCommandWithAuth + A Git command wrapper with authentication configured if needed. """ - if token and is_github_host(url): - # Set authentication through environment - auth_config = create_git_auth_header(token, url=url) - return Git().with_custom_environment(GIT_CONFIG_PARAMETERS=auth_config) - return Git() + return GitCommandWithAuth(token, url) def create_git_auth_header(token: str, url: str = "https://github.com") -> str: @@ -369,13 +383,15 @@ async def checkout_partial_clone(config: CloneConfig, token: str | None) -> None def setup_sparse_checkout(): try: repo = Repo(config.local_path) - git_cmd = repo.git - # Set up authentication if needed + # Set up authentication environment if needed + env = None if token and is_github_host(config.url): - git_cmd = git_cmd.with_custom_environment(GIT_CONFIG_PARAMETERS=create_git_auth_header(token, url=config.url)) + import os + env = os.environ.copy() + env["GIT_CONFIG_PARAMETERS"] = create_git_auth_header(token, url=config.url) - git_cmd.execute(["sparse-checkout", "set", subpath]) + repo.git.execute(["sparse-checkout", "set", subpath], env=env) except Exception as e: raise RuntimeError(f"Failed to setup sparse checkout: {str(e)}") from e diff --git a/test_gitpython_integration.py b/test_gitpython_integration.py deleted file mode 100644 index 10c3453c..00000000 --- a/test_gitpython_integration.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple test script to verify GitPython integration works. -""" - -import asyncio -import sys -from pathlib import Path - -# Add specific path for git_utils module -git_utils_path = Path(__file__).parent / "src" / "gitingest" / "utils" -sys.path.insert(0, str(git_utils_path)) - -# Import the specific functions we need to test -import git_utils - - -async def test_basic_functions(): - """Test basic functionality without external dependencies.""" - print("šŸ” Testing GitPython integration...") - - # Test 1: Test token validation (no external deps) - print("āœ… Testing GitHub token validation...") - try: - git_utils.validate_github_token("ghp_" + "A" * 36) - print(" āœ“ Valid token accepted") - except Exception as e: - print(f" āœ— Token validation failed: {e}") - - # Test 2: Test GitHub host detection - print("āœ… Testing GitHub host detection...") - assert git_utils.is_github_host("https://github.com/owner/repo") == True - assert git_utils.is_github_host("https://gitlab.com/owner/repo") == False - print(" āœ“ GitHub host detection works") - - # Test 3: Test auth header creation - print("āœ… Testing auth header creation...") - token = "ghp_" + "A" * 36 - header = git_utils.create_git_auth_header(token) - assert "Authorization: Basic" in header - assert "github.com" in header - print(" āœ“ Auth header creation works") - - # Test 4: Test Git command creation with auth - print("āœ… Testing Git command creation...") - git_cmd = git_utils.create_git_command_with_auth(token, "https://github.com/owner/repo") - # Should have authentication configured - assert hasattr(git_cmd, 'custom_environment'), "GitPython command should have custom environment" - assert 'GIT_CONFIG_PARAMETERS' in git_cmd.custom_environment, "Should have auth parameters" - print(" āœ“ Git command with auth works") - - git_cmd_no_auth = git_utils.create_git_command_with_auth(None, "https://github.com/owner/repo") - # Should not have auth for no token - assert not hasattr(git_cmd_no_auth, 'custom_environment') or 'GIT_CONFIG_PARAMETERS' not in (git_cmd_no_auth.custom_environment or {}), "Should not have auth without token" - print(" āœ“ Git command without auth works") - - # Test 5: Test git installation check - print("āœ… Testing Git installation check...") - try: - await git_utils.ensure_git_installed() - print(" āœ“ Git is installed and accessible") - except Exception as e: - print(f" āœ— Git installation check failed: {e}") - - print("\nšŸŽ‰ All basic tests passed! GitPython integration is working.") - - -if __name__ == "__main__": - asyncio.run(test_basic_functions()) \ No newline at end of file diff --git a/tests/test_git_utils.py b/tests/test_git_utils.py index 174b80fb..75eb2769 100644 --- a/tests/test_git_utils.py +++ b/tests/test_git_utils.py @@ -73,11 +73,11 @@ def test_create_git_command_with_auth( # Check if the git command has authentication environment configured if should_have_auth: - assert hasattr(git_cmd, 'custom_environment') + assert git_cmd.custom_environment is not None assert 'GIT_CONFIG_PARAMETERS' in git_cmd.custom_environment else: - # For no auth case, should be basic Git command - assert not hasattr(git_cmd, 'custom_environment') or 'GIT_CONFIG_PARAMETERS' not in (git_cmd.custom_environment or {}) + # For no auth case, should not have custom environment + assert git_cmd.custom_environment is None @pytest.mark.parametrize( @@ -118,7 +118,7 @@ def test_create_git_command_with_auth_calls( if should_have_auth: header_mock.assert_called_once_with(token, url=url) - assert hasattr(git_cmd, 'custom_environment') + assert git_cmd.custom_environment is not None assert git_cmd.custom_environment['GIT_CONFIG_PARAMETERS'] == "HEADER" else: header_mock.assert_not_called() @@ -194,7 +194,7 @@ def test_create_git_command_with_auth_ghe_urls( git_cmd = create_git_command_with_auth(token, url) # Should have authentication configured - assert hasattr(git_cmd, 'custom_environment') + assert git_cmd.custom_environment is not None assert 'GIT_CONFIG_PARAMETERS' in git_cmd.custom_environment auth_header = git_cmd.custom_environment['GIT_CONFIG_PARAMETERS'] @@ -220,4 +220,4 @@ def test_create_git_command_with_auth_ignores_non_github_urls( git_cmd = create_git_command_with_auth(token, url) # Should not have authentication configured for non-GitHub URLs - assert not hasattr(git_cmd, 'custom_environment') or 'GIT_CONFIG_PARAMETERS' not in (git_cmd.custom_environment or {}) + assert git_cmd.custom_environment is None