From 1bda9f6535ac122ac8fafa478363b62e69e4d038 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sat, 14 Jun 2025 19:14:03 +0200 Subject: [PATCH 1/7] fix: split sparse-checkout & commit checkout when cloning; refresh docs/CLI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Run `git sparse-checkout set …` and `git checkout ` as two calls—matches Git’s CLI rules and fixes failures. * Tidy clone path creation via _ensure_directory; use DEFAULT_TIMEOUT. * Clarify CLI/help strings and schema docstrings. * Update tests for the new two-step checkout flow. --- src/gitingest/cli.py | 71 ++++++++++++++--------- src/gitingest/cloning.py | 55 ++++++++++-------- src/gitingest/config.py | 1 + src/gitingest/schemas/ingestion_schema.py | 2 + tests/test_repository_clone.py | 11 +++- 5 files changed, 89 insertions(+), 51 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index c7f07d9b..80098370 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -29,17 +29,19 @@ "--exclude-pattern", "-e", multiple=True, - help="""Patterns to exclude. Handles python's arbitrary subset of Unix - shell-style wildcards. See: - https://docs.python.org/3/library/fnmatch.html""", + help=( + "Patterns to exclude. Handles Python's arbitrary subset of Unix shell-style " + "wildcards. See: https://docs.python.org/3/library/fnmatch.html" + ), ) @click.option( "--include-pattern", "-i", multiple=True, - help="""Patterns to include. Handles python's arbitrary subset of Unix - shell-style wildcards. See: - https://docs.python.org/3/library/fnmatch.html""", + help=( + "Patterns to include. Handles Python's arbitrary subset of Unix shell-style " + "wildcards. 
See: https://docs.python.org/3/library/fnmatch.html" + ), ) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") def main( @@ -58,21 +60,29 @@ def main( Parameters ---------- source : str - The source directory or repository to analyze. + A directory path or a Git repository URL. output : str, optional - The path where the output file will be written. If not specified, the output will be written - to a file named `.txt` in the current directory. + Output file path. Defaults to `.txt`. max_size : int - The maximum file size to process, in bytes. Files larger than this size will be ignored. + Maximum file size (in bytes) to consider. exclude_pattern : Tuple[str, ...] - A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. + Glob patterns for pruning the file set. include_pattern : Tuple[str, ...] - A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. + Glob patterns for including files in the output. branch : str, optional - The branch to clone (optional). + Specific branch to ingest (defaults to the repository's default). """ - # Main entry point for the CLI. This function is called when the CLI is run as a script. - asyncio.run(_async_main(source, output, max_size, exclude_pattern, include_pattern, branch)) + + asyncio.run( + _async_main( + source=source, + output=output, + max_size=max_size, + exclude_pattern=exclude_pattern, + include_pattern=include_pattern, + branch=branch, + ) + ) async def _async_main( @@ -92,18 +102,17 @@ async def _async_main( Parameters ---------- source : str - The source directory or repository to analyze. + A directory path or a Git repository URL. output : str, optional - The path where the output file will be written. If not specified, the output will be written - to a file named `.txt` in the current directory. + Output file path. Defaults to `.txt`. max_size : int - The maximum file size to process, in bytes. 
Files larger than this size will be ignored. + Maximum file size (in bytes) to consider. exclude_pattern : Tuple[str, ...] - A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. + Glob patterns for pruning the file set. include_pattern : Tuple[str, ...] - A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. + Glob patterns for including files in the output. branch : str, optional - The branch to clone (optional). + Specific branch to ingest (defaults to the repository's default). Raises ------ @@ -111,21 +120,31 @@ async def _async_main( If there is an error during the execution of the command, this exception is raised to abort the process. """ try: - # Combine default and custom ignore patterns + # Normalise pattern containers (the ingest layer expects sets) exclude_patterns = set(exclude_pattern) include_patterns = set(include_pattern) - if not output: + # Choose a default output path if none provided + if output is None: output = OUTPUT_FILE_NAME - summary, _, _ = await ingest_async(source, max_size, include_patterns, exclude_patterns, branch, output=output) + + summary, _, _ = await ingest_async( + source=source, + max_file_size=max_size, + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + branch=branch, + output=output, + ) click.echo(f"Analysis complete! 
Output written to: {output}") click.echo("\nSummary:") click.echo(summary) except Exception as exc: + # Convert any exception into Click.Abort so that exit status is non-zero click.echo(f"Error: {exc}", err=True) - raise click.Abort() + raise click.Abort() from exc if __name__ == "__main__": diff --git a/src/gitingest/cloning.py b/src/gitingest/cloning.py index 79b97cb9..d8eaa133 100644 --- a/src/gitingest/cloning.py +++ b/src/gitingest/cloning.py @@ -4,14 +4,13 @@ from pathlib import Path from typing import Optional +from gitingest.config import DEFAULT_TIMEOUT from gitingest.schemas import CloneConfig from gitingest.utils.git_utils import check_repo_exists, ensure_git_installed, run_command from gitingest.utils.timeout_wrapper import async_timeout -TIMEOUT: int = 60 - -@async_timeout(TIMEOUT) +@async_timeout(DEFAULT_TIMEOUT) async def clone_repo(config: CloneConfig) -> None: """ Clone a repository to a local path based on the provided configuration. @@ -29,8 +28,6 @@ async def clone_repo(config: CloneConfig) -> None: ------ ValueError If the repository is not found or if the provided URL is invalid. - OSError - If an error occurs while creating the parent directory for the repository. 
""" # Extract and validate query parameters url: str = config.url @@ -40,18 +37,14 @@ async def clone_repo(config: CloneConfig) -> None: partial_clone: bool = config.subpath != "/" # Create parent directory if it doesn't exist - parent_dir = Path(local_path).parent - try: - os.makedirs(parent_dir, exist_ok=True) - except OSError as exc: - raise OSError(f"Failed to create parent directory {parent_dir}: {exc}") from exc + await _ensure_directory(Path(local_path).parent) # Check if the repository exists if not await check_repo_exists(url): raise ValueError("Repository not found, make sure it is public") clone_cmd = ["git", "clone", "--single-branch"] - # TODO re-enable --recurse-submodules + # TODO: Re-enable --recurse-submodules when submodule support is needed if partial_clone: clone_cmd += ["--filter=blob:none", "--sparse"] @@ -67,19 +60,35 @@ async def clone_repo(config: CloneConfig) -> None: await ensure_git_installed() await run_command(*clone_cmd) - if commit or partial_clone: - checkout_cmd = ["git", "-C", local_path] + # Checkout the subpath if it is a partial clone + if partial_clone: + subpath = config.subpath.lstrip("/") + if config.blob: + # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name. + subpath = str(Path(subpath).parent.as_posix()) + + await run_command("git", "-C", local_path, "sparse-checkout", "set", subpath) - if partial_clone: - subpath = config.subpath.lstrip("/") - if config.blob: - # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name. - subpath = str(Path(subpath).parent.as_posix()) + # Checkout the commit if it is provided + if commit: + await run_command("git", "-C", local_path, "checkout", commit) - checkout_cmd += ["sparse-checkout", "set", subpath] - if commit: - checkout_cmd += ["checkout", commit] +async def _ensure_directory(path: Path) -> None: + """ + Ensure the directory exists, creating it if necessary. 
+ + Parameters + ---------- + path : Path + The path to ensure exists - # Check out the specific commit and/or subpath - await run_command(*checkout_cmd) + Raises + ------ + OSError + If the directory cannot be created + """ + try: + os.makedirs(path, exist_ok=True) + except OSError as exc: + raise OSError(f"Failed to create directory {path}: {exc}") from exc diff --git a/src/gitingest/config.py b/src/gitingest/config.py index 9740713c..3f4e3724 100644 --- a/src/gitingest/config.py +++ b/src/gitingest/config.py @@ -7,6 +7,7 @@ MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal MAX_FILES = 10_000 # Maximum number of files to process MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB +DEFAULT_TIMEOUT = 60 # seconds OUTPUT_FILE_NAME = "digest.txt" diff --git a/src/gitingest/schemas/ingestion_schema.py b/src/gitingest/schemas/ingestion_schema.py index 02b1c678..43ea6c42 100644 --- a/src/gitingest/schemas/ingestion_schema.py +++ b/src/gitingest/schemas/ingestion_schema.py @@ -29,6 +29,8 @@ class CloneConfig: The branch to clone (default is None). subpath : str The subpath to clone from the repository (default is "/"). + blob: bool + Whether the repository is a blob (default is False). 
""" url: str diff --git a/tests/test_repository_clone.py b/tests/test_repository_clone.py index b614d5a4..15d8ea7c 100644 --- a/tests/test_repository_clone.py +++ b/tests/test_repository_clone.py @@ -435,7 +435,7 @@ async def test_clone_with_commit_and_subpath() -> None: clone_config.local_path, ) - # Verify the sparse-checkout command sets the correct path + # Verify sparse-checkout set mock_exec.assert_any_call( "git", "-C", @@ -443,8 +443,15 @@ async def test_clone_with_commit_and_subpath() -> None: "sparse-checkout", "set", "src/docs", + ) + + # Verify checkout commit + mock_exec.assert_any_call( + "git", + "-C", + clone_config.local_path, "checkout", clone_config.commit, ) - assert mock_exec.call_count == 2 + assert mock_exec.call_count == 3 From 86eab77539df4bd18787955d25cebf116969736e Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sat, 14 Jun 2025 19:29:37 +0200 Subject: [PATCH 2/7] feat(auth): support private GitHub repos & correct sparse-checkout flow * CLI: new `--token/-t` flag (fallback to `GITHUB_TOKEN`) * clone_repo: * injects Basic-auth header when a PAT is supplied * validates PAT format (`github_pat_*`) * git_utils: * `create_git_auth_header`, `validate_github_token`, `create_git_command` * `_check_github_repo_exists` & branch-listing now work with tokens * os_utils.ensure_directory extracted for reuse * tests updated to reflect new call signatures --- src/gitingest/cli.py | 20 ++++ src/gitingest/cloning.py | 61 ++++++------ src/gitingest/entrypoint.py | 11 ++- src/gitingest/query_parsing.py | 14 ++- src/gitingest/utils/git_utils.py | 156 ++++++++++++++++++++++++++++++- src/gitingest/utils/os_utils.py | 24 +++++ tests/test_repository_clone.py | 7 +- 7 files changed, 251 insertions(+), 42 deletions(-) create mode 100644 src/gitingest/utils/os_utils.py diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 80098370..a7b5de98 100644 --- a/src/gitingest/cli.py +++ 
b/src/gitingest/cli.py @@ -44,6 +44,16 @@ ), ) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") +@click.option( + "--token", + "-t", + envvar="GITHUB_TOKEN", + default=None, + help=( + "GitHub personal access token for accessing private repositories. " + "If omitted, the CLI will look for the GITHUB_TOKEN environment variable." + ), +) def main( source: str, output: Optional[str], @@ -51,6 +61,7 @@ def main( exclude_pattern: Tuple[str, ...], include_pattern: Tuple[str, ...], branch: Optional[str], + token: Optional[str], ): """ Main entry point for the CLI. This function is called when the CLI is run as a script. @@ -71,6 +82,9 @@ def main( Glob patterns for including files in the output. branch : str, optional Specific branch to ingest (defaults to the repository's default). + token: str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. """ asyncio.run( @@ -81,6 +95,7 @@ def main( exclude_pattern=exclude_pattern, include_pattern=include_pattern, branch=branch, + token=token, ) ) @@ -92,6 +107,7 @@ async def _async_main( exclude_pattern: Tuple[str, ...], include_pattern: Tuple[str, ...], branch: Optional[str], + token: Optional[str], ) -> None: """ Analyze a directory or repository and create a text dump of its contents. @@ -113,6 +129,9 @@ async def _async_main( Glob patterns for including files in the output. branch : str, optional Specific branch to ingest (defaults to the repository's default). + token: str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. Raises ------ @@ -135,6 +154,7 @@ async def _async_main( exclude_patterns=exclude_patterns, branch=branch, output=output, + token=token, ) click.echo(f"Analysis complete! 
Output written to: {output}") diff --git a/src/gitingest/cloning.py b/src/gitingest/cloning.py index d8eaa133..3006d95d 100644 --- a/src/gitingest/cloning.py +++ b/src/gitingest/cloning.py @@ -1,17 +1,24 @@ """This module contains functions for cloning a Git repository to a local path.""" -import os from pathlib import Path from typing import Optional from gitingest.config import DEFAULT_TIMEOUT from gitingest.schemas import CloneConfig -from gitingest.utils.git_utils import check_repo_exists, ensure_git_installed, run_command +from gitingest.utils.git_utils import ( + check_repo_exists, + create_git_auth_header, + create_git_command, + ensure_git_installed, + run_command, + validate_github_token, +) +from gitingest.utils.os_utils import ensure_directory from gitingest.utils.timeout_wrapper import async_timeout @async_timeout(DEFAULT_TIMEOUT) -async def clone_repo(config: CloneConfig) -> None: +async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None: """ Clone a repository to a local path based on the provided configuration. @@ -23,11 +30,15 @@ async def clone_repo(config: CloneConfig) -> None: ---------- config : CloneConfig The configuration for cloning the repository. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + Must start with 'github_pat_' for GitHub repositories. Raises ------ ValueError - If the repository is not found or if the provided URL is invalid. + If the repository is not found, if the provided URL is invalid, or if the token format is invalid. 
""" # Extract and validate query parameters url: str = config.url @@ -36,14 +47,22 @@ async def clone_repo(config: CloneConfig) -> None: branch: Optional[str] = config.branch partial_clone: bool = config.subpath != "/" + # Validate token if provided + if token and url.startswith("https://github.com"): + validate_github_token(token) + # Create parent directory if it doesn't exist - await _ensure_directory(Path(local_path).parent) + await ensure_directory(Path(local_path).parent) # Check if the repository exists - if not await check_repo_exists(url): - raise ValueError("Repository not found, make sure it is public") + if not await check_repo_exists(url, token=token): + raise ValueError("Repository not found. Make sure it is public or that you have provided a valid token.") + + clone_cmd = ["git"] + if token and url.startswith("https://github.com"): + clone_cmd += ["-c", create_git_auth_header(token)] - clone_cmd = ["git", "clone", "--single-branch"] + clone_cmd += ["clone", "--single-branch"] # TODO: Re-enable --recurse-submodules when submodule support is needed if partial_clone: @@ -67,28 +86,10 @@ async def clone_repo(config: CloneConfig) -> None: # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name. subpath = str(Path(subpath).parent.as_posix()) - await run_command("git", "-C", local_path, "sparse-checkout", "set", subpath) + checkout_cmd = create_git_command(["git"], local_path, url, token) + await run_command(*checkout_cmd, "sparse-checkout", "set", subpath) # Checkout the commit if it is provided if commit: - await run_command("git", "-C", local_path, "checkout", commit) - - -async def _ensure_directory(path: Path) -> None: - """ - Ensure the directory exists, creating it if necessary. 
- - Parameters - ---------- - path : Path - The path to ensure exists - - Raises - ------ - OSError - If the directory cannot be created - """ - try: - os.makedirs(path, exist_ok=True) - except OSError as exc: - raise OSError(f"Failed to create directory {path}: {exc}") from exc + checkout_cmd = create_git_command(["git"], local_path, url, token) + await run_command(*checkout_cmd, "checkout", commit) diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 0af4a4ba..5baf2835 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -17,6 +17,7 @@ async def ingest_async( include_patterns: Optional[Union[str, Set[str]]] = None, exclude_patterns: Optional[Union[str, Set[str]]] = None, branch: Optional[str] = None, + token: Optional[str] = None, output: Optional[str] = None, ) -> Tuple[str, str, str]: """ @@ -39,6 +40,9 @@ async def ingest_async( Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. branch : str, optional The branch to clone and ingest. If `None`, the default branch is used. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. output : str, optional File path where the summary and content should be written. If `None`, the results are not written to a file. 
@@ -71,7 +75,7 @@ async def ingest_async( query.branch = selected_branch clone_config = query.extract_clone_config() - clone_coroutine = clone_repo(clone_config) + clone_coroutine = clone_repo(clone_config, token=token) if inspect.iscoroutine(clone_coroutine): if asyncio.get_event_loop().is_running(): @@ -102,6 +106,7 @@ def ingest( include_patterns: Optional[Union[str, Set[str]]] = None, exclude_patterns: Optional[Union[str, Set[str]]] = None, branch: Optional[str] = None, + token: Optional[str] = None, output: Optional[str] = None, ) -> Tuple[str, str, str]: """ @@ -124,6 +129,9 @@ def ingest( Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. branch : str, optional The branch to clone and ingest. If `None`, the default branch is used. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. output : str, optional File path where the summary and content should be written. If `None`, the results are not written to a file. @@ -146,6 +154,7 @@ def ingest( include_patterns=include_patterns, exclude_patterns=exclude_patterns, branch=branch, + token=token, output=output, ) ) diff --git a/src/gitingest/query_parsing.py b/src/gitingest/query_parsing.py index 5d547356..d391e184 100644 --- a/src/gitingest/query_parsing.py +++ b/src/gitingest/query_parsing.py @@ -94,7 +94,7 @@ async def parse_query( ) -async def _parse_remote_repo(source: str) -> IngestionQuery: +async def _parse_remote_repo(source: str, token: Optional[str] = None) -> IngestionQuery: """ Parse a repository URL into a structured query dictionary. @@ -107,6 +107,9 @@ async def _parse_remote_repo(source: str) -> IngestionQuery: ---------- source : str The URL or domain-less slug to parse. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. 
Can also be set via the ``GITHUB_TOKEN`` env var. Returns ------- @@ -128,7 +131,7 @@ async def _parse_remote_repo(source: str) -> IngestionQuery: _validate_host(tmp_host) else: # No scheme, no domain => user typed "user/repo", so we'll guess the domain. - host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source)) + host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source), token=token) source = f"{host}/{source}" source = "https://" + source @@ -285,7 +288,7 @@ def _parse_local_dir_path(path_str: str) -> IngestionQuery: ) -async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str: +async def try_domains_for_user_and_repo(user_name: str, repo_name: str, token: Optional[str] = None) -> str: """ Attempt to find a valid repository host for the given user_name and repo_name. @@ -295,6 +298,9 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str: The username or owner of the repository. repo_name : str The name of the repository. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. 
Returns ------- @@ -308,6 +314,6 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str: """ for domain in KNOWN_GIT_HOSTS: candidate = f"https://{domain}/{user_name}/{repo_name}" - if await check_repo_exists(candidate): + if await check_repo_exists(candidate, token=token if domain == "github.com" else None): return domain raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.") diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index 9ed7c645..e3727167 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -1,7 +1,11 @@ """Utility functions for interacting with Git repositories.""" import asyncio -from typing import List, Tuple +import base64 +import re +from typing import List, Optional, Tuple + +GITHUB_PAT_PATTERN = r"^github_pat_[A-Za-z0-9_]{40,}$" async def run_command(*args: str) -> Tuple[bytes, bytes]: @@ -52,7 +56,7 @@ async def ensure_git_installed() -> None: raise RuntimeError("Git is not installed or not accessible. Please install Git first.") from exc -async def check_repo_exists(url: str) -> bool: +async def check_repo_exists(url: str, token: Optional[str] = None) -> bool: """ Check if a Git repository exists at the provided URL. @@ -60,6 +64,10 @@ async def check_repo_exists(url: str) -> bool: ---------- url : str The URL of the Git repository to check. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + Returns ------- bool @@ -70,6 +78,9 @@ async def check_repo_exists(url: str) -> bool: RuntimeError If the curl command returns an unexpected status code.
""" + if token and "github.com" in url: + return await _check_github_repo_exists(url, token) + proc = await asyncio.create_subprocess_exec( "curl", "-I", @@ -94,19 +105,93 @@ async def check_repo_exists(url: str) -> bool: raise RuntimeError(f"Unexpected status line: {status_line}") -async def fetch_remote_branch_list(url: str) -> List[str]: +async def _check_github_repo_exists(url: str, token: Optional[str] = None) -> bool: + """ + Return True iff the authenticated user can see `url`. + + Parameters + ---------- + url : str + The URL of the GitHub repository to check. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + + Returns + ------- + bool + True if the repository exists, False otherwise. + + Raises + ------ + ValueError + If the URL is not a valid GitHub repository URL. + RuntimeError + If the token is invalid or lacks permissions (HTTP 401/403), or if the GitHub API returns an unexpected status code.
+ """ + m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url) + if not m: + raise ValueError(f"Un-recognised GitHub URL: {url!r}") + owner, repo = m.groups() + + api = f"https://api.github.com/repos/{owner}/{repo}" + cmd = [ + "curl", + "--silent", + "--location", + "--write-out", + "%{http_code}", + "-o", + "/dev/null", + "-H", + "Accept: application/vnd.github+json", + ] + if token: + cmd += ["-H", f"Authorization: Bearer {token}"] + cmd.append(api) + + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + status = stdout.decode()[-3:] # just the %{http_code} + + if status == "200": + return True + if status == "404": + return False + if status in ("401", "403"): + raise RuntimeError("Token invalid or lacks permissions") + raise RuntimeError(f"GitHub API returned unexpected HTTP {status}") + + +async def fetch_remote_branch_list(url: str, token: Optional[str] = None) -> List[str]: """ Fetch the list of branches from a remote Git repository. + Parameters ---------- url : str The URL of the Git repository to fetch branches from. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + Returns ------- List[str] A list of branch names available in the remote repository. 
""" - fetch_branches_command = ["git", "ls-remote", "--heads", url] + fetch_branches_command = ["git"] + + # Add authentication if needed + if token and "github.com" in url: + fetch_branches_command += ["-c", create_git_auth_header(token)] + + fetch_branches_command += ["ls-remote", "--heads", url] + await ensure_git_installed() stdout, _ = await run_command(*fetch_branches_command) stdout_decoded = stdout.decode() @@ -116,3 +201,66 @@ async def fetch_remote_branch_list(url: str) -> List[str]: for line in stdout_decoded.splitlines() if line.strip() and "refs/heads/" in line ] + + +def create_git_command(base_cmd: List[str], local_path: str, url: str, token: Optional[str] = None) -> List[str]: + """Create a git command with authentication if needed. + + Parameters + ---------- + base_cmd : List[str] + The base git command to start with + local_path : str + The local path where the git command should be executed + url : str + The repository URL to check if it's a GitHub repository + token : Optional[str] + GitHub personal access token for authentication + + Returns + ------- + List[str] + The git command with authentication if needed + """ + cmd = base_cmd + ["-C", local_path] + if token and url.startswith("https://github.com"): + validate_github_token(token) + cmd += ["-c", create_git_auth_header(token)] + return cmd + + +def create_git_auth_header(token: str) -> str: + """Create a Basic authentication header for GitHub git operations. + + Parameters + ---------- + token : str + GitHub personal access token + + Returns + ------- + str + The git config command for setting the authentication header + """ + basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() + return f"http.https://github.com/.extraheader=Authorization: Basic {basic}" + + +def validate_github_token(token: str) -> None: + """Validate the format of a GitHub Personal Access Token. 
+ + Parameters + ---------- + token : str + The GitHub token to validate + + Raises + ------ + ValueError + If the token format is invalid + """ + if not re.match(GITHUB_PAT_PATTERN, token): + raise ValueError( + "Invalid GitHub token format. Token should start with 'github_pat_' " + "followed by at least 40 characters of letters, numbers, and underscores." + ) diff --git a/src/gitingest/utils/os_utils.py b/src/gitingest/utils/os_utils.py new file mode 100644 index 00000000..a2d49916 --- /dev/null +++ b/src/gitingest/utils/os_utils.py @@ -0,0 +1,24 @@ +"""Utility functions for working with the operating system.""" + +import os +from pathlib import Path + + +async def ensure_directory(path: Path) -> None: + """ + Ensure the directory exists, creating it if necessary. + + Parameters + ---------- + path : Path + The path to ensure exists + + Raises + ------ + OSError + If the directory cannot be created + """ + try: + os.makedirs(path, exist_ok=True) + except OSError as exc: + raise OSError(f"Failed to create directory {path}: {exc}") from exc diff --git a/tests/test_repository_clone.py b/tests/test_repository_clone.py index 15d8ea7c..b57d737e 100644 --- a/tests/test_repository_clone.py +++ b/tests/test_repository_clone.py @@ -12,9 +12,10 @@ import pytest -from gitingest.cloning import check_repo_exists, clone_repo +from gitingest.cloning import clone_repo from gitingest.schemas import CloneConfig from gitingest.utils.exceptions import AsyncTimeoutError +from gitingest.utils.git_utils import check_repo_exists @pytest.mark.asyncio @@ -41,7 +42,7 @@ async def test_clone_with_commit() -> None: await clone_repo(clone_config) - mock_check.assert_called_once_with(clone_config.url) + mock_check.assert_called_once_with(clone_config.url, token=None) assert mock_exec.call_count == 2 # Clone and checkout calls @@ -69,7 +70,7 @@ async def test_clone_without_commit() -> None: await clone_repo(query) - mock_check.assert_called_once_with(query.url) + 
mock_check.assert_called_once_with(query.url, token=None) assert mock_exec.call_count == 1 # Only clone call From e8156a9b9bd849178610c7a661201e4d22820deb Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 15 Jun 2025 22:19:54 +0200 Subject: [PATCH 3/7] allow git PAT to start with ghp_ --- src/gitingest/cloning.py | 2 +- src/gitingest/utils/git_utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gitingest/cloning.py b/src/gitingest/cloning.py index 3006d95d..284b353e 100644 --- a/src/gitingest/cloning.py +++ b/src/gitingest/cloning.py @@ -33,7 +33,7 @@ async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None: token : str, optional GitHub personal-access token (PAT). Needed when *source* refers to a **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. - Must start with 'github_pat_' for GitHub repositories. + Must start with 'github_pat_' or 'ghp_' for GitHub repositories. Raises ------ diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index e3727167..b75cbc2c 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -5,7 +5,7 @@ import re from typing import List, Optional, Tuple -GITHUB_PAT_PATTERN = r"^github_pat_[A-Za-z0-9_]{40,}$" +GITHUB_PAT_PATTERN = r"^(?:github_pat_|gph_)[A-Za-z0-9_]{40,}$" async def run_command(*args: str) -> Tuple[bytes, bytes]: @@ -261,6 +261,6 @@ def validate_github_token(token: str) -> None: """ if not re.match(GITHUB_PAT_PATTERN, token): raise ValueError( - "Invalid GitHub token format. Token should start with 'github_pat_' " + "Invalid GitHub token format. Token should start with 'github_pat_' or 'gph_' " "followed by at least 40 characters of letters, numbers, and underscores."
) From 1dd729095525d3f45a08947c662ed964cf01fe1b Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 15 Jun 2025 22:28:58 +0200 Subject: [PATCH 4/7] fix GITHUB_PAT_PATTERN and add instructions to README --- README.md | 5 +++++ src/gitingest/utils/git_utils.py | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b4d28ebf..402d87cf 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ You can also replace `hub` with `ingest` in any GitHub URL to access the corresp ## 📚 Requirements - Python 3.7+ +- For private repositories: A GitHub Personal Access Token (PAT). You can generate one at [https://github.com/settings/personal-access-tokens](https://github.com/settings/personal-access-tokens) (Profile → Settings → Developer Settings → Personal Access Tokens → Fine-grained Tokens) ### 📦 Installation @@ -83,6 +84,10 @@ gitingest /path/to/directory # From URL gitingest https://github.com/cyclotruc/gitingest +# For private repositories, use the --token option +# Get your token from https://github.com/settings/personal-access-tokens +gitingest https://github.com/username/private-repo --token github_pat_... + # See more options gitingest --help ``` diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index b75cbc2c..cb8bc4f4 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -5,7 +5,7 @@ import re from typing import List, Optional, Tuple -GITHUB_PAT_PATTERN = r"^(?:github_pat_|gph_)[A-Za-z0-9_]{40,}$" +GITHUB_PAT_PATTERN = r"^(?:github_pat_|gph_)[A-Za-z0-9_]{36,}$" async def run_command(*args: str) -> Tuple[bytes, bytes]: @@ -262,5 +262,5 @@ def validate_github_token(token: str) -> None: if not re.match(GITHUB_PAT_PATTERN, token): raise ValueError( "Invalid GitHub token format. Token should start with 'github_pat_' or 'gph_' " - "followed by at least 40 characters of letters, numbers, and underscores." 
+ "followed by at least 36 characters of letters, numbers, and underscores." ) From b3b034d4f6d08110a5e79d8e17be375c79b9210c Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 15 Jun 2025 22:32:58 +0200 Subject: [PATCH 5/7] fix gph_ to ghp_ --- src/gitingest/utils/git_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index cb8bc4f4..b3346996 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -5,7 +5,7 @@ import re from typing import List, Optional, Tuple -GITHUB_PAT_PATTERN = r"^(?:github_pat_|gph_)[A-Za-z0-9_]{36,}$" +GITHUB_PAT_PATTERN = r"^(?:github_pat_|ghp_)[A-Za-z0-9_]{36,}$" async def run_command(*args: str) -> Tuple[bytes, bytes]: @@ -261,6 +261,6 @@ def validate_github_token(token: str) -> None: """ if not re.match(GITHUB_PAT_PATTERN, token): raise ValueError( - "Invalid GitHub token format. Token should start with 'github_pat_' or 'gph_' " + "Invalid GitHub token format. Token should start with 'github_pat_' or 'ghp_' " "followed by at least 36 characters of letters, numbers, and underscores." ) From 611bdffda5075d2b317c06139d10cc8d649d55ed Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 15 Jun 2025 22:39:15 +0200 Subject: [PATCH 6/7] docs: add GITHUB_TOKEN env var example to README --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 402d87cf..ba69b0a9 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,10 @@ gitingest https://github.com/cyclotruc/gitingest # Get your token from https://github.com/settings/personal-access-tokens gitingest https://github.com/username/private-repo --token github_pat_... +# Or set it as an environment variable +export GITHUB_TOKEN=github_pat_... 
+gitingest https://github.com/username/private-repo + # See more options gitingest --help ``` From 586bc1c905abb87605a534afdc06ab2d2455892b Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 15 Jun 2025 22:44:14 +0200 Subject: [PATCH 7/7] add GITHUB_TOKEN environment variable also in code --- src/gitingest/entrypoint.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 5baf2835..cfabb461 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -2,6 +2,7 @@ import asyncio import inspect +import os import shutil from typing import Optional, Set, Tuple, Union @@ -61,6 +62,9 @@ async def ingest_async( """ repo_cloned = False + if not token: + token = os.getenv("GITHUB_TOKEN") + try: query: IngestionQuery = await parse_query( source=source,