diff --git a/README.md b/README.md index b4d28ebf..ba69b0a9 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ You can also replace `hub` with `ingest` in any GitHub URL to access the corresp ## 📚 Requirements - Python 3.7+ +- For private repositories: A GitHub Personal Access Token (PAT). You can generate one at [https://github.com/settings/personal-access-tokens](https://github.com/settings/personal-access-tokens) (Profile → Settings → Developer Settings → Personal Access Tokens → Fine-grained Tokens) ### 📦 Installation @@ -83,6 +84,14 @@ gitingest /path/to/directory # From URL gitingest https://github.com/cyclotruc/gitingest +# For private repositories, use the --token option +# Get your token from https://github.com/settings/personal-access-tokens +gitingest https://github.com/username/private-repo --token github_pat_... + +# Or set it as an environment variable +export GITHUB_TOKEN=github_pat_... +gitingest https://github.com/username/private-repo + # See more options gitingest --help ``` diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index c7f07d9b..a7b5de98 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -29,19 +29,31 @@ "--exclude-pattern", "-e", multiple=True, - help="""Patterns to exclude. Handles python's arbitrary subset of Unix - shell-style wildcards. See: - https://docs.python.org/3/library/fnmatch.html""", + help=( + "Patterns to exclude. Handles Python's arbitrary subset of Unix shell-style " + "wildcards. See: https://docs.python.org/3/library/fnmatch.html" + ), ) @click.option( "--include-pattern", "-i", multiple=True, - help="""Patterns to include. Handles python's arbitrary subset of Unix - shell-style wildcards. See: - https://docs.python.org/3/library/fnmatch.html""", + help=( + "Patterns to include. Handles Python's arbitrary subset of Unix shell-style " + "wildcards. 
See: https://docs.python.org/3/library/fnmatch.html" + ), ) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") +@click.option( + "--token", + "-t", + envvar="GITHUB_TOKEN", + default=None, + help=( + "GitHub personal access token for accessing private repositories. " + "If omitted, the CLI will look for the GITHUB_TOKEN environment variable." + ), +) def main( source: str, output: Optional[str], @@ -49,6 +61,7 @@ def main( exclude_pattern: Tuple[str, ...], include_pattern: Tuple[str, ...], branch: Optional[str], + token: Optional[str], ): """ Main entry point for the CLI. This function is called when the CLI is run as a script. @@ -58,21 +71,33 @@ def main( Parameters ---------- source : str - The source directory or repository to analyze. + A directory path or a Git repository URL. output : str, optional - The path where the output file will be written. If not specified, the output will be written - to a file named `.txt` in the current directory. + Output file path. Defaults to `.txt`. max_size : int - The maximum file size to process, in bytes. Files larger than this size will be ignored. + Maximum file size (in bytes) to consider. exclude_pattern : Tuple[str, ...] - A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. + Glob patterns for pruning the file set. include_pattern : Tuple[str, ...] - A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. + Glob patterns for including files in the output. branch : str, optional - The branch to clone (optional). + Specific branch to ingest (defaults to the repository's default). + token: str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. """ - # Main entry point for the CLI. This function is called when the CLI is run as a script. 
- asyncio.run(_async_main(source, output, max_size, exclude_pattern, include_pattern, branch)) + + asyncio.run( + _async_main( + source=source, + output=output, + max_size=max_size, + exclude_pattern=exclude_pattern, + include_pattern=include_pattern, + branch=branch, + token=token, + ) + ) async def _async_main( @@ -82,6 +107,7 @@ async def _async_main( exclude_pattern: Tuple[str, ...], include_pattern: Tuple[str, ...], branch: Optional[str], + token: Optional[str], ) -> None: """ Analyze a directory or repository and create a text dump of its contents. @@ -92,18 +118,20 @@ async def _async_main( Parameters ---------- source : str - The source directory or repository to analyze. + A directory path or a Git repository URL. output : str, optional - The path where the output file will be written. If not specified, the output will be written - to a file named `.txt` in the current directory. + Output file path. Defaults to `.txt`. max_size : int - The maximum file size to process, in bytes. Files larger than this size will be ignored. + Maximum file size (in bytes) to consider. exclude_pattern : Tuple[str, ...] - A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. + Glob patterns for pruning the file set. include_pattern : Tuple[str, ...] - A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. + Glob patterns for including files in the output. branch : str, optional - The branch to clone (optional). + Specific branch to ingest (defaults to the repository's default). + token: str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. Raises ------ @@ -111,21 +139,32 @@ async def _async_main( If there is an error during the execution of the command, this exception is raised to abort the process. 
""" try: - # Combine default and custom ignore patterns + # Normalise pattern containers (the ingest layer expects sets) exclude_patterns = set(exclude_pattern) include_patterns = set(include_pattern) - if not output: + # Choose a default output path if none provided + if output is None: output = OUTPUT_FILE_NAME - summary, _, _ = await ingest_async(source, max_size, include_patterns, exclude_patterns, branch, output=output) + + summary, _, _ = await ingest_async( + source=source, + max_file_size=max_size, + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + branch=branch, + output=output, + token=token, + ) click.echo(f"Analysis complete! Output written to: {output}") click.echo("\nSummary:") click.echo(summary) except Exception as exc: + # Convert any exception into Click.Abort so that exit status is non-zero click.echo(f"Error: {exc}", err=True) - raise click.Abort() + raise click.Abort() from exc if __name__ == "__main__": diff --git a/src/gitingest/cloning.py b/src/gitingest/cloning.py index 79b97cb9..284b353e 100644 --- a/src/gitingest/cloning.py +++ b/src/gitingest/cloning.py @@ -1,18 +1,24 @@ """This module contains functions for cloning a Git repository to a local path.""" -import os from pathlib import Path from typing import Optional +from gitingest.config import DEFAULT_TIMEOUT from gitingest.schemas import CloneConfig -from gitingest.utils.git_utils import check_repo_exists, ensure_git_installed, run_command +from gitingest.utils.git_utils import ( + check_repo_exists, + create_git_auth_header, + create_git_command, + ensure_git_installed, + run_command, + validate_github_token, +) +from gitingest.utils.os_utils import ensure_directory from gitingest.utils.timeout_wrapper import async_timeout -TIMEOUT: int = 60 - -@async_timeout(TIMEOUT) -async def clone_repo(config: CloneConfig) -> None: +@async_timeout(DEFAULT_TIMEOUT) +async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None: """ Clone a repository to a 
local path based on the provided configuration. @@ -24,13 +30,15 @@ async def clone_repo(config: CloneConfig) -> None: ---------- config : CloneConfig The configuration for cloning the repository. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + Must start with 'github_pat_' or 'ghp_' for GitHub repositories. Raises ------ ValueError - If the repository is not found or if the provided URL is invalid. - OSError - If an error occurs while creating the parent directory for the repository. + If the repository is not found, if the provided URL is invalid, or if the token format is invalid. """ # Extract and validate query parameters url: str = config.url @@ -39,19 +47,23 @@ async def clone_repo(config: CloneConfig) -> None: branch: Optional[str] = config.branch partial_clone: bool = config.subpath != "/" + # Validate token if provided + if token and url.startswith("https://github.com"): + validate_github_token(token) + # Create parent directory if it doesn't exist - parent_dir = Path(local_path).parent - try: - os.makedirs(parent_dir, exist_ok=True) - except OSError as exc: - raise OSError(f"Failed to create parent directory {parent_dir}: {exc}") from exc + await ensure_directory(Path(local_path).parent) # Check if the repository exists - if not await check_repo_exists(url): - raise ValueError("Repository not found, make sure it is public") + if not await check_repo_exists(url, token=token): + raise ValueError("Repository not found. 
Make sure it is public or that you have provided a valid token.") - clone_cmd = ["git", "clone", "--single-branch"] - # TODO re-enable --recurse-submodules + clone_cmd = ["git"] + if token and url.startswith("https://github.com"): + clone_cmd += ["-c", create_git_auth_header(token)] + + clone_cmd += ["clone", "--single-branch"] + # TODO: Re-enable --recurse-submodules when submodule support is needed if partial_clone: clone_cmd += ["--filter=blob:none", "--sparse"] @@ -67,19 +79,17 @@ async def clone_repo(config: CloneConfig) -> None: await ensure_git_installed() await run_command(*clone_cmd) - if commit or partial_clone: - checkout_cmd = ["git", "-C", local_path] - - if partial_clone: - subpath = config.subpath.lstrip("/") - if config.blob: - # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name. - subpath = str(Path(subpath).parent.as_posix()) - - checkout_cmd += ["sparse-checkout", "set", subpath] - - if commit: - checkout_cmd += ["checkout", commit] - - # Check out the specific commit and/or subpath - await run_command(*checkout_cmd) + # Checkout the subpath if it is a partial clone + if partial_clone: + subpath = config.subpath.lstrip("/") + if config.blob: + # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name. 
+ subpath = str(Path(subpath).parent.as_posix()) + + checkout_cmd = create_git_command(["git"], local_path, url, token) + await run_command(*checkout_cmd, "sparse-checkout", "set", subpath) + + # Checkout the commit if it is provided + if commit: + checkout_cmd = create_git_command(["git"], local_path, url, token) + await run_command(*checkout_cmd, "checkout", commit) diff --git a/src/gitingest/config.py b/src/gitingest/config.py index 9740713c..3f4e3724 100644 --- a/src/gitingest/config.py +++ b/src/gitingest/config.py @@ -7,6 +7,7 @@ MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal MAX_FILES = 10_000 # Maximum number of files to process MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB +DEFAULT_TIMEOUT = 60 # seconds OUTPUT_FILE_NAME = "digest.txt" diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 0af4a4ba..cfabb461 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -2,6 +2,7 @@ import asyncio import inspect +import os import shutil from typing import Optional, Set, Tuple, Union @@ -17,6 +18,7 @@ async def ingest_async( include_patterns: Optional[Union[str, Set[str]]] = None, exclude_patterns: Optional[Union[str, Set[str]]] = None, branch: Optional[str] = None, + token: Optional[str] = None, output: Optional[str] = None, ) -> Tuple[str, str, str]: """ @@ -39,6 +41,9 @@ async def ingest_async( Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. branch : str, optional The branch to clone and ingest. If `None`, the default branch is used. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. output : str, optional File path where the summary and content should be written. If `None`, the results are not written to a file. 
@@ -57,6 +62,9 @@ async def ingest_async( """ repo_cloned = False + if not token: + token = os.getenv("GITHUB_TOKEN") + try: query: IngestionQuery = await parse_query( source=source, @@ -71,7 +79,7 @@ async def ingest_async( query.branch = selected_branch clone_config = query.extract_clone_config() - clone_coroutine = clone_repo(clone_config) + clone_coroutine = clone_repo(clone_config, token=token) if inspect.iscoroutine(clone_coroutine): if asyncio.get_event_loop().is_running(): @@ -102,6 +110,7 @@ def ingest( include_patterns: Optional[Union[str, Set[str]]] = None, exclude_patterns: Optional[Union[str, Set[str]]] = None, branch: Optional[str] = None, + token: Optional[str] = None, output: Optional[str] = None, ) -> Tuple[str, str, str]: """ @@ -124,6 +133,9 @@ def ingest( Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. branch : str, optional The branch to clone and ingest. If `None`, the default branch is used. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. output : str, optional File path where the summary and content should be written. If `None`, the results are not written to a file. @@ -146,6 +158,7 @@ def ingest( include_patterns=include_patterns, exclude_patterns=exclude_patterns, branch=branch, + token=token, output=output, ) ) diff --git a/src/gitingest/query_parsing.py b/src/gitingest/query_parsing.py index 5d547356..d391e184 100644 --- a/src/gitingest/query_parsing.py +++ b/src/gitingest/query_parsing.py @@ -94,7 +94,7 @@ async def parse_query( ) -async def _parse_remote_repo(source: str) -> IngestionQuery: +async def _parse_remote_repo(source: str, token: Optional[str] = None) -> IngestionQuery: """ Parse a repository URL into a structured query dictionary. 
@@ -107,6 +107,9 @@ async def _parse_remote_repo(source: str) -> IngestionQuery: ---------- source : str The URL or domain-less slug to parse. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. Returns ------- @@ -128,7 +131,7 @@ async def _parse_remote_repo(source: str) -> IngestionQuery: _validate_host(tmp_host) else: # No scheme, no domain => user typed "user/repo", so we'll guess the domain. - host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source)) + host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source), token=token) source = f"{host}/{source}" source = "https://" + source @@ -285,7 +288,7 @@ def _parse_local_dir_path(path_str: str) -> IngestionQuery: ) -async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str: +async def try_domains_for_user_and_repo(user_name: str, repo_name: str, token: Optional[str] = None) -> str: """ Attempt to find a valid repository host for the given user_name and repo_name. @@ -295,6 +298,9 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str: The username or owner of the repository. repo_name : str The name of the repository. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. 
Returns ------- @@ -308,6 +314,6 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str: """ for domain in KNOWN_GIT_HOSTS: candidate = f"https://{domain}/{user_name}/{repo_name}" - if await check_repo_exists(candidate): + if await check_repo_exists(candidate, token=token if domain == "github.com" else None): return domain raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.") diff --git a/src/gitingest/schemas/ingestion_schema.py b/src/gitingest/schemas/ingestion_schema.py index 02b1c678..43ea6c42 100644 --- a/src/gitingest/schemas/ingestion_schema.py +++ b/src/gitingest/schemas/ingestion_schema.py @@ -29,6 +29,8 @@ class CloneConfig: The branch to clone (default is None). subpath : str The subpath to clone from the repository (default is "/"). + blob: bool + Whether the repository is a blob (default is False). """ url: str diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index 9ed7c645..b3346996 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -1,7 +1,11 @@ """Utility functions for interacting with Git repositories.""" import asyncio -from typing import List, Tuple +import base64 +import re +from typing import List, Optional, Tuple + +GITHUB_PAT_PATTERN = r"^(?:github_pat_|ghp_)[A-Za-z0-9_]{36,}$" async def run_command(*args: str) -> Tuple[bytes, bytes]: @@ -52,7 +56,7 @@ async def ensure_git_installed() -> None: raise RuntimeError("Git is not installed or not accessible. Please install Git first.") from exc -async def check_repo_exists(url: str) -> bool: +async def check_repo_exists(url: str, token: Optional[str] = None) -> bool: """ Check if a Git repository exists at the provided URL. @@ -60,6 +64,10 @@ async def check_repo_exists(url: str) -> bool: ---------- url : str The URL of the Git repository to check. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. 
Can also be set via the ``GITHUB_TOKEN`` env var. + Returns ------- bool @@ -70,6 +78,9 @@ async def check_repo_exists(url: str) -> bool: RuntimeError If the curl command returns an unexpected status code. """ + if token and "github.com" in url: + return await _check_github_repo_exists(url, token) + proc = await asyncio.create_subprocess_exec( "curl", "-I", @@ -94,19 +105,93 @@ async def check_repo_exists(url: str) -> bool: raise RuntimeError(f"Unexpected status line: {status_line}") -async def fetch_remote_branch_list(url: str) -> List[str]: +async def _check_github_repo_exists(url: str, token: Optional[str] = None) -> bool: + """ + Return True iff the authenticated user can see `url`. + + Parameters + ---------- + url : str + The URL of the GitHub repository to check. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + + Returns + ------- + bool + True if the repository exists, False otherwise. + + Raises + ------ + ValueError + If the URL is not a valid GitHub repository URL. + RuntimeError + If the token is invalid or lacks permissions (HTTP 401/403), or if the GitHub API returns an unexpected HTTP status. 
+ """ + m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url) + if not m: + raise ValueError(f"Un-recognised GitHub URL: {url!r}") + owner, repo = m.groups() + + api = f"https://api.github.com/repos/{owner}/{repo}" + cmd = [ + "curl", + "--silent", + "--location", + "--write-out", + "%{http_code}", + "-o", + "/dev/null", + "-H", + "Accept: application/vnd.github+json", + ] + if token: + cmd += ["-H", f"Authorization: Bearer {token}"] + cmd.append(api) + + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + status = stdout.decode()[-3:] # just the %{http_code} + + if status == "200": + return True + if status == "404": + return False + if status in ("401", "403"): + raise RuntimeError("Token invalid or lacks permissions") + raise RuntimeError(f"GitHub API returned unexpected HTTP {status}") + + +async def fetch_remote_branch_list(url: str, token: Optional[str] = None) -> List[str]: """ Fetch the list of branches from a remote Git repository. + Parameters ---------- url : str The URL of the Git repository to fetch branches from. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + Returns ------- List[str] A list of branch names available in the remote repository. 
""" - fetch_branches_command = ["git", "ls-remote", "--heads", url] + fetch_branches_command = ["git"] + + # Add authentication if needed + if token and "github.com" in url: + fetch_branches_command += ["-c", create_git_auth_header(token)] + + fetch_branches_command += ["ls-remote", "--heads", url] + await ensure_git_installed() stdout, _ = await run_command(*fetch_branches_command) stdout_decoded = stdout.decode() @@ -116,3 +201,66 @@ async def fetch_remote_branch_list(url: str) -> List[str]: for line in stdout_decoded.splitlines() if line.strip() and "refs/heads/" in line ] + + +def create_git_command(base_cmd: List[str], local_path: str, url: str, token: Optional[str] = None) -> List[str]: + """Create a git command with authentication if needed. + + Parameters + ---------- + base_cmd : List[str] + The base git command to start with + local_path : str + The local path where the git command should be executed + url : str + The repository URL to check if it's a GitHub repository + token : Optional[str] + GitHub personal access token for authentication + + Returns + ------- + List[str] + The git command with authentication if needed + """ + cmd = base_cmd + ["-C", local_path] + if token and url.startswith("https://github.com"): + validate_github_token(token) + cmd += ["-c", create_git_auth_header(token)] + return cmd + + +def create_git_auth_header(token: str) -> str: + """Create a Basic authentication header for GitHub git operations. + + Parameters + ---------- + token : str + GitHub personal access token + + Returns + ------- + str + The git config command for setting the authentication header + """ + basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() + return f"http.https://github.com/.extraheader=Authorization: Basic {basic}" + + +def validate_github_token(token: str) -> None: + """Validate the format of a GitHub Personal Access Token. 
+ + Parameters + ---------- + token : str + The GitHub token to validate + + Raises + ------ + ValueError + If the token format is invalid + """ + if not re.match(GITHUB_PAT_PATTERN, token): + raise ValueError( + "Invalid GitHub token format. Token should start with 'github_pat_' or 'ghp_' " + "followed by at least 36 characters of letters, numbers, and underscores." + ) diff --git a/src/gitingest/utils/os_utils.py b/src/gitingest/utils/os_utils.py new file mode 100644 index 00000000..a2d49916 --- /dev/null +++ b/src/gitingest/utils/os_utils.py @@ -0,0 +1,24 @@ +"""Utility functions for working with the operating system.""" + +import os +from pathlib import Path + + +async def ensure_directory(path: Path) -> None: + """ + Ensure the directory exists, creating it if necessary. + + Parameters + ---------- + path : Path + The path to ensure exists + + Raises + ------ + OSError + If the directory cannot be created + """ + try: + os.makedirs(path, exist_ok=True) + except OSError as exc: + raise OSError(f"Failed to create directory {path}: {exc}") from exc diff --git a/tests/test_repository_clone.py b/tests/test_repository_clone.py index b614d5a4..b57d737e 100644 --- a/tests/test_repository_clone.py +++ b/tests/test_repository_clone.py @@ -12,9 +12,10 @@ import pytest -from gitingest.cloning import check_repo_exists, clone_repo +from gitingest.cloning import clone_repo from gitingest.schemas import CloneConfig from gitingest.utils.exceptions import AsyncTimeoutError +from gitingest.utils.git_utils import check_repo_exists @pytest.mark.asyncio @@ -41,7 +42,7 @@ async def test_clone_with_commit() -> None: await clone_repo(clone_config) - mock_check.assert_called_once_with(clone_config.url) + mock_check.assert_called_once_with(clone_config.url, token=None) assert mock_exec.call_count == 2 # Clone and checkout calls @@ -69,7 +70,7 @@ async def test_clone_without_commit() -> None: await clone_repo(query) - mock_check.assert_called_once_with(query.url) + 
mock_check.assert_called_once_with(query.url, token=None) assert mock_exec.call_count == 1 # Only clone call @@ -435,7 +436,7 @@ async def test_clone_with_commit_and_subpath() -> None: clone_config.local_path, ) - # Verify the sparse-checkout command sets the correct path + # Verify sparse-checkout set mock_exec.assert_any_call( "git", "-C", @@ -443,8 +444,15 @@ async def test_clone_with_commit_and_subpath() -> None: "sparse-checkout", "set", "src/docs", + ) + + # Verify checkout commit + mock_exec.assert_any_call( + "git", + "-C", + clone_config.local_path, "checkout", clone_config.commit, ) - assert mock_exec.call_count == 2 + assert mock_exec.call_count == 3