Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 65 additions & 26 deletions src/gitingest/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,26 +29,39 @@
"--exclude-pattern",
"-e",
multiple=True,
help="""Patterns to exclude. Handles python's arbitrary subset of Unix
shell-style wildcards. See:
https://docs.python.org/3/library/fnmatch.html""",
help=(
"Patterns to exclude. Handles Python's arbitrary subset of Unix shell-style "
"wildcards. See: https://docs.python.org/3/library/fnmatch.html"
),
)
@click.option(
"--include-pattern",
"-i",
multiple=True,
help="""Patterns to include. Handles python's arbitrary subset of Unix
shell-style wildcards. See:
https://docs.python.org/3/library/fnmatch.html""",
help=(
"Patterns to include. Handles Python's arbitrary subset of Unix shell-style "
"wildcards. See: https://docs.python.org/3/library/fnmatch.html"
),
)
@click.option("--branch", "-b", default=None, help="Branch to clone and ingest")
@click.option(
"--token",
"-t",
envvar="GITHUB_TOKEN",
default=None,
help=(
"GitHub personal access token for accessing private repositories. "
"If omitted, the CLI will look for the GITHUB_TOKEN environment variable."
),
)
def main(
source: str,
output: Optional[str],
max_size: int,
exclude_pattern: Tuple[str, ...],
include_pattern: Tuple[str, ...],
branch: Optional[str],
token: Optional[str],
):
"""
Main entry point for the CLI. This function is called when the CLI is run as a script.
Expand All @@ -58,21 +71,33 @@ def main(
Parameters
----------
source : str
The source directory or repository to analyze.
A directory path or a Git repository URL.
output : str, optional
The path where the output file will be written. If not specified, the output will be written
to a file named `<repo_name>.txt` in the current directory.
Output file path. Defaults to `<repo_name>.txt`.
max_size : int
The maximum file size to process, in bytes. Files larger than this size will be ignored.
Maximum file size (in bytes) to consider.
exclude_pattern : Tuple[str, ...]
A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored.
Glob patterns for pruning the file set.
include_pattern : Tuple[str, ...]
A tuple of patterns to include during the analysis. Only files matching these patterns will be processed.
Glob patterns for including files in the output.
branch : str, optional
The branch to clone (optional).
Specific branch to ingest (defaults to the repository's default).
token : str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
"""
# Main entry point for the CLI. This function is called when the CLI is run as a script.
asyncio.run(_async_main(source, output, max_size, exclude_pattern, include_pattern, branch))

asyncio.run(
_async_main(
source=source,
output=output,
max_size=max_size,
exclude_pattern=exclude_pattern,
include_pattern=include_pattern,
branch=branch,
token=token,
)
)


async def _async_main(
Expand All @@ -82,6 +107,7 @@ async def _async_main(
exclude_pattern: Tuple[str, ...],
include_pattern: Tuple[str, ...],
branch: Optional[str],
token: Optional[str],
) -> None:
"""
Analyze a directory or repository and create a text dump of its contents.
Expand All @@ -92,40 +118,53 @@ async def _async_main(
Parameters
----------
source : str
The source directory or repository to analyze.
A directory path or a Git repository URL.
output : str, optional
The path where the output file will be written. If not specified, the output will be written
to a file named `<repo_name>.txt` in the current directory.
Output file path. Defaults to `<repo_name>.txt`.
max_size : int
The maximum file size to process, in bytes. Files larger than this size will be ignored.
Maximum file size (in bytes) to consider.
exclude_pattern : Tuple[str, ...]
A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored.
Glob patterns for pruning the file set.
include_pattern : Tuple[str, ...]
A tuple of patterns to include during the analysis. Only files matching these patterns will be processed.
Glob patterns for including files in the output.
branch : str, optional
The branch to clone (optional).
Specific branch to ingest (defaults to the repository's default).
token : str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.

Raises
------
Abort
If there is an error during the execution of the command, this exception is raised to abort the process.
"""
try:
# Combine default and custom ignore patterns
# Normalise pattern containers (the ingest layer expects sets)
exclude_patterns = set(exclude_pattern)
include_patterns = set(include_pattern)

if not output:
# Choose a default output path if none provided
if output is None:
output = OUTPUT_FILE_NAME
summary, _, _ = await ingest_async(source, max_size, include_patterns, exclude_patterns, branch, output=output)

summary, _, _ = await ingest_async(
source=source,
max_file_size=max_size,
include_patterns=include_patterns,
exclude_patterns=exclude_patterns,
branch=branch,
output=output,
token=token,
)

click.echo(f"Analysis complete! Output written to: {output}")
click.echo("\nSummary:")
click.echo(summary)

except Exception as exc:
# Convert any exception into Click.Abort so that exit status is non-zero
click.echo(f"Error: {exc}", err=True)
raise click.Abort()
raise click.Abort() from exc


if __name__ == "__main__":
Expand Down
78 changes: 44 additions & 34 deletions src/gitingest/cloning.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
"""This module contains functions for cloning a Git repository to a local path."""

import os
from pathlib import Path
from typing import Optional

from gitingest.config import DEFAULT_TIMEOUT
from gitingest.schemas import CloneConfig
from gitingest.utils.git_utils import check_repo_exists, ensure_git_installed, run_command
from gitingest.utils.git_utils import (
check_repo_exists,
create_git_auth_header,
create_git_command,
ensure_git_installed,
run_command,
validate_github_token,
)
from gitingest.utils.os_utils import ensure_directory
from gitingest.utils.timeout_wrapper import async_timeout

TIMEOUT: int = 60


@async_timeout(TIMEOUT)
async def clone_repo(config: CloneConfig) -> None:
@async_timeout(DEFAULT_TIMEOUT)
async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None:
"""
Clone a repository to a local path based on the provided configuration.

Expand All @@ -24,13 +30,15 @@ async def clone_repo(config: CloneConfig) -> None:
----------
config : CloneConfig
The configuration for cloning the repository.
token : str, optional
GitHub personal-access token (PAT). Needed when ``config.url`` refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
Must start with 'github_pat_' for GitHub repositories.

Raises
------
ValueError
If the repository is not found or if the provided URL is invalid.
OSError
If an error occurs while creating the parent directory for the repository.
If the repository is not found, if the provided URL is invalid, or if the token format is invalid.
"""
# Extract and validate query parameters
url: str = config.url
Expand All @@ -39,19 +47,23 @@ async def clone_repo(config: CloneConfig) -> None:
branch: Optional[str] = config.branch
partial_clone: bool = config.subpath != "/"

# Validate token if provided
if token and url.startswith("https://github.com"):
validate_github_token(token)

# Create parent directory if it doesn't exist
parent_dir = Path(local_path).parent
try:
os.makedirs(parent_dir, exist_ok=True)
except OSError as exc:
raise OSError(f"Failed to create parent directory {parent_dir}: {exc}") from exc
await ensure_directory(Path(local_path).parent)

# Check if the repository exists
if not await check_repo_exists(url):
raise ValueError("Repository not found, make sure it is public")
if not await check_repo_exists(url, token=token):
raise ValueError("Repository not found. Make sure it is public or that you have provided a valid token.")

clone_cmd = ["git", "clone", "--single-branch"]
# TODO re-enable --recurse-submodules
clone_cmd = ["git"]
if token and url.startswith("https://github.com"):
clone_cmd += ["-c", create_git_auth_header(token)]

clone_cmd += ["clone", "--single-branch"]
# TODO: Re-enable --recurse-submodules when submodule support is needed

if partial_clone:
clone_cmd += ["--filter=blob:none", "--sparse"]
Expand All @@ -67,19 +79,17 @@ async def clone_repo(config: CloneConfig) -> None:
await ensure_git_installed()
await run_command(*clone_cmd)

if commit or partial_clone:
checkout_cmd = ["git", "-C", local_path]

if partial_clone:
subpath = config.subpath.lstrip("/")
if config.blob:
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
subpath = str(Path(subpath).parent.as_posix())

checkout_cmd += ["sparse-checkout", "set", subpath]

if commit:
checkout_cmd += ["checkout", commit]

# Check out the specific commit and/or subpath
await run_command(*checkout_cmd)
# Check out the subpath if it is a partial clone
if partial_clone:
subpath = config.subpath.lstrip("/")
if config.blob:
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
subpath = str(Path(subpath).parent.as_posix())

checkout_cmd = create_git_command(["git"], local_path, url, token)
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)

# Check out the commit if it is provided
if commit:
checkout_cmd = create_git_command(["git"], local_path, url, token)
await run_command(*checkout_cmd, "checkout", commit)
1 change: 1 addition & 0 deletions src/gitingest/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal
MAX_FILES = 10_000 # Maximum number of files to process
MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB
DEFAULT_TIMEOUT = 60 # seconds

OUTPUT_FILE_NAME = "digest.txt"

Expand Down
11 changes: 10 additions & 1 deletion src/gitingest/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ async def ingest_async(
include_patterns: Optional[Union[str, Set[str]]] = None,
exclude_patterns: Optional[Union[str, Set[str]]] = None,
branch: Optional[str] = None,
token: Optional[str] = None,
output: Optional[str] = None,
) -> Tuple[str, str, str]:
"""
Expand All @@ -39,6 +40,9 @@ async def ingest_async(
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
branch : str, optional
The branch to clone and ingest. If `None`, the default branch is used.
token : str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
output : str, optional
File path where the summary and content should be written. If `None`, the results are not written to a file.

Expand Down Expand Up @@ -71,7 +75,7 @@ async def ingest_async(
query.branch = selected_branch

clone_config = query.extract_clone_config()
clone_coroutine = clone_repo(clone_config)
clone_coroutine = clone_repo(clone_config, token=token)

if inspect.iscoroutine(clone_coroutine):
if asyncio.get_event_loop().is_running():
Expand Down Expand Up @@ -102,6 +106,7 @@ def ingest(
include_patterns: Optional[Union[str, Set[str]]] = None,
exclude_patterns: Optional[Union[str, Set[str]]] = None,
branch: Optional[str] = None,
token: Optional[str] = None,
output: Optional[str] = None,
) -> Tuple[str, str, str]:
"""
Expand All @@ -124,6 +129,9 @@ def ingest(
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
branch : str, optional
The branch to clone and ingest. If `None`, the default branch is used.
token : str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
output : str, optional
File path where the summary and content should be written. If `None`, the results are not written to a file.

Expand All @@ -146,6 +154,7 @@ def ingest(
include_patterns=include_patterns,
exclude_patterns=exclude_patterns,
branch=branch,
token=token,
output=output,
)
)
14 changes: 10 additions & 4 deletions src/gitingest/query_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ async def parse_query(
)


async def _parse_remote_repo(source: str) -> IngestionQuery:
async def _parse_remote_repo(source: str, token: Optional[str] = None) -> IngestionQuery:
"""
Parse a repository URL into a structured query dictionary.

Expand All @@ -107,6 +107,9 @@ async def _parse_remote_repo(source: str) -> IngestionQuery:
----------
source : str
The URL or domain-less slug to parse.
token : str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.

Returns
-------
Expand All @@ -128,7 +131,7 @@ async def _parse_remote_repo(source: str) -> IngestionQuery:
_validate_host(tmp_host)
else:
# No scheme, no domain => user typed "user/repo", so we'll guess the domain.
host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source))
host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source), token=token)
source = f"{host}/{source}"

source = "https://" + source
Expand Down Expand Up @@ -285,7 +288,7 @@ def _parse_local_dir_path(path_str: str) -> IngestionQuery:
)


async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
async def try_domains_for_user_and_repo(user_name: str, repo_name: str, token: Optional[str] = None) -> str:
"""
Attempt to find a valid repository host for the given user_name and repo_name.

Expand All @@ -295,6 +298,9 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
The username or owner of the repository.
repo_name : str
The name of the repository.
token : str, optional
GitHub personal-access token (PAT). Needed when ``user_name/repo_name`` refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.

Returns
-------
Expand All @@ -308,6 +314,6 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
"""
for domain in KNOWN_GIT_HOSTS:
candidate = f"https://{domain}/{user_name}/{repo_name}"
if await check_repo_exists(candidate):
if await check_repo_exists(candidate, token=token if domain == "github.com" else None):
return domain
raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.")
Loading
Loading