Skip to content

Commit 86eab77

Browse files
feat(auth): support private GitHub repos & correct sparse-checkout flow
* CLI: new `--token/-t` flag (fallback to `GITHUB_TOKEN`)
* clone_repo:
  * injects Basic-auth header when a PAT is supplied
  * validates PAT format (`github_pat_*`)
* git_utils:
  * `create_git_auth_header`, `validate_github_token`, `create_git_command`
  * `_check_github_repo_exists` & branch-listing now work with tokens
* os_utils.ensure_directory extracted for reuse
* tests updated to reflect new call signatures
1 parent 1bda9f6 commit 86eab77

File tree

7 files changed

+251
-42
lines changed

7 files changed

+251
-42
lines changed

src/gitingest/cli.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,24 @@
4444
),
4545
)
4646
@click.option("--branch", "-b", default=None, help="Branch to clone and ingest")
47+
@click.option(
48+
"--token",
49+
"-t",
50+
envvar="GITHUB_TOKEN",
51+
default=None,
52+
help=(
53+
"GitHub personal access token for accessing private repositories. "
54+
"If omitted, the CLI will look for the GITHUB_TOKEN environment variable."
55+
),
56+
)
4757
def main(
4858
source: str,
4959
output: Optional[str],
5060
max_size: int,
5161
exclude_pattern: Tuple[str, ...],
5262
include_pattern: Tuple[str, ...],
5363
branch: Optional[str],
64+
token: Optional[str],
5465
):
5566
"""
5667
Main entry point for the CLI. This function is called when the CLI is run as a script.
@@ -71,6 +82,9 @@ def main(
7182
Glob patterns for including files in the output.
7283
branch : str, optional
7384
Specific branch to ingest (defaults to the repository's default).
85+
token: str, optional
86+
GitHub personal-access token (PAT). Needed when *source* refers to a
87+
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
7488
"""
7589

7690
asyncio.run(
@@ -81,6 +95,7 @@ def main(
8195
exclude_pattern=exclude_pattern,
8296
include_pattern=include_pattern,
8397
branch=branch,
98+
token=token,
8499
)
85100
)
86101

@@ -92,6 +107,7 @@ async def _async_main(
92107
exclude_pattern: Tuple[str, ...],
93108
include_pattern: Tuple[str, ...],
94109
branch: Optional[str],
110+
token: Optional[str],
95111
) -> None:
96112
"""
97113
Analyze a directory or repository and create a text dump of its contents.
@@ -113,6 +129,9 @@ async def _async_main(
113129
Glob patterns for including files in the output.
114130
branch : str, optional
115131
Specific branch to ingest (defaults to the repository's default).
132+
token: str, optional
133+
GitHub personal-access token (PAT). Needed when *source* refers to a
134+
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
116135
117136
Raises
118137
------
@@ -135,6 +154,7 @@ async def _async_main(
135154
exclude_patterns=exclude_patterns,
136155
branch=branch,
137156
output=output,
157+
token=token,
138158
)
139159

140160
click.echo(f"Analysis complete! Output written to: {output}")

src/gitingest/cloning.py

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,24 @@
11
"""This module contains functions for cloning a Git repository to a local path."""
22

3-
import os
43
from pathlib import Path
54
from typing import Optional
65

76
from gitingest.config import DEFAULT_TIMEOUT
87
from gitingest.schemas import CloneConfig
9-
from gitingest.utils.git_utils import check_repo_exists, ensure_git_installed, run_command
8+
from gitingest.utils.git_utils import (
9+
check_repo_exists,
10+
create_git_auth_header,
11+
create_git_command,
12+
ensure_git_installed,
13+
run_command,
14+
validate_github_token,
15+
)
16+
from gitingest.utils.os_utils import ensure_directory
1017
from gitingest.utils.timeout_wrapper import async_timeout
1118

1219

1320
@async_timeout(DEFAULT_TIMEOUT)
14-
async def clone_repo(config: CloneConfig) -> None:
21+
async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None:
1522
"""
1623
Clone a repository to a local path based on the provided configuration.
1724
@@ -23,11 +30,15 @@ async def clone_repo(config: CloneConfig) -> None:
2330
----------
2431
config : CloneConfig
2532
The configuration for cloning the repository.
33+
token : str, optional
34+
GitHub personal-access token (PAT). Needed when *source* refers to a
35+
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
36+
Must start with 'github_pat_' for GitHub repositories.
2637
2738
Raises
2839
------
2940
ValueError
30-
If the repository is not found or if the provided URL is invalid.
41+
If the repository is not found, if the provided URL is invalid, or if the token format is invalid.
3142
"""
3243
# Extract and validate query parameters
3344
url: str = config.url
@@ -36,14 +47,22 @@ async def clone_repo(config: CloneConfig) -> None:
3647
branch: Optional[str] = config.branch
3748
partial_clone: bool = config.subpath != "/"
3849

50+
# Validate token if provided
51+
if token and url.startswith("https://github.com"):
52+
validate_github_token(token)
53+
3954
# Create parent directory if it doesn't exist
40-
await _ensure_directory(Path(local_path).parent)
55+
await ensure_directory(Path(local_path).parent)
4156

4257
# Check if the repository exists
43-
if not await check_repo_exists(url):
44-
raise ValueError("Repository not found, make sure it is public")
58+
if not await check_repo_exists(url, token=token):
59+
raise ValueError("Repository not found. Make sure it is public or that you have provided a valid token.")
60+
61+
clone_cmd = ["git"]
62+
if token and url.startswith("https://github.com"):
63+
clone_cmd += ["-c", create_git_auth_header(token)]
4564

46-
clone_cmd = ["git", "clone", "--single-branch"]
65+
clone_cmd += ["clone", "--single-branch"]
4766
# TODO: Re-enable --recurse-submodules when submodule support is needed
4867

4968
if partial_clone:
@@ -67,28 +86,10 @@ async def clone_repo(config: CloneConfig) -> None:
6786
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
6887
subpath = str(Path(subpath).parent.as_posix())
6988

70-
await run_command("git", "-C", local_path, "sparse-checkout", "set", subpath)
89+
checkout_cmd = create_git_command(["git"], local_path, url, token)
90+
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
7191

7292
# Checkout the commit if it is provided
7393
if commit:
74-
await run_command("git", "-C", local_path, "checkout", commit)
75-
76-
77-
async def _ensure_directory(path: Path) -> None:
78-
"""
79-
Ensure the directory exists, creating it if necessary.
80-
81-
Parameters
82-
----------
83-
path : Path
84-
The path to ensure exists
85-
86-
Raises
87-
------
88-
OSError
89-
If the directory cannot be created
90-
"""
91-
try:
92-
os.makedirs(path, exist_ok=True)
93-
except OSError as exc:
94-
raise OSError(f"Failed to create directory {path}: {exc}") from exc
94+
checkout_cmd = create_git_command(["git"], local_path, url, token)
95+
await run_command(*checkout_cmd, "checkout", commit)

src/gitingest/entrypoint.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ async def ingest_async(
1717
include_patterns: Optional[Union[str, Set[str]]] = None,
1818
exclude_patterns: Optional[Union[str, Set[str]]] = None,
1919
branch: Optional[str] = None,
20+
token: Optional[str] = None,
2021
output: Optional[str] = None,
2122
) -> Tuple[str, str, str]:
2223
"""
@@ -39,6 +40,9 @@ async def ingest_async(
3940
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
4041
branch : str, optional
4142
The branch to clone and ingest. If `None`, the default branch is used.
43+
token : str, optional
44+
GitHub personal-access token (PAT). Needed when *source* refers to a
45+
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
4246
output : str, optional
4347
File path where the summary and content should be written. If `None`, the results are not written to a file.
4448
@@ -71,7 +75,7 @@ async def ingest_async(
7175
query.branch = selected_branch
7276

7377
clone_config = query.extract_clone_config()
74-
clone_coroutine = clone_repo(clone_config)
78+
clone_coroutine = clone_repo(clone_config, token=token)
7579

7680
if inspect.iscoroutine(clone_coroutine):
7781
if asyncio.get_event_loop().is_running():
@@ -102,6 +106,7 @@ def ingest(
102106
include_patterns: Optional[Union[str, Set[str]]] = None,
103107
exclude_patterns: Optional[Union[str, Set[str]]] = None,
104108
branch: Optional[str] = None,
109+
token: Optional[str] = None,
105110
output: Optional[str] = None,
106111
) -> Tuple[str, str, str]:
107112
"""
@@ -124,6 +129,9 @@ def ingest(
124129
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
125130
branch : str, optional
126131
The branch to clone and ingest. If `None`, the default branch is used.
132+
token : str, optional
133+
GitHub personal-access token (PAT). Needed when *source* refers to a
134+
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
127135
output : str, optional
128136
File path where the summary and content should be written. If `None`, the results are not written to a file.
129137
@@ -146,6 +154,7 @@ def ingest(
146154
include_patterns=include_patterns,
147155
exclude_patterns=exclude_patterns,
148156
branch=branch,
157+
token=token,
149158
output=output,
150159
)
151160
)

src/gitingest/query_parsing.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ async def parse_query(
9494
)
9595

9696

97-
async def _parse_remote_repo(source: str) -> IngestionQuery:
97+
async def _parse_remote_repo(source: str, token: Optional[str] = None) -> IngestionQuery:
9898
"""
9999
Parse a repository URL into a structured query dictionary.
100100
@@ -107,6 +107,9 @@ async def _parse_remote_repo(source: str) -> IngestionQuery:
107107
----------
108108
source : str
109109
The URL or domain-less slug to parse.
110+
token : str, optional
111+
GitHub personal-access token (PAT). Needed when *source* refers to a
112+
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
110113
111114
Returns
112115
-------
@@ -128,7 +131,7 @@ async def _parse_remote_repo(source: str) -> IngestionQuery:
128131
_validate_host(tmp_host)
129132
else:
130133
# No scheme, no domain => user typed "user/repo", so we'll guess the domain.
131-
host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source))
134+
host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source), token=token)
132135
source = f"{host}/{source}"
133136

134137
source = "https://" + source
@@ -285,7 +288,7 @@ def _parse_local_dir_path(path_str: str) -> IngestionQuery:
285288
)
286289

287290

288-
async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
291+
async def try_domains_for_user_and_repo(user_name: str, repo_name: str, token: Optional[str] = None) -> str:
289292
"""
290293
Attempt to find a valid repository host for the given user_name and repo_name.
291294
@@ -295,6 +298,9 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
295298
The username or owner of the repository.
296299
repo_name : str
297300
The name of the repository.
301+
token : str, optional
302+
GitHub personal-access token (PAT). Needed when *source* refers to a
303+
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
298304
299305
Returns
300306
-------
@@ -308,6 +314,6 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
308314
"""
309315
for domain in KNOWN_GIT_HOSTS:
310316
candidate = f"https://{domain}/{user_name}/{repo_name}"
311-
if await check_repo_exists(candidate):
317+
if await check_repo_exists(candidate, token=token if domain == "github.com" else None):
312318
return domain
313319
raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.")

0 commit comments

Comments
 (0)