Skip to content

Commit 68f2132

Browse files
authored
Merge branch 'main' into fix/ui-pat-section-layout
2 parents 5d0bd1e + af95bae commit 68f2132

File tree

4 files changed

+192
-18
lines changed

4 files changed

+192
-18
lines changed

src/gitingest/cloning.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22

33
from pathlib import Path
44
from typing import Optional
5+
from urllib.parse import urlparse
56

67
from gitingest.config import DEFAULT_TIMEOUT
78
from gitingest.schemas import CloneConfig
89
from gitingest.utils.git_utils import (
10+
_is_github_host,
911
check_repo_exists,
1012
create_git_auth_header,
1113
create_git_command,
@@ -48,7 +50,7 @@ async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None:
4850
partial_clone: bool = config.subpath != "/"
4951

5052
# Validate token if provided
51-
if token and url.startswith("https://github.com"):
53+
if token and _is_github_host(url):
5254
validate_github_token(token)
5355

5456
# Create parent directory if it doesn't exist
@@ -59,8 +61,14 @@ async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None:
5961
raise ValueError("Repository not found. Make sure it is public or that you have provided a valid token.")
6062

6163
clone_cmd = ["git"]
62-
if token and url.startswith("https://github.com"):
63-
clone_cmd += ["-c", create_git_auth_header(token)]
64+
if token and _is_github_host(url):
65+
# Only pass URL if it's not the default github.com to maintain backward compatibility
66+
67+
parsed = urlparse(url)
68+
if parsed.hostname == "github.com":
69+
clone_cmd += ["-c", create_git_auth_header(token)]
70+
else:
71+
clone_cmd += ["-c", create_git_auth_header(token, url)]
6472

6573
clone_cmd += ["clone", "--single-branch"]
6674
# TODO: Re-enable --recurse-submodules when submodule support is needed

src/gitingest/utils/git_utils.py

Lines changed: 56 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,32 @@
44
import base64
55
import re
66
from typing import List, Optional, Tuple
7+
from urllib.parse import urlparse
78

89
from gitingest.utils.exceptions import InvalidGitHubTokenError
910

1011
GITHUB_PAT_PATTERN = r"^(?:github_pat_|ghp_)[A-Za-z0-9_]{36,}$"
1112

1213

14+
def _is_github_host(url: str) -> bool:
    """
    Check if a URL is from a GitHub host (github.com or GitHub Enterprise).

    This is a hostname heuristic: the host must be exactly ``github.com`` or
    start with ``github.`` (e.g. ``github.company.com``). Only the hostname is
    inspected, so the URL scheme and path do not influence the result.

    Parameters
    ----------
    url : str
        The URL to check

    Returns
    -------
    bool
        True if the URL is from a GitHub host, False otherwise. URLs without a
        hostname and URLs that cannot be parsed at all yield False.
    """
    try:
        hostname = urlparse(url).hostname or ""
    except ValueError:
        # urlparse rejects malformed netlocs (for example an unbalanced "[").
        # A URL we cannot parse is not a GitHub URL; a predicate should answer
        # False here rather than crash its caller.
        return False
    return hostname == "github.com" or hostname.startswith("github.")
31+
32+
1333
async def run_command(*args: str) -> Tuple[bytes, bytes]:
1434
"""
1535
Execute a shell command asynchronously and return (stdout, stderr) bytes.
@@ -80,7 +100,7 @@ async def check_repo_exists(url: str, token: Optional[str] = None) -> bool:
80100
RuntimeError
81101
If the curl command returns an unexpected status code.
82102
"""
83-
if token and "github.com" in url:
103+
if token and _is_github_host(url):
84104
return await _check_github_repo_exists(url, token)
85105

86106
proc = await asyncio.create_subprocess_exec(
@@ -131,12 +151,18 @@ async def _check_github_repo_exists(url: str, token: Optional[str] = None) -> bo
131151
RuntimeError
132152
If the repository is not found, if the provided URL is invalid, or if the token format is invalid.
133153
"""
134-
m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url)
154+
m = re.match(r"https?://github\.([^/]*)/([^/]+)/([^/]+?)(?:\.git)?/?$", url)
135155
if not m:
136-
raise ValueError(f"Un-recognised GitHub URL: {url!r}")
137-
owner, repo = m.groups()
138-
139-
api = f"https://api.github.com/repos/{owner}/{repo}"
156+
m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url)
157+
if not m:
158+
raise ValueError(f"Un-recognised GitHub URL: {url!r}")
159+
owner, repo = m.groups()
160+
api = f"https://api.github.com/repos/{owner}/{repo}"
161+
else:
162+
_, owner, repo = m.groups()
163+
164+
parsed = urlparse(url)
165+
api = f"https://{parsed.hostname}/api/v3/repos/{owner}/{repo}"
140166
cmd = [
141167
"curl",
142168
"--silent",
@@ -189,8 +215,14 @@ async def fetch_remote_branch_list(url: str, token: Optional[str] = None) -> Lis
189215
fetch_branches_command = ["git"]
190216

191217
# Add authentication if needed
192-
if token and "github.com" in url:
193-
fetch_branches_command += ["-c", create_git_auth_header(token)]
218+
if token and _is_github_host(url):
219+
# Only pass URL if it's not the default github.com to maintain backward compatibility
220+
221+
parsed = urlparse(url)
222+
if parsed.hostname == "github.com":
223+
fetch_branches_command += ["-c", create_git_auth_header(token)]
224+
else:
225+
fetch_branches_command += ["-c", create_git_auth_header(token, url)]
194226

195227
fetch_branches_command += ["ls-remote", "--heads", url]
196228

@@ -225,27 +257,39 @@ def create_git_command(base_cmd: List[str], local_path: str, url: str, token: Op
225257
The git command with authentication if needed
226258
"""
227259
cmd = base_cmd + ["-C", local_path]
228-
if token and url.startswith("https://github.com"):
260+
if token and _is_github_host(url):
229261
validate_github_token(token)
230-
cmd += ["-c", create_git_auth_header(token)]
262+
# Only pass URL if it's not the default github.com to maintain backward compatibility
263+
264+
parsed = urlparse(url)
265+
if parsed.hostname == "github.com":
266+
cmd += ["-c", create_git_auth_header(token)]
267+
else:
268+
cmd += ["-c", create_git_auth_header(token, url)]
231269
return cmd
232270

233271

234-
def create_git_auth_header(token: str, url: str = "https://github.com") -> str:
    """Create a Basic authentication header for GitHub git operations.

    The result is a ``key=value`` string meant to be passed to ``git -c``. The
    key is scoped to the host (and explicit port, if any) of ``url`` and always
    uses the ``https`` scheme, regardless of the scheme in ``url``.

    Parameters
    ----------
    token : str
        GitHub personal access token
    url : str
        The GitHub URL to create the authentication header for.
        Defaults to "https://github.com". If no hostname can be extracted,
        "github.com" is used.

    Returns
    -------
    str
        The git config command for setting the authentication header
    """
    parsed = urlparse(url)
    host = parsed.hostname or "github.com"

    # Keep an explicit port in the config key: git only applies an
    # ``http.<url>.*`` setting when the port matches the request URL, so
    # dropping it would silently disable auth for hosts on non-default ports.
    try:
        port = parsed.port
    except ValueError:
        # Non-numeric or out-of-range port: fall back to the host-only key.
        port = None
    if port is not None:
        host = f"{host}:{port}"

    basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode()
    return f"http.https://{host}/.extraheader=Authorization: Basic {basic}"
249293

250294

251295
def validate_github_token(token: str) -> None:

src/gitingest/utils/query_parser_utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,8 @@ def _looks_like_git_host(host: str) -> bool:
8585
"""
8686
Check if the given host looks like a Git host.
8787
88-
The current heuristic returns `True` when the host starts with `git.` (e.g. `git.example.com`) or starts with
89-
`gitlab.` (e.g. `gitlab.company.com`).
88+
The current heuristic returns `True` when the host starts with `git.` (e.g. `git.example.com`), starts with
89+
`gitlab.` (e.g. `gitlab.company.com`), or starts with `github.` (e.g. `github.company.com` for GitHub Enterprise).
9090
9191
Parameters
9292
----------
@@ -99,7 +99,7 @@ def _looks_like_git_host(host: str) -> bool:
9999
True if the host looks like a Git host, otherwise False.
100100
"""
101101
host = host.lower()
102-
return host.startswith(("git.", "gitlab."))
102+
return host.startswith(("git.", "gitlab.", "github."))
103103

104104

105105
def _validate_url_scheme(scheme: str) -> None:

tests/test_git_utils.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from gitingest.utils.exceptions import InvalidGitHubTokenError
1313
from gitingest.utils.git_utils import (
14+
_is_github_host,
1415
create_git_auth_header,
1516
create_git_command,
1617
validate_github_token,
@@ -140,3 +141,124 @@ def test_create_git_command_helper_calls(mocker, url, token, should_call):
140141
header_mock.assert_not_called()
141142
# HEADER should not be included in command list
142143
assert "HEADER" not in cmd
144+
145+
146+
@pytest.mark.parametrize(
    "url, expected",
    [
        # github.com itself, with and without the ".git" suffix
        ("https://github.com/owner/repo.git", True),
        ("http://github.com/owner/repo.git", True),
        ("https://github.com/owner/repo", True),
        # Hosts prefixed with "github." are treated as GitHub Enterprise
        ("https://github.company.com/owner/repo.git", True),
        ("https://github.enterprise.org/owner/repo.git", True),
        ("http://github.internal/owner/repo.git", True),
        ("https://github.example.co.uk/owner/repo.git", True),
        # Other forges and look-alike hosts must be rejected
        ("https://gitlab.com/owner/repo.git", False),
        ("https://bitbucket.org/owner/repo.git", False),
        ("https://git.example.com/owner/repo.git", False),
        ("https://mygithub.com/owner/repo.git", False),  # hostname does not begin with "github."
        ("https://subgithub.com/owner/repo.git", False),
        ("https://example.com/github/repo.git", False),
        # Degenerate inputs and unusual schemes
        ("", False),
        ("not-a-url", False),
        ("ftp://github.com/owner/repo.git", True),  # only the hostname matters, not the scheme
    ],
)
def test_is_github_host(url, expected):
    """GitHub and GitHub Enterprise hosts are recognised; everything else is not."""
    detected = _is_github_host(url)
    assert detected == expected
174+
175+
176+
@pytest.mark.parametrize(
    "token, url, expected_hostname",
    [
        # Plain github.com, bare and with a repository path
        ("ghp_" + "a" * 36, "https://github.com", "github.com"),
        ("ghp_" + "a" * 36, "https://github.com/owner/repo.git", "github.com"),
        # GitHub Enterprise hosts keep their own hostname in the config key
        ("ghp_" + "b" * 36, "https://github.company.com", "github.company.com"),
        ("ghp_" + "c" * 36, "https://github.enterprise.org/owner/repo.git", "github.enterprise.org"),
        ("ghp_" + "d" * 36, "http://github.internal", "github.internal"),
    ],
)
def test_create_git_auth_header_with_ghe_url(token, url, expected_hostname):
    """The generated header is scoped to the hostname taken from the given URL."""
    credentials = f"x-oauth-basic:{token}".encode()
    encoded = base64.b64encode(credentials).decode()
    assert create_git_auth_header(token, url) == (
        f"http.https://{expected_hostname}/.extraheader=Authorization: Basic {encoded}"
    )
194+
195+
196+
@pytest.mark.parametrize(
    "base_cmd, local_path, url, token, expected_auth_hostname",
    [
        # github.com: the header is scoped to the default hostname
        (
            ["git", "clone"],
            "/some/path",
            "https://github.com/owner/repo.git",
            "ghp_" + "a" * 36,
            "github.com",
        ),
        # GitHub Enterprise: the header is scoped to the custom hostname
        (
            ["git", "clone"],
            "/some/path",
            "https://github.company.com/owner/repo.git",
            "ghp_" + "b" * 36,
            "github.company.com",
        ),
        (
            ["git", "clone"],
            "/some/path",
            "https://github.enterprise.org/owner/repo.git",
            "ghp_" + "c" * 36,
            "github.enterprise.org",
        ),
        (
            ["git", "clone"],
            "/some/path",
            "http://github.internal/owner/repo.git",
            "ghp_" + "d" * 36,
            "github.internal",
        ),
    ],
)
def test_create_git_command_with_ghe_urls(base_cmd, local_path, url, token, expected_auth_hostname):
    """The built command carries an auth header scoped to the URL's GitHub host."""
    command = create_git_command(base_cmd, local_path, url, token)

    # The command must begin with the base command followed by "-C <path>"
    prefix = [*base_cmd, "-C", local_path]
    assert command[: len(prefix)] == prefix

    # The value right after "-c" is the auth header configuration
    assert "-c" in command
    header = command[command.index("-c") + 1]

    # The header must be bound to the expected host and use Basic auth
    assert f"http.https://{expected_auth_hostname}/" in header
    assert "Authorization: Basic" in header
247+
248+
249+
@pytest.mark.parametrize(
    "base_cmd, local_path, url, token",
    [
        # A token must never be attached to requests for non-GitHub hosts
        (["git", "clone"], "/some/path", "https://gitlab.com/owner/repo.git", "ghp_" + "a" * 36),
        (["git", "clone"], "/some/path", "https://bitbucket.org/owner/repo.git", "ghp_" + "b" * 36),
        (["git", "clone"], "/some/path", "https://git.example.com/owner/repo.git", "ghp_" + "c" * 36),
    ],
)
def test_create_git_command_ignores_non_github_urls(base_cmd, local_path, url, token):
    """Non-GitHub URLs yield just the base command plus ``-C <path>``, with no auth header."""
    # Nothing beyond the base command and the -C option may be present
    assert create_git_command(base_cmd, local_path, url, token) == [*base_cmd, "-C", local_path]

0 commit comments

Comments
 (0)