Skip to content

Commit 9bdee8f

Browse files
feat: make parser domain-agnostic to support multiple Git hosts
- added list of known domains/Git hosts in `query_parser.py`
- fixed bug from [#115](#115): corrected case handling for URL components — scheme, domain, username, and repository are case-insensitive, but the path components after them (e.g., file names, branches) are case-sensitive
- implemented `try_domains_for_user_and_repo` in `query_parser.py` to iteratively guess the correct domain until one succeeds or the supported hosts are exhausted
- added helper functions `_get_user_and_repo_from_path`, `_validate_host`, and `_validate_scheme` in `query_parser.py`
- extended `_parse_repo_source` in `query_parser.py` to be Git host agnostic by using `try_domains_for_user_and_repo`
- added tests `test_parse_url_unsupported_host` and `test_parse_query_with_branch` in `test_query_parser.py`
- created new file `test_git_host_agnostic.py` to verify domain/Git host agnostic behavior
1 parent a57f614 commit 9bdee8f

File tree

4 files changed

+251
-52
lines changed

4 files changed

+251
-52
lines changed

src/gitingest/query_parser.py

Lines changed: 157 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,16 @@
1111
from config import TMP_BASE_PATH
1212
from gitingest.exceptions import InvalidPatternError
1313
from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
14+
from gitingest.repository_clone import _check_repo_exists
1415

15-
HEX_DIGITS = set(string.hexdigits)
16+
HEX_DIGITS: set[str] = set(string.hexdigits)
17+
18+
KNOWN_GIT_HOSTS: list[str] = [
19+
"github.com",
20+
"gitlab.com",
21+
"bitbucket.org",
22+
"gitea.com",
23+
]
1624

1725

1826
async def parse_query(
@@ -48,16 +56,16 @@ async def parse_query(
4856
A dictionary containing the parsed query parameters, including 'max_file_size',
4957
'ignore_patterns', and 'include_patterns'.
5058
"""
51-
# Normalize and clean up the source string to make it case-insensitive
52-
source = source.lower().strip()
5359

5460
# Determine the parsing method based on the source type
55-
if from_web or source.startswith("https://") or "github.com" in source:
56-
query = _parse_repo_source(source)
61+
if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
62+
# We either have a full URL or a domain-less slug
63+
query = await _parse_repo_source(source)
5764
else:
65+
# Local path scenario
5866
query = _parse_path(source)
5967

60-
# Process ignore patterns
68+
# Combine ignore patterns
6169
ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy()
6270
if ignore_patterns:
6371
ignore_patterns_list += _parse_patterns(ignore_patterns)
@@ -69,7 +77,6 @@ async def parse_query(
6977
else:
7078
parsed_include = None
7179

72-
# Update the query dictionary with max_file_size and processed patterns
7380
query.update(
7481
{
7582
"max_file_size": max_file_size,
@@ -80,52 +87,54 @@ async def parse_query(
8087
return query
8188

8289

83-
def _parse_repo_source(url: str) -> dict[str, Any]:
90+
async def _parse_repo_source(source: str) -> dict[str, Any]:
8491
"""
85-
Parse a GitHub repository URL into a structured query dictionary.
92+
Parse a repository URL into a structured query dictionary.
8693
87-
This function extracts relevant information from a GitHub URL, such as the username,
88-
repository name, commit, branch, and subpath, and returns them in a structured format.
94+
If source is:
95+
- A fully qualified URL (https://gitlab.com/...), parse & verify that domain
96+
- A URL missing 'https://' (gitlab.com/...), add 'https://' and parse
97+
- A 'slug' (like 'pandas-dev/pandas'), attempt known domains until we find one that exists.
8998
9099
Parameters
91100
----------
92-
url : str
93-
The GitHub URL to parse.
101+
source : str
102+
The URL or domain-less slug to parse.
94103
95104
Returns
96105
-------
97106
dict[str, Any]
98-
A dictionary containing the parsed details of the GitHub repository, including
99-
the username, repository name, commit, branch, and other relevant information.
100-
101-
Raises
102-
------
103-
ValueError
104-
If the URL is invalid or does not correspond to a valid Git repository.
107+
A dictionary containing the parsed details of the repository, including the username,
108+
repository name, commit, branch, and other relevant information.
105109
"""
106-
# Clean up the URL
107-
url = url.split(" ")[0] # remove trailing text
108-
url = unquote(url) # decode URL-encoded characters
110+
source = unquote(source)
109111

110-
if not url.startswith(("https://", "http://")):
111-
url = "https://" + url
112+
# Attempt to parse
113+
parsed_url = urlparse(source)
112114

113-
# Parse URL and reconstruct it without query parameters and fragments
114-
parsed_url = urlparse(url)
115-
url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
115+
if parsed_url.scheme:
116+
_validate_scheme(parsed_url.scheme)
117+
_validate_host(parsed_url.netloc.lower())
116118

117-
# Extract domain and path
118-
url_parts = url.split("/")
119-
domain = url_parts[2]
120-
path_parts = url_parts[3:]
119+
else: # Will be of the form 'host/user/repo' or 'user/repo'
120+
tmp_host = source.split("/")[0].lower()
121+
if "." in tmp_host:
122+
_validate_host(tmp_host)
123+
else:
124+
# No scheme, no domain => user typed "user/repo", so we'll guess the domain.
125+
host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source))
126+
source = f"{host}/{source}"
121127

122-
if len(path_parts) < 2:
123-
raise ValueError("Invalid repository URL. Please provide a valid Git repository URL.")
128+
source = "https://" + source
129+
parsed_url = urlparse(source)
130+
131+
host = parsed_url.netloc.lower()
132+
user_name, repo_name = _get_user_and_repo_from_path(parsed_url.path)
124133

125-
user_name = path_parts[0]
126-
repo_name = path_parts[1]
127134
_id = str(uuid.uuid4())
128135
slug = f"{user_name}-{repo_name}"
136+
local_path = Path(TMP_BASE_PATH) / _id / slug
137+
url = f"https://{host}/{user_name}/{repo_name}"
129138

130139
parsed = {
131140
"user_name": user_name,
@@ -134,31 +143,39 @@ def _parse_repo_source(url: str) -> dict[str, Any]:
134143
"branch": None,
135144
"commit": None,
136145
"subpath": "/",
137-
"local_path": Path(TMP_BASE_PATH) / _id / slug,
138-
"url": f"https://{domain}/{user_name}/{repo_name}",
139-
"slug": slug,
146+
"local_path": local_path,
147+
"url": url,
148+
"slug": slug, # e.g. "pandas-dev-pandas"
140149
"id": _id,
141150
}
142151

143-
# If this is an issues page or pull requests, return early without processing subpath
144-
if len(path_parts) > 2 and (path_parts[2] == "issues" or path_parts[2] == "pull"):
152+
remaining_parts = parsed_url.path.strip("/").split("/")[2:]
153+
154+
if not remaining_parts:
145155
return parsed
146156

157+
possible_type = remaining_parts.pop(0) # e.g. 'issues', 'pull', 'tree', 'blob'
158+
147159
# If no extra path parts, just return
148-
if len(path_parts) < 4:
160+
if not remaining_parts:
161+
return parsed
162+
163+
# If this is an issues page or pull requests, return early without processing subpath
164+
if remaining_parts and possible_type in ("issues", "pull"):
149165
return parsed
150166

151-
parsed["type"] = path_parts[2] # Usually 'tree' or 'blob'
152-
commit = path_parts[3]
167+
parsed["type"] = possible_type
153168

154-
if _is_valid_git_commit_hash(commit):
155-
parsed["commit"] = commit
156-
if len(path_parts) > 4:
157-
parsed["subpath"] += "/".join(path_parts[4:])
169+
# Commit or branch
170+
commit_or_branch = remaining_parts.pop(0)
171+
if _is_valid_git_commit_hash(commit_or_branch):
172+
parsed["commit"] = commit_or_branch
158173
else:
159-
parsed["branch"] = commit
160-
if len(path_parts) > 4:
161-
parsed["subpath"] += "/".join(path_parts[4:])
174+
parsed["branch"] = commit_or_branch
175+
176+
# Subpath if anything left
177+
if remaining_parts:
178+
parsed["subpath"] += "/".join(remaining_parts)
162179

163180
return parsed
164181

@@ -314,3 +331,92 @@ def _is_valid_pattern(pattern: str) -> bool:
314331
True if the pattern is valid, otherwise False.
315332
"""
316333
return all(c.isalnum() or c in "-_./+*" for c in pattern)
334+
335+
336+
async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
    """
    Find the first known Git host on which ``user_name/repo_name`` exists.

    Every entry of ``KNOWN_GIT_HOSTS`` is probed in list order by passing the candidate
    URL ``https://<host>/<user_name>/<repo_name>`` to ``_check_repo_exists``. The first
    host for which that check is truthy is returned.

    Parameters
    ----------
    user_name : str
        The username or owner of the repository.
    repo_name : str
        The name of the repository.

    Returns
    -------
    str
        The domain of the first known host for which the existence check succeeded.

    Raises
    ------
    ValueError
        If the existence check fails for every known host.
    """
    for host in KNOWN_GIT_HOSTS:
        repo_found = await _check_repo_exists(f"https://{host}/{user_name}/{repo_name}")
        if not repo_found:
            continue
        # Hosts are probed one at a time, so list order decides which host wins
        # when the same user/repo pair exists on several of them.
        return host
    raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.")
362+
363+
364+
def _get_user_and_repo_from_path(path: str) -> tuple[str, str]:
365+
"""
366+
Extract the user and repository names from a given path.
367+
368+
Parameters
369+
----------
370+
path : str
371+
The path to extract the user and repository names from.
372+
373+
Returns
374+
-------
375+
tuple[str, str]
376+
A tuple containing the user and repository names.
377+
378+
Raises
379+
------
380+
ValueError
381+
If the path does not contain at least two parts.
382+
"""
383+
path_parts = path.lower().strip("/").split("/")
384+
if len(path_parts) < 2:
385+
raise ValueError(f"Invalid repository URL '{path}'")
386+
return path_parts[0], path_parts[1]
387+
388+
389+
def _validate_host(host: str) -> None:
    """
    Ensure that ``host`` is one of the entries of ``KNOWN_GIT_HOSTS``.

    The comparison is an exact membership test; the host is used as given, with no
    normalisation performed here.

    Parameters
    ----------
    host : str
        The host to validate.

    Raises
    ------
    ValueError
        If the host is not a known Git host.
    """
    if host in KNOWN_GIT_HOSTS:
        return
    raise ValueError(f"Unknown domain '{host}' in URL")
405+
406+
407+
def _validate_scheme(scheme: str) -> None:
408+
"""
409+
Validate the given scheme against the known schemes.
410+
411+
Parameters
412+
----------
413+
scheme : str
414+
The scheme to validate.
415+
416+
Raises
417+
------
418+
ValueError
419+
If the scheme is not 'http' or 'https'.
420+
"""
421+
if scheme not in ("https", "http"):
422+
raise ValueError(f"Invalid URL scheme '{scheme}' in URL")

src/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ async def process_folder(folder: Path) -> None:
7878
# Extract owner and repository name from the filename
7979
if txt_files and "-" in (filename := txt_files[0].stem):
8080
owner, repo = filename.split("-", 1)
81-
repo_url = f"https://github.com/{owner}/{repo}"
81+
repo_url = f"{owner}/{repo}"
8282
with open("history.txt", mode="a", encoding="utf-8") as history:
8383
history.write(f"{repo_url}\n")
8484

tests/test_git_host_agnostic.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
""" Tests to verify that the query parser is Git host agnostic. """
2+
3+
import pytest
4+
5+
from gitingest.query_parser import parse_query
6+
7+
8+
@pytest.mark.parametrize(
    "urls, expected_user, expected_repo, expected_url",
    [
        (
            [
                "https://github.com/tiangolo/fastapi",
                "github.com/tiangolo/fastapi",
                "tiangolo/fastapi",
            ],
            "tiangolo",
            "fastapi",
            "https://github.com/tiangolo/fastapi",
        ),
        (
            [
                "https://gitlab.com/gitlab-org/gitlab-runner",
                "gitlab.com/gitlab-org/gitlab-runner",
                "gitlab-org/gitlab-runner",
            ],
            "gitlab-org",
            "gitlab-runner",
            "https://gitlab.com/gitlab-org/gitlab-runner",
        ),
        (
            [
                "https://bitbucket.org/na-dna/llm-knowledge-share",
                "bitbucket.org/na-dna/llm-knowledge-share",
                "na-dna/llm-knowledge-share",
            ],
            "na-dna",
            "llm-knowledge-share",
            "https://bitbucket.org/na-dna/llm-knowledge-share",
        ),
        (
            [
                "https://gitea.com/xorm/xorm",
                "gitea.com/xorm/xorm",
                "xorm/xorm",
            ],
            "xorm",
            "xorm",
            "https://gitea.com/xorm/xorm",
        ),
    ],
)
@pytest.mark.asyncio
async def test_parse_query_without_host(
    urls: list[str],
    expected_user: str,
    expected_repo: str,
    expected_url: str,
) -> None:
    """
    Check that every spelling of a repository resolves to the same parsed query.

    Each parameter set lists three spellings of one repository: the full URL, the URL
    without a scheme, and the bare ``user/repo`` slug. All of them must yield the same
    user, repository, canonical URL and slug, with no branch, commit, type or subpath.
    """
    expected_fields = {
        "user_name": expected_user,
        "repo_name": expected_repo,
        "url": expected_url,
        "slug": f"{expected_user}-{expected_repo}",
        "subpath": "/",
    }
    for source in urls:
        parsed = await parse_query(source, max_file_size=50, from_web=True)
        # Common assertions for all cases
        for field, expected_value in expected_fields.items():
            assert parsed[field] == expected_value
        assert parsed["id"] is not None
        for unset_field in ("branch", "commit", "type"):
            assert parsed[unset_field] is None

tests/test_query_parser.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,3 +252,25 @@ async def test_parse_url_with_query_and_fragment() -> None:
252252
assert result["user_name"] == "user"
253253
assert result["repo_name"] == "repo"
254254
assert result["url"] == "https://github.com/user/repo" # URL should be cleaned
255+
256+
257+
async def test_parse_url_unsupported_host() -> None:
    """A URL whose domain is not a known Git host must be rejected with ``ValueError``."""
    expected_message = "Unknown domain 'only-domain.com' in URL"
    with pytest.raises(ValueError, match=expected_message):
        await _parse_repo_source("https://only-domain.com")
261+
262+
263+
async def test_parse_query_with_branch() -> None:
    """
    Check that a ``blob`` URL is split into repository, type, branch and subpath.

    The subpath contains upper-case characters (``ISSUE_TEMPLATE``), so this test also
    verifies that path components after the repository name keep their original case.
    """
    url = "https://github.com/pandas-dev/pandas/blob/2.2.x/.github/ISSUE_TEMPLATE/documentation_improvement.yaml"
    result = await parse_query(url, max_file_size=10**9, from_web=True)
    assert result["user_name"] == "pandas-dev"
    assert result["repo_name"] == "pandas"
    assert result["url"] == "https://github.com/pandas-dev/pandas"
    assert result["slug"] == "pandas-dev-pandas"
    assert result["id"] is not None
    # On failure pytest already reports both sides of the comparison, so no debug
    # output is needed here.
    assert result["subpath"] == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml"
    assert result["branch"] == "2.2.x"
    assert result["commit"] is None
    assert result["type"] == "blob"

0 commit comments

Comments
 (0)