1111from config import TMP_BASE_PATH
1212from gitingest .exceptions import InvalidPatternError
1313from gitingest .ignore_patterns import DEFAULT_IGNORE_PATTERNS
14+ from gitingest .repository_clone import _check_repo_exists
1415
# Characters accepted as hexadecimal digits (both upper and lower case).
HEX_DIGITS: set[str] = set(string.hexdigits)

# Hosts accepted by `_validate_host`, and the hosts probed (in this order) by
# `try_domains_for_user_and_repo` when the input is a bare "user/repo" slug.
KNOWN_GIT_HOSTS: list[str] = [
    "github.com",
    "gitlab.com",
    "bitbucket.org",
    "gitea.com",
]
1624
1725
1826async def parse_query (
@@ -48,16 +56,16 @@ async def parse_query(
4856 A dictionary containing the parsed query parameters, including 'max_file_size',
4957 'ignore_patterns', and 'include_patterns'.
5058 """
51- # Normalize and clean up the source string to make it case-insensitive
52- source = source .lower ().strip ()
5359
5460 # Determine the parsing method based on the source type
55- if from_web or source .startswith ("https://" ) or "github.com" in source :
56- query = _parse_repo_source (source )
61+ if from_web or urlparse (source ).scheme in ("https" , "http" ) or any (h in source for h in KNOWN_GIT_HOSTS ):
62+ # We either have a full URL or a domain-less slug
63+ query = await _parse_repo_source (source )
5764 else :
65+ # Local path scenario
5866 query = _parse_path (source )
5967
60- # Process ignore patterns
68+ # Combine ignore patterns
6169 ignore_patterns_list = DEFAULT_IGNORE_PATTERNS .copy ()
6270 if ignore_patterns :
6371 ignore_patterns_list += _parse_patterns (ignore_patterns )
@@ -69,7 +77,6 @@ async def parse_query(
6977 else :
7078 parsed_include = None
7179
72- # Update the query dictionary with max_file_size and processed patterns
7380 query .update (
7481 {
7582 "max_file_size" : max_file_size ,
@@ -80,52 +87,54 @@ async def parse_query(
8087 return query
8188
8289
83- def _parse_repo_source (url : str ) -> dict [str , Any ]:
90+ async def _parse_repo_source (source : str ) -> dict [str , Any ]:
8491 """
85- Parse a GitHub repository URL into a structured query dictionary.
92+ Parse a repository URL into a structured query dictionary.
8693
87- This function extracts relevant information from a GitHub URL, such as the username,
88- repository name, commit, branch, and subpath, and returns them in a structured format.
94+ If source is:
95+ - A fully qualified URL (https://gitlab.com/...), parse & verify that domain
96+ - A URL missing 'https://' (gitlab.com/...), add 'https://' and parse
97+ - A 'slug' (like 'pandas-dev/pandas'), attempt known domains until we find one that exists.
8998
9099 Parameters
91100 ----------
92- url : str
93- The GitHub URL to parse.
101+ source : str
102+ The URL or domain-less slug to parse.
94103
95104 Returns
96105 -------
97106 dict[str, Any]
98- A dictionary containing the parsed details of the GitHub repository, including
99- the username, repository name, commit, branch, and other relevant information.
100-
101- Raises
102- ------
103- ValueError
104- If the URL is invalid or does not correspond to a valid Git repository.
107+ A dictionary containing the parsed details of the repository, including the username,
108+ repository name, commit, branch, and other relevant information.
105109 """
106- # Clean up the URL
107- url = url .split (" " )[0 ] # remove trailing text
108- url = unquote (url ) # decode URL-encoded characters
110+ source = unquote (source )
109111
110- if not url . startswith (( "https://" , "http://" )):
111- url = "https://" + url
112+ # Attempt to parse
113+ parsed_url = urlparse ( source )
112114
113- # Parse URL and reconstruct it without query parameters and fragments
114- parsed_url = urlparse ( url )
115- url = f" { parsed_url .scheme } :// { parsed_url . netloc } { parsed_url . path } "
115+ if parsed_url . scheme :
116+ _validate_scheme ( parsed_url . scheme )
117+ _validate_host ( parsed_url .netloc . lower ())
116118
117- # Extract domain and path
118- url_parts = url .split ("/" )
119- domain = url_parts [2 ]
120- path_parts = url_parts [3 :]
119+ else : # Will be of the form 'host/user/repo' or 'user/repo'
120+ tmp_host = source .split ("/" )[0 ].lower ()
121+ if "." in tmp_host :
122+ _validate_host (tmp_host )
123+ else :
124+ # No scheme, no domain => user typed "user/repo", so we'll guess the domain.
125+ host = await try_domains_for_user_and_repo (* _get_user_and_repo_from_path (source ))
126+ source = f"{ host } /{ source } "
121127
122- if len (path_parts ) < 2 :
123- raise ValueError ("Invalid repository URL. Please provide a valid Git repository URL." )
128+ source = "https://" + source
129+ parsed_url = urlparse (source )
130+
131+ host = parsed_url .netloc .lower ()
132+ user_name , repo_name = _get_user_and_repo_from_path (parsed_url .path )
124133
125- user_name = path_parts [0 ]
126- repo_name = path_parts [1 ]
127134 _id = str (uuid .uuid4 ())
128135 slug = f"{ user_name } -{ repo_name } "
136+ local_path = Path (TMP_BASE_PATH ) / _id / slug
137+ url = f"https://{ host } /{ user_name } /{ repo_name } "
129138
130139 parsed = {
131140 "user_name" : user_name ,
@@ -134,31 +143,39 @@ def _parse_repo_source(url: str) -> dict[str, Any]:
134143 "branch" : None ,
135144 "commit" : None ,
136145 "subpath" : "/" ,
137- "local_path" : Path ( TMP_BASE_PATH ) / _id / slug ,
138- "url" : f"https:// { domain } / { user_name } / { repo_name } " ,
139- "slug" : slug ,
146+ "local_path" : local_path ,
147+ "url" : url ,
148+ "slug" : slug , # e.g. "pandas-dev-pandas"
140149 "id" : _id ,
141150 }
142151
143- # If this is an issues page or pull requests, return early without processing subpath
144- if len (path_parts ) > 2 and (path_parts [2 ] == "issues" or path_parts [2 ] == "pull" ):
152+ remaining_parts = parsed_url .path .strip ("/" ).split ("/" )[2 :]
153+
154+ if not remaining_parts :
145155 return parsed
146156
157+ possible_type = remaining_parts .pop (0 ) # e.g. 'issues', 'pull', 'tree', 'blob'
158+
147159 # If no extra path parts, just return
148- if len (path_parts ) < 4 :
160+ if not remaining_parts :
161+ return parsed
162+
163+ # If this is an issues page or pull requests, return early without processing subpath
164+ if remaining_parts and possible_type in ("issues" , "pull" ):
149165 return parsed
150166
151- parsed ["type" ] = path_parts [2 ] # Usually 'tree' or 'blob'
152- commit = path_parts [3 ]
167+ parsed ["type" ] = possible_type
153168
154- if _is_valid_git_commit_hash ( commit ):
155- parsed [ "commit" ] = commit
156- if len ( path_parts ) > 4 :
157- parsed ["subpath " ] += "/" . join ( path_parts [ 4 :])
169+ # Commit or branch
170+ commit_or_branch = remaining_parts . pop ( 0 )
171+ if _is_valid_git_commit_hash ( commit_or_branch ) :
172+ parsed ["commit " ] = commit_or_branch
158173 else :
159- parsed ["branch" ] = commit
160- if len (path_parts ) > 4 :
161- parsed ["subpath" ] += "/" .join (path_parts [4 :])
174+ parsed ["branch" ] = commit_or_branch
175+
176+ # Subpath if anything left
177+ if remaining_parts :
178+ parsed ["subpath" ] += "/" .join (remaining_parts )
162179
163180 return parsed
164181
@@ -314,3 +331,92 @@ def _is_valid_pattern(pattern: str) -> bool:
314331 True if the pattern is valid, otherwise False.
315332 """
316333 return all (c .isalnum () or c in "-_./+*" for c in pattern )
334+
335+
async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
    """
    Attempt to find a valid repository host for the given user_name and repo_name.

    Every host in ``KNOWN_GIT_HOSTS`` is tried in list order; the first one for which
    ``_check_repo_exists`` reports a truthy result wins.

    Parameters
    ----------
    user_name : str
        The username or owner of the repository.
    repo_name : str
        The name of the repository.

    Returns
    -------
    str
        The domain of the valid repository host (e.g. ``"github.com"``), without scheme.

    Raises
    ------
    ValueError
        If no valid repository host is found for the given user_name and repo_name.
    """
    for domain in KNOWN_GIT_HOSTS:
        # The candidate must be a clean URL: any stray whitespace would make the
        # existence check probe a location that can never match a real repository.
        candidate = f"https://{domain}/{user_name}/{repo_name}"
        # NOTE(review): presumably this performs a remote lookup per host - confirm
        # against gitingest.repository_clone._check_repo_exists.
        if await _check_repo_exists(candidate):
            return domain
    raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.")
362+
363+
364+ def _get_user_and_repo_from_path (path : str ) -> tuple [str , str ]:
365+ """
366+ Extract the user and repository names from a given path.
367+
368+ Parameters
369+ ----------
370+ path : str
371+ The path to extract the user and repository names from.
372+
373+ Returns
374+ -------
375+ tuple[str, str]
376+ A tuple containing the user and repository names.
377+
378+ Raises
379+ ------
380+ ValueError
381+ If the path does not contain at least two parts.
382+ """
383+ path_parts = path .lower ().strip ("/" ).split ("/" )
384+ if len (path_parts ) < 2 :
385+ raise ValueError (f"Invalid repository URL '{ path } '" )
386+ return path_parts [0 ], path_parts [1 ]
387+
388+
def _validate_host(host: str) -> None:
    """
    Validate the given host against the known Git hosts.

    The check is an exact, case-sensitive membership test against ``KNOWN_GIT_HOSTS``;
    no normalisation is done here, so callers are expected to pass an already
    lower-cased host. A host carrying a port or a ``www.`` prefix does not match.

    Parameters
    ----------
    host : str
        The host to validate.

    Raises
    ------
    ValueError
        If the host is not a known Git host.
    """
    if host not in KNOWN_GIT_HOSTS:
        raise ValueError(f"Unknown domain '{host}' in URL")
405+
406+
407+ def _validate_scheme (scheme : str ) -> None :
408+ """
409+ Validate the given scheme against the known schemes.
410+
411+ Parameters
412+ ----------
413+ scheme : str
414+ The scheme to validate.
415+
416+ Raises
417+ ------
418+ ValueError
419+ If the scheme is not 'http' or 'https'.
420+ """
421+ if scheme not in ("https" , "http" ):
422+ raise ValueError (f"Invalid URL scheme '{ scheme } ' in URL" )
0 commit comments