11import os
22import uuid
3- from typing import Any , Dict , List , Optional , Union
43
4+ from urllib .parse import unquote
5+ from typing import Any , Dict , List , Optional , Union
56from gitingest .ignore_patterns import DEFAULT_IGNORE_PATTERNS
67
78TMP_BASE_PATH = "../tmp"
@@ -22,6 +23,8 @@ def parse_url(url: str) -> Dict[str, Any]:
2223 }
2324
2425 url = url .split (" " )[0 ]
26+ url = unquote (url ) # Decode URL-encoded characters
27+
2528 if not url .startswith ('https://' ):
2629 url = 'https://' + url
2730
@@ -36,19 +39,34 @@ def parse_url(url: str) -> Dict[str, Any]:
3639 parsed ["user_name" ] = path_parts [0 ]
3740 parsed ["repo_name" ] = path_parts [1 ]
3841
39- # Keep original URL format
42+ # Keep original URL format but with decoded components
4043 parsed ["url" ] = f"https://{ domain } /{ parsed ['user_name' ]} /{ parsed ['repo_name' ]} "
4144 parsed ['slug' ] = f"{ parsed ['user_name' ]} -{ parsed ['repo_name' ]} "
4245 parsed ["id" ] = str (uuid .uuid4 ())
4346 parsed ["local_path" ] = f"{ TMP_BASE_PATH } /{ parsed ['id' ]} /{ parsed ['slug' ]} "
4447
4548 if len (path_parts ) > 3 :
46- parsed ["type" ] = path_parts [2 ]
47- parsed ["branch" ] = path_parts [3 ]
48- if len (parsed ['branch' ]) == 40 and all (c in '0123456789abcdefABCDEF' for c in parsed ['branch' ]):
49- parsed ["commit" ] = parsed ['branch' ]
5049
51- parsed ["subpath" ] = "/" + "/" .join (path_parts [4 :])
50+ parsed ["type" ] = path_parts [2 ] # Usually 'tree' or 'blob'
51+
52+ # Find the commit hash or reconstruct the branch name
53+ remaining_parts = path_parts [3 :]
54+ if remaining_parts [0 ] and len (remaining_parts [0 ]) == 40 and all (c in '0123456789abcdefABCDEF' for c in remaining_parts [0 ]):
55+ parsed ["commit" ] = remaining_parts [0 ]
56+ parsed ["subpath" ] = "/" + "/" .join (remaining_parts [1 :]) if len (remaining_parts ) > 1 else "/"
57+ else :
58+ # Handle branch names with slashes and special characters
59+ for i , part in enumerate (remaining_parts ):
60+ if part in ('tree' , 'blob' ):
61+ # Found another type indicator, everything before this was the branch name
62+ parsed ["branch" ] = "/" .join (remaining_parts [:i ])
63+ parsed ["subpath" ] = "/" + "/" .join (remaining_parts [i + 2 :]) if len (remaining_parts ) > i + 2 else "/"
64+ break
65+ else :
66+ # No additional type indicator found, assume everything is part of the branch name
67+ parsed ["branch" ] = "/" .join (remaining_parts )
68+ parsed ["subpath" ] = "/"
69+
5270
5371 return parsed
5472
0 commit comments