Skip to content

Commit c281db3

Browse files
author
Dai Hung PHAM
committed
fix(ignore-patterns): resolve .git and nested path exclusions with improved .gitignore parsing
1 parent 880b6c3 commit c281db3

File tree

2 files changed

+153
-10
lines changed

2 files changed

+153
-10
lines changed

src/gitingest/ingest_from_query.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,28 @@ def should_include(path: str, base_path: str, include_patterns: List[str]) -> bo
1919
return include
2020

2121
def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool:
    """
    Check if a file or directory should be ignored.

    A path is ignored when any pattern matches either its path relative to
    ``base_path`` (using forward slashes) or its bare name. Directories are
    additionally tested with a trailing ``/`` so that directory-only patterns
    such as ``build/`` or ``.git/`` match them at any depth.

    Args:
        path (str): Path to check.
        base_path (str): Root base path.
        ignore_patterns (List[str]): List of patterns to ignore.

    Returns:
        bool: True if the path should be ignored.
    """
    rel_path = os.path.relpath(path, base_path).replace("\\", "/")
    name = os.path.basename(path)
    # Query the filesystem once instead of once per pattern.
    is_dir = os.path.isdir(path)

    for pattern in ignore_patterns:
        # An empty pattern carries no intent; never let it match anything.
        if not pattern:
            continue
        if fnmatch(rel_path, pattern) or fnmatch(name, pattern):
            return True
        # Directory-only patterns end with '/', so compare against the
        # slash-terminated forms of both the relative path and the name.
        if is_dir and (fnmatch(rel_path + "/", pattern) or fnmatch(name + "/", pattern)):
            return True
    return False
2941

42+
43+
3044
def is_safe_symlink(symlink_path: str, base_path: str) -> bool:
3145
"""Check if a symlink points to a location within the base directory."""
3246
try:
@@ -96,8 +110,10 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int =
96110
try:
97111
for item in os.listdir(path):
98112
item_path = os.path.join(path, item)
113+
print(f"Checking path: {path}")
99114

100115
if should_exclude(item_path, base_path, ignore_patterns):
116+
print(f"Checking path: {path}")
101117
continue
102118

103119
is_file = os.path.isfile(item_path)

src/gitingest/parse_query.py

Lines changed: 133 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -93,40 +93,81 @@ def parse_url(url: str) -> dict:
9393
parsed["subpath"] = "/" + "/".join(path_parts[4:])
9494
return parsed
9595

96+
### 📝 **Normalize Pattern**
9697
def normalize_pattern(pattern: str) -> str:
    """
    Normalize a pattern by stripping and formatting.

    Surrounding whitespace and leading path separators are removed, and a
    pattern that ends with a separator gets ``*`` appended so it matches
    everything below that directory. Both ``/`` and the platform separator
    are recognised, because patterns are validated and matched in their
    forward-slash form regardless of the operating system.

    Args:
        pattern (str): The ignore pattern.

    Returns:
        str: Normalized pattern.
    """
    separators = ("/", os.sep)
    pattern = pattern.strip().lstrip("".join(separators))
    if pattern.endswith(separators):
        pattern += "*"
    return pattern
102112

113+
### 📝 **Parse Patterns**
103114
def parse_patterns(pattern: Union[List[str], str]) -> List[str]:
    """
    Parse and validate patterns.

    Accepts either a list of patterns or a single comma-separated string.
    Each entry is validated against the allowed character set and then
    normalized. Entries that are empty (for example from ``"a,,b"``, a
    trailing comma, or a bare ``/``) are dropped instead of being returned
    as empty patterns.

    Args:
        pattern (Union[List[str], str]): Patterns to parse.

    Returns:
        List[str]: Parsed patterns.

    Raises:
        ValueError: If an entry contains a character outside the allowed set.
    """
    if isinstance(pattern, list):
        pattern = ",".join(pattern)

    allowed_punctuation = "-_./+*"
    parsed = []
    for raw in pattern.split(","):
        candidate = raw.strip()
        if not candidate:
            continue
        if not all(c.isalnum() or c in allowed_punctuation for c in candidate):
            raise ValueError(
                f"Pattern '{raw}' contains invalid characters. Only alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed."
            )
        normalized = normalize_pattern(candidate)
        # Normalization can reduce an entry such as "/" to nothing.
        if normalized:
            parsed.append(normalized)
    return parsed
112133

134+
### 📝 **Override Ignore Patterns**
113135
def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]:
    """
    Remove include patterns from ignore patterns.

    Every occurrence of an included pattern is removed, so a pattern that
    was added to the ignore list more than once does not survive. The
    ``ignore_patterns`` list is updated in place and the same list object
    is returned.

    Args:
        ignore_patterns (List[str]): Ignore patterns.
        include_patterns (List[str]): Include patterns.

    Returns:
        List[str]: Updated ignore patterns.
    """
    included = set(include_patterns)
    # Slice assignment keeps the caller's list object while dropping all matches.
    ignore_patterns[:] = [p for p in ignore_patterns if p not in included]
    return ignore_patterns
118150

119151

152+
### 📝 **Parse Path**
120153
def parse_path(path: str) -> dict:
    """
    Parse a local file path.

    The slug is derived from the absolute path, so relative inputs and
    inputs with a trailing separator still yield ``<parent>/<name>``.

    Args:
        path (str): File path.

    Returns:
        dict: Parsed path details with the keys ``local_path``, ``slug``,
        ``subpath``, ``id`` (a fresh UUID4 string) and ``url`` (always None).
    """
    local_path = os.path.abspath(path)
    parent_name = os.path.basename(os.path.dirname(local_path))
    return {
        "local_path": local_path,
        "slug": parent_name + "/" + os.path.basename(local_path),
        "subpath": "/",
        "id": str(uuid.uuid4()),
        "url": None,
    }
170+
130171

131172
def parse_query(source: str, max_file_size: int, from_web: bool, include_patterns: Union[List[str], str] = None, ignore_patterns: Union[List[str], str] = None) -> dict:
132173
if from_web:
@@ -154,3 +195,89 @@ def parse_query(source: str, max_file_size: int, from_web: bool, include_pattern
154195

155196
return query
156197

198+
### 📝 **Parse .gitignore**
199+
def parse_gitignore(gitignore_path: str) -> List[str]:
    """
    Parse .gitignore and return ignore patterns.

    Blank lines and ``#`` comments are skipped. Negation lines (``!pattern``)
    are skipped too, because a flat list of ignore patterns cannot express
    "re-include". A leading ``/`` (anchor to the .gitignore directory) is
    removed so the pattern can match paths relative to that directory.
    Entries that name an existing directory are given a trailing ``/``.

    Args:
        gitignore_path (str): Path to the .gitignore file.

    Returns:
        List[str]: List of ignore patterns; empty if the file does not exist.
    """
    if not os.path.exists(gitignore_path):
        return []

    root = os.path.dirname(gitignore_path)
    ignore_patterns = []
    with open(gitignore_path, "r", encoding="utf-8") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith(("#", "!")):
                continue
            # Without this, os.path.join() would discard `root` for "/name"
            # and probe the filesystem root instead of the repository.
            line = line.lstrip("/")
            if not line:
                continue
            # Ensure directory patterns end with '/'
            if os.path.isdir(os.path.join(root, line)):
                line = line.rstrip("/") + "/"
            ignore_patterns.append(line)
    return ignore_patterns
220+
221+
222+
### 📝 **Parse Query**
223+
def parse_query(source: str, max_file_size: int, from_web: bool,
                include_patterns: Union[List[str], str] = None,
                ignore_patterns: Union[List[str], str] = None) -> dict:
    """
    Parse the query and apply ignore patterns.

    The ignore list is built in this order: the default patterns, patterns
    read from ``<local_path>/.gitignore`` (if that file exists), then the
    user-supplied ignore patterns. Any pattern that also appears in
    ``include_patterns`` is then removed from the ignore list. The resolved
    patterns are reported through the module logger at DEBUG level rather
    than printed to stdout.

    Args:
        source (str): Source path or URL.
        max_file_size (int): Maximum file size.
        from_web (bool): Web source or local.
        include_patterns (Union[List[str], str]): Include patterns.
        ignore_patterns (Union[List[str], str]): Ignore patterns.

    Returns:
        dict: Query object with ``max_file_size``, ``ignore_patterns`` and
        ``include_patterns`` added to the parsed source details.
    """
    import logging

    logger = logging.getLogger(__name__)

    query = parse_url(source) if from_web else parse_path(source)
    query['max_file_size'] = max_file_size

    # Start with default ignore patterns
    final_ignore_patterns = DEFAULT_IGNORE_PATTERNS.copy()

    # Load from .gitignore; parse_gitignore returns [] when the file is absent.
    gitignore_path = os.path.join(query['local_path'], '.gitignore')
    gitignore_patterns = parse_gitignore(gitignore_path)
    if gitignore_patterns:
        final_ignore_patterns.extend(gitignore_patterns)
        logger.debug("Patterns from %s: %s", gitignore_path, gitignore_patterns)

    # Add user-defined ignore patterns
    if ignore_patterns:
        final_ignore_patterns.extend(parse_patterns(ignore_patterns))

    # Handle include patterns
    if include_patterns:
        include_patterns = parse_patterns(include_patterns)
        final_ignore_patterns = override_ignore_patterns(final_ignore_patterns, include_patterns)

    query['ignore_patterns'] = final_ignore_patterns
    query['include_patterns'] = include_patterns

    logger.debug("Applied ignore patterns: %s", final_ignore_patterns)
    logger.debug("Included patterns: %s", include_patterns or None)

    return query

0 commit comments

Comments
 (0)