Skip to content

Commit 814a15d

Browse files
author
Dai Hung PHAM
committed
feat: Enhance URL parsing and repository ingestion robustness
1 parent 2477e70 commit 814a15d

File tree

5 files changed

+252
-187
lines changed

5 files changed

+252
-187
lines changed

src/gitingest/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from gitingest.clone import clone_repo
1+
from gitingest.clone import clone_repo, CloneConfig
22
from gitingest.ingest import ingest
33
from gitingest.ingest_from_query import ingest_from_query
44
from gitingest.parse_query import parse_query

src/gitingest/cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import pathlib
23

34
import click
45
import sys

src/gitingest/ingest.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
from pathlib import Path
55
import io
66
import sys
7+
from typing import Union
78

89
# Import other modules from the package
910
from gitingest.parse_query import parse_query
10-
from gitingest.clone import clone_repo
11+
from gitingest.clone import clone_repo, CloneConfig
1112
from gitingest.ingest_from_query import ingest_from_query
1213

1314
def setup_encoding():
@@ -17,8 +18,8 @@ def setup_encoding():
1718
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
1819

1920
def ingest(source: str, max_file_size: int = 10 * 1024 * 1024,
20-
include_patterns: Union[List[str], str] = None,
21-
exclude_patterns: Union[List[str], str] = None,
21+
include_patterns: Union[list[str], str] = None,
22+
exclude_patterns: Union[list[str], str] = None,
2223
output: str = None) -> tuple[str, str, str]:
2324
"""
2425
Analyze and create a text dump of source contents.

src/gitingest/ingest_from_query.py

Lines changed: 68 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,38 @@ def should_include(path: str, base_path: str, include_patterns: list[str]) -> bo
2020

2121

2222
def should_exclude(path: str, base_path: str, ignore_patterns: list[str]) -> bool:
    """
    Check if a path should be excluded based on ignore patterns.

    The path is first made relative to ``base_path`` and normalized to
    forward slashes. Each non-empty pattern is then applied as follows:

    * A pattern ending in ``/`` is a directory pattern: the path is excluded
      when the directory itself, or any ancestor directory of the path,
      matches the pattern (literally or as an ``fnmatch`` glob), so the whole
      subtree is skipped.
    * Any other pattern is matched against the full relative path with
      ``fnmatch``.

    Args:
        path: Path of the item being checked.
        base_path: Root directory the patterns are relative to.
        ignore_patterns: Glob patterns; empty strings are ignored.

    Returns:
        True if the path matches at least one pattern, False otherwise.
    """
    # Strip base_path only as a leading prefix. str.replace() would also
    # delete later occurrences of the same text deeper inside the path
    # (e.g. "/a/b/a/c" relative to "/a" must stay "b/a/c", not "b/c").
    if base_path and path.startswith(base_path):
        rel_path = path[len(base_path):]
    else:
        rel_path = path

    # Normalize separators first, then drop the leading one, so the result
    # is the same whether the input used "/" or "\\".
    rel_path = rel_path.replace('\\', '/').lstrip('/')

    # Every leading directory prefix of the path, ending with the path
    # itself: "a/b/c" -> ["a", "a/b", "a/b/c"]. Computed once for all patterns.
    parts = rel_path.split('/')
    ancestors = ['/'.join(parts[:end]) for end in range(1, len(parts) + 1)]

    for pattern in ignore_patterns:
        if pattern == "":
            continue

        if pattern.endswith('/'):
            # Directory pattern: exclude the directory and everything below it.
            pattern_base = pattern.rstrip('/')
            for candidate in ancestors:
                # The literal comparison keeps names containing glob
                # metacharacters (e.g. "dir[1]/") matching themselves.
                if candidate == pattern_base or fnmatch(candidate, pattern_base):
                    return True
        elif fnmatch(rel_path, pattern):
            # File pattern: standard fnmatch against the relative path.
            return True

    return False
3054

31-
3255
def is_safe_symlink(symlink_path: str, base_path: str) -> bool:
3356
"""Check if a symlink points to a location within the base directory."""
3457
try:
@@ -71,6 +94,29 @@ def scan_directory(
7194
if stats is None:
7295
stats = {"total_files": 0, "total_size": 0}
7396

97+
# Convert to absolute paths and normalize slashes, remove trailing slashes
98+
path = os.path.abspath(os.path.normpath(path.rstrip('\\/')))
99+
base_path = os.path.abspath(os.path.normpath(query["local_path"].rstrip('\\/')))
100+
101+
# Check if path exists and is a directory
102+
if not os.path.exists(path):
103+
print(f"Path does not exist: {path}")
104+
return None
105+
106+
if not os.path.isdir(path):
107+
print(f"Path is not a directory: {path}")
108+
return None
109+
110+
# Check if path is same as or subdirectory of base_path
111+
try:
112+
relative = os.path.relpath(path, base_path)
113+
if relative.startswith('..'):
114+
print(f"Skipping path outside target directory: {path}")
115+
return None
116+
except ValueError:
117+
print(f"Skipping path outside target directory: {path}")
118+
return None
119+
74120
if depth > MAX_DIRECTORY_DEPTH:
75121
print(f"Skipping deep directory: {path} (max depth {MAX_DIRECTORY_DEPTH} reached)")
76122
return None
@@ -102,16 +148,17 @@ def scan_directory(
102148
}
103149

104150
ignore_patterns = query["ignore_patterns"]
105-
base_path = query["local_path"]
106151
include_patterns = query["include_patterns"]
107152

108153
try:
109-
for item in os.listdir(path):
110-
item_path = os.path.join(path, item)
111-
print(f"Checking path: {path}")
154+
items = sorted(os.listdir(path)) # Sort for consistent ordering
155+
for item in items:
156+
item_path = os.path.normpath(os.path.join(path, item))
157+
158+
print(f"Checking path: {item_path}") # Show what we're actually checking
112159

113160
if should_exclude(item_path, base_path, ignore_patterns):
114-
print(f"Checking path: {path}")
161+
print(f"Skipping excluded path: {item_path}")
115162
continue
116163

117164
is_file = os.path.isfile(item_path)
@@ -120,22 +167,6 @@ def scan_directory(
120167
result["ignore_content"] = True
121168
continue
122169

123-
# Handle symlinks
124-
if os.path.islink(item_path):
125-
if not is_safe_symlink(item_path, base_path):
126-
print(f"Skipping symlink that points outside base directory: {item_path}")
127-
continue
128-
real_path = os.path.realpath(item_path)
129-
if real_path in seen_paths:
130-
print(f"Skipping already visited symlink target: {item_path}")
131-
continue
132-
133-
if os.path.isfile(real_path):
134-
file_size = os.path.getsize(real_path)
135-
if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES:
136-
print(f"Skipping file {item_path}: would exceed total size limit")
137-
continue
138-
139170
stats["total_files"] += 1
140171
stats["total_size"] += file_size
141172

@@ -396,11 +427,19 @@ def ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]:
396427

397428
def ingest_from_query(query: dict[str, Any]) -> tuple[str, str, str]:
398429
"""Main entry point for analyzing a codebase directory or single file."""
399-
path = os.path.join(query["local_path"], query["subpath"].lstrip(os.sep))
400-
if not os.path.exists(path) and not os.path.exists(os.path.dirname(path)):
401-
raise ValueError(f"{query['subpath']} cannot be found")
402-
430+
# Normalize the path properly, remove trailing slashes
431+
path = os.path.abspath(os.path.normpath(
432+
os.path.join(query["local_path"].rstrip('\\/'),
433+
query["subpath"].lstrip(os.sep).rstrip('\\/'))
434+
))
435+
436+
if not os.path.exists(path):
437+
raise ValueError(f"Path does not exist: {path}")
438+
403439
if query.get("type") == "blob":
404440
return ingest_single_file(path, query)
405441

406-
return ingest_directory(path, query)
442+
if not os.path.isdir(path):
443+
raise ValueError(f"Path is not a directory: {path}")
444+
445+
return ingest_directory(path, query)

0 commit comments

Comments
 (0)