@@ -20,15 +20,38 @@ def should_include(path: str, base_path: str, include_patterns: list[str]) -> bo
2020
2121
def should_exclude(path: str, base_path: str, ignore_patterns: list[str]) -> bool:
    """
    Check if a path should be excluded based on ignore patterns.

    The path is made relative to ``base_path`` and its separators are
    normalized to ``/`` before matching.  Two kinds of pattern are supported:

    * Directory patterns (ending in ``/``) exclude the named directory and
      everything nested below it.  They are compared literally.
    * All other patterns are matched against the relative path with
      ``fnmatch``.

    Empty patterns are ignored.

    Parameters
    ----------
    path : str
        Path of the file or directory being checked.
    base_path : str
        Root that ``path`` is made relative to.
    ignore_patterns : list[str]
        Patterns to test the relative path against.

    Returns
    -------
    bool
        True if any pattern matches, False otherwise.
    """
    # Strip base_path only when it is a leading prefix; str.replace would also
    # delete any later occurrence of the same text inside the path.
    rel_path = path[len(base_path):] if path.startswith(base_path) else path
    # Normalize separators first so the leading-separator strip works for
    # both "/" and "\\" style paths.
    rel_path = rel_path.replace("\\", "/").lstrip("/")

    for pattern in ignore_patterns:
        if pattern == "":
            continue

        if pattern.endswith("/"):
            # Directory pattern: match the directory itself or anything under it.
            # The trailing "/" in the prefix test keeps "build/" from matching
            # a sibling such as "builder/...".
            pattern_base = pattern.rstrip("/")
            if rel_path == pattern_base or rel_path.startswith(f"{pattern_base}/"):
                return True
        elif fnmatch(rel_path, pattern):
            # File pattern: standard shell-style wildcard match.
            return True

    return False
3054
31-
3255def is_safe_symlink (symlink_path : str , base_path : str ) -> bool :
3356 """Check if a symlink points to a location within the base directory."""
3457 try :
@@ -71,6 +94,29 @@ def scan_directory(
7194 if stats is None :
7295 stats = {"total_files" : 0 , "total_size" : 0 }
7396
97+ # Convert to absolute paths and normalize slashes, remove trailing slashes
98+ path = os .path .abspath (os .path .normpath (path .rstrip ('\\ /' )))
99+ base_path = os .path .abspath (os .path .normpath (query ["local_path" ].rstrip ('\\ /' )))
100+
101+ # Check if path exists and is a directory
102+ if not os .path .exists (path ):
103+ print (f"Path does not exist: { path } " )
104+ return None
105+
106+ if not os .path .isdir (path ):
107+ print (f"Path is not a directory: { path } " )
108+ return None
109+
110+ # Check if path is same as or subdirectory of base_path
111+ try :
112+ relative = os .path .relpath (path , base_path )
113+ if relative .startswith ('..' ):
114+ print (f"Skipping path outside target directory: { path } " )
115+ return None
116+ except ValueError :
117+ print (f"Skipping path outside target directory: { path } " )
118+ return None
119+
74120 if depth > MAX_DIRECTORY_DEPTH :
75121 print (f"Skipping deep directory: { path } (max depth { MAX_DIRECTORY_DEPTH } reached)" )
76122 return None
@@ -102,16 +148,17 @@ def scan_directory(
102148 }
103149
104150 ignore_patterns = query ["ignore_patterns" ]
105- base_path = query ["local_path" ]
106151 include_patterns = query ["include_patterns" ]
107152
108153 try :
109- for item in os .listdir (path ):
110- item_path = os .path .join (path , item )
111- print (f"Checking path: { path } " )
154+ items = sorted (os .listdir (path )) # Sort for consistent ordering
155+ for item in items :
156+ item_path = os .path .normpath (os .path .join (path , item ))
157+
158+ print (f"Checking path: { item_path } " ) # Show what we're actually checking
112159
113160 if should_exclude (item_path , base_path , ignore_patterns ):
114- print (f"Checking path: { path } " )
161+ print (f"Skipping excluded path: { item_path } " )
115162 continue
116163
117164 is_file = os .path .isfile (item_path )
@@ -120,22 +167,6 @@ def scan_directory(
120167 result ["ignore_content" ] = True
121168 continue
122169
123- # Handle symlinks
124- if os .path .islink (item_path ):
125- if not is_safe_symlink (item_path , base_path ):
126- print (f"Skipping symlink that points outside base directory: { item_path } " )
127- continue
128- real_path = os .path .realpath (item_path )
129- if real_path in seen_paths :
130- print (f"Skipping already visited symlink target: { item_path } " )
131- continue
132-
133- if os .path .isfile (real_path ):
134- file_size = os .path .getsize (real_path )
135- if stats ["total_size" ] + file_size > MAX_TOTAL_SIZE_BYTES :
136- print (f"Skipping file { item_path } : would exceed total size limit" )
137- continue
138-
139170 stats ["total_files" ] += 1
140171 stats ["total_size" ] += file_size
141172
@@ -396,11 +427,19 @@ def ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]:
396427
def ingest_from_query(query: dict[str, Any]) -> tuple[str, str, str]:
    """Main entry point for analyzing a codebase directory or single file.

    Builds the target path from ``query["local_path"]`` and
    ``query["subpath"]``, then dispatches to ``ingest_single_file`` when
    ``query["type"]`` is ``"blob"`` and to ``ingest_directory`` otherwise.

    Parameters
    ----------
    query : dict[str, Any]
        Must contain ``"local_path"`` and ``"subpath"``; ``"type"`` is optional.

    Returns
    -------
    tuple[str, str, str]
        Whatever the selected ingest function returns.

    Raises
    ------
    ValueError
        If the resolved path does not exist, or if it is not a directory and
        the query type is not ``"blob"``.
    """
    # Strip leading separators of either flavour: os.path.join discards every
    # earlier component when a later one looks absolute, which would let the
    # subpath escape local_path.
    subpath = query["subpath"].lstrip("\\/")

    # normpath already removes trailing separators; stripping them by hand
    # would turn a root local_path such as "/" into "" (the current directory).
    path = os.path.abspath(os.path.normpath(os.path.join(query["local_path"], subpath)))

    if not os.path.exists(path):
        raise ValueError(f"Path does not exist: {path}")

    if query.get("type") == "blob":
        return ingest_single_file(path, query)

    if not os.path.isdir(path):
        raise ValueError(f"Path is not a directory: {path}")

    return ingest_directory(path, query)
0 commit comments