Skip to content

Commit 45da897

Browse files
committed
fix: traverse directories to allow pattern matching of files within them
1 parent d36b3a0 commit 45da897

File tree

4 files changed

+221
-7
lines changed

4 files changed

+221
-7
lines changed

src/gitingest/cli.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,34 @@
1313

1414
@click.command()
1515
@click.argument("source", type=str, default=".")
16-
@click.option("--output", "-o", default=None, help="Output file path (default: <repo_name>.txt in current directory)")
17-
@click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes")
18-
@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude")
19-
@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include")
16+
@click.option(
17+
"--output",
18+
"-o",
19+
default=None,
20+
help="Output file path (default: <repo_name>.txt in current directory)",
21+
)
22+
@click.option(
23+
"--max-size",
24+
"-s",
25+
default=MAX_FILE_SIZE,
26+
help="Maximum file size to process in bytes",
27+
)
28+
@click.option(
29+
"--exclude-pattern",
30+
"-e",
31+
multiple=True,
32+
help="""Patterns to exclude. Handles python's arbitrary subset of Unix
33+
shell-style wildcards. See:
34+
https://docs.python.org/3/library/fnmatch.html""",
35+
)
36+
@click.option(
37+
"--include-pattern",
38+
"-i",
39+
multiple=True,
40+
help="""Patterns to include. Handles python's arbitrary subset of Unix
41+
shell-style wildcards. See:
42+
https://docs.python.org/3/library/fnmatch.html""",
43+
)
2044
@click.option("--branch", "-b", default=None, help="Branch to clone and ingest")
2145
def main(
2246
source: str,
@@ -27,10 +51,11 @@ def main(
2751
branch: Optional[str],
2852
):
2953
"""
30-
Main entry point for the CLI. This function is called when the CLI is run as a script.
54+
Main entry point for the CLI. This function is called when the CLI is run as a script.
3155
3256
It calls the async main function to run the command.
3357
58+
\b
3459
Parameters
3560
----------
3661
source : str

src/gitingest/ingestion.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,10 @@ def _process_node(
202202
query=query,
203203
stats=stats,
204204
)
205+
206+
if not child_directory_node.children:
207+
continue
208+
205209
node.children.append(child_directory_node)
206210
node.size += child_directory_node.size
207211
node.file_count += child_directory_node.file_count

src/gitingest/utils/ingestion_utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,10 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) ->
3333
return False
3434

3535
rel_str = str(rel_path)
36+
37+
# if path is a directory, include it by default
3638
if path.is_dir():
37-
rel_str += "/"
39+
return True
3840

3941
for pattern in include_patterns:
4042
if fnmatch(rel_str, pattern):

tests/test_ingestion.py

Lines changed: 184 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
"""
77

88
from pathlib import Path
9+
from typing import TypedDict
10+
11+
import pytest
912

1013
from gitingest.ingestion import ingest_query
1114
from gitingest.query_parsing import IngestionQuery
@@ -42,5 +45,185 @@ def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) ->
4245
# TODO: Additional tests:
4346
# - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"].
4447
# - Edge cases with weird file names or deep subdirectory structures.
45-
# TODO : def test_include_txt_pattern
4648
# TODO : def test_include_nonexistent_extension
49+
50+
51+
class PatternScenario(TypedDict):
52+
include_patterns: set[str]
53+
ignore_patterns: set[str]
54+
expected_num_files: int
55+
expected_content: set[str]
56+
expected_structure: set[str]
57+
expected_not_structure: set[str]
58+
59+
60+
@pytest.mark.parametrize(
61+
"pattern_scenario",
62+
[
63+
pytest.param(
64+
PatternScenario(
65+
{
66+
"include_patterns": {"file2.py", "dir2/file_dir2.txt"},
67+
"ignore_patterns": {*()},
68+
"expected_num_files": 2,
69+
"expected_content": {"file2.py", "dir2/file_dir2.txt"},
70+
"expected_structure": {"test_repo/", "dir2/"},
71+
"expected_not_structure": {"src/", "subdir/", "dir1/"},
72+
}
73+
),
74+
id="include-explicit-files",
75+
),
76+
pytest.param(
77+
PatternScenario(
78+
{
79+
"include_patterns": {
80+
"file1.txt",
81+
"file2.py",
82+
"file_dir1.txt",
83+
"*/file_dir2.txt",
84+
},
85+
"ignore_patterns": {*()},
86+
"expected_num_files": 3,
87+
"expected_content": {"file1.txt", "file2.py", "dir2/file_dir2.txt"},
88+
"expected_structure": {"test_repo/", "dir2/"},
89+
"expected_not_structure": {"src/", "subdir/", "dir1/"},
90+
}
91+
),
92+
id="include-wildcard-directory",
93+
),
94+
pytest.param(
95+
PatternScenario(
96+
{
97+
"include_patterns": {"*.py"},
98+
"ignore_patterns": {*()},
99+
"expected_num_files": 3,
100+
"expected_content": {
101+
"file2.py",
102+
"src/subfile2.py",
103+
"src/subdir/file_subdir.py",
104+
},
105+
"expected_structure": {"test_repo/", "src/", "subdir/"},
106+
"expected_not_structure": {"dir1/", "dir2/"},
107+
}
108+
),
109+
id="include-wildcard-files",
110+
),
111+
pytest.param(
112+
PatternScenario(
113+
{
114+
"include_patterns": {"**/file_dir2.txt", "src/**/*.py"},
115+
"ignore_patterns": {*()},
116+
"expected_num_files": 2,
117+
"expected_content": {
118+
"dir2/file_dir2.txt",
119+
"src/subdir/file_subdir.py",
120+
},
121+
"expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"},
122+
"expected_not_structure": {"dir1/"},
123+
}
124+
),
125+
id="include-recursive-wildcard",
126+
),
127+
pytest.param(
128+
PatternScenario(
129+
{
130+
"include_patterns": {*()},
131+
"ignore_patterns": {"file2.py", "dir2/file_dir2.txt"},
132+
"expected_num_files": 6,
133+
"expected_content": {
134+
"file1.txt",
135+
"src/subfile1.txt",
136+
"src/subfile2.py",
137+
"src/subdir/file_subdir.txt",
138+
"src/subdir/file_subdir.py",
139+
"dir1/file_dir1.txt",
140+
},
141+
"expected_structure": {"test_repo/", "src/", "subdir/", "dir1/"},
142+
"expected_not_structure": {"dir2/"},
143+
}
144+
),
145+
id="exclude-explicit-files",
146+
),
147+
pytest.param(
148+
PatternScenario(
149+
{
150+
"include_patterns": {*()},
151+
"ignore_patterns": {"file1.txt", "file2.py", "*/file_dir1.txt"},
152+
"expected_num_files": 5,
153+
"expected_content": {
154+
"src/subfile1.txt",
155+
"src/subfile2.py",
156+
"src/subdir/file_subdir.txt",
157+
"src/subdir/file_subdir.py",
158+
"dir2/file_dir2.txt",
159+
},
160+
"expected_structure": {"test_repo/", "src/", "subdir/", "dir2/"},
161+
"expected_not_structure": {"dir1/"},
162+
}
163+
),
164+
id="exclude-wildcard-directory",
165+
),
166+
pytest.param(
167+
PatternScenario(
168+
{
169+
"include_patterns": {*()},
170+
"ignore_patterns": {"src/**/*.py"},
171+
"expected_num_files": 7,
172+
"expected_content": {
173+
"file1.txt",
174+
"file2.py",
175+
"src/subfile1.txt",
176+
"src/subfile2.py",
177+
"src/subdir/file_subdir.txt",
178+
"dir1/file_dir1.txt",
179+
"dir2/file_dir2.txt",
180+
},
181+
"expected_structure": {
182+
"test_repo/",
183+
"dir1/",
184+
"dir2/",
185+
"src/",
186+
"subdir/",
187+
},
188+
"expected_not_structure": {*()},
189+
}
190+
),
191+
id="exclude-recursive-wildcard",
192+
),
193+
],
194+
)
195+
def test_include_ignore_patterns(
196+
temp_directory: Path,
197+
sample_query: IngestionQuery,
198+
pattern_scenario: PatternScenario,
199+
) -> None:
200+
"""
201+
Test `ingest_query` to ensure included and ignored paths are included and ignored respectively.
202+
203+
Given a directory with .txt and .py files, and a set of include patterns or a set of ignore patterns:
204+
When `ingest_query` is invoked,
205+
Then it should produce a summary string listing the files analyzed and a combined content string.
206+
"""
207+
208+
sample_query.local_path = temp_directory
209+
sample_query.subpath = "/"
210+
sample_query.type = None
211+
sample_query.include_patterns = pattern_scenario["include_patterns"] or None
212+
sample_query.ignore_patterns = pattern_scenario["ignore_patterns"] or None
213+
214+
summary, structure, content = ingest_query(sample_query)
215+
216+
assert "Repository: test_user/test_repo" in summary
217+
assert f"Files analyzed: {pattern_scenario["expected_num_files"]}" in summary
218+
219+
# Check presence of key files in the content
220+
for expected_content_item in pattern_scenario["expected_content"]:
221+
assert expected_content_item in content
222+
223+
# check presence of included directories in structure
224+
for expected_structure_item in pattern_scenario["expected_structure"]:
225+
assert expected_structure_item in structure
226+
227+
# check non-presence of non-included directories in structure
228+
for expected_not_structure_item in pattern_scenario["expected_not_structure"]:
229+
assert expected_not_structure_item not in structure

0 commit comments

Comments
 (0)