From 45da897821edddf11ce89ffd7f007a3c664f0143 Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 17 Apr 2025 19:23:06 -0600 Subject: [PATCH 1/5] fix: traverse directories to allow pattern matching of files within them --- src/gitingest/cli.py | 35 ++++- src/gitingest/ingestion.py | 4 + src/gitingest/utils/ingestion_utils.py | 4 +- tests/test_ingestion.py | 185 ++++++++++++++++++++++++- 4 files changed, 221 insertions(+), 7 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index b691fd7f..78ddf4f1 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -13,10 +13,34 @@ @click.command() @click.argument("source", type=str, default=".") -@click.option("--output", "-o", default=None, help="Output file path (default: .txt in current directory)") -@click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes") -@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude") -@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include") +@click.option( + "--output", + "-o", + default=None, + help="Output file path (default: .txt in current directory)", +) +@click.option( + "--max-size", + "-s", + default=MAX_FILE_SIZE, + help="Maximum file size to process in bytes", +) +@click.option( + "--exclude-pattern", + "-e", + multiple=True, + help="""Patterns to exclude. Handles python's arbitrary subset of Unix + shell-style wildcards. See: + https://docs.python.org/3/library/fnmatch.html""", +) +@click.option( + "--include-pattern", + "-i", + multiple=True, + help="""Patterns to include. Handles python's arbitrary subset of Unit + shell-style wildcards. See: + https://docs.python.org/3/library/fnmatch.html""", +) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") def main( source: str, @@ -27,10 +51,11 @@ def main( branch: Optional[str], ): """ - Main entry point for the CLI. This function is called when the CLI is run as a script. + Main entry point for the CLI. This function is called when the CLI is run as a script. It calls the async main function to run the command. + \b Parameters ---------- source : str diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index d3005250..ec378978 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -202,6 +202,10 @@ def _process_node( query=query, stats=stats, ) + + if not child_directory_node.children: + continue + node.children.append(child_directory_node) node.size += child_directory_node.size node.file_count += child_directory_node.file_count diff --git a/src/gitingest/utils/ingestion_utils.py b/src/gitingest/utils/ingestion_utils.py index b4bb552c..9ce2ae72 100644 --- a/src/gitingest/utils/ingestion_utils.py +++ b/src/gitingest/utils/ingestion_utils.py @@ -33,8 +33,10 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> return False rel_str = str(rel_path) + + # if path is a directory, include it by default if path.is_dir(): - rel_str += "/" + return True for pattern in include_patterns: if fnmatch(rel_str, pattern): diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index 3e991f8f..1ddac966 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -6,6 +6,9 @@ """ from pathlib import Path +from typing import TypedDict + +import pytest from gitingest.ingestion import ingest_query from gitingest.query_parsing import IngestionQuery @@ -42,5 +45,185 @@ def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) -> # TODO: Additional tests: # - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"]. # - Edge cases with weird file names or deep subdirectory structures. -# TODO : def test_include_txt_pattern # TODO : def test_include_nonexistent_extension + + +class PatternScenario(TypedDict): + include_patterns: set[str] + ignore_patterns: set[str] + expected_num_files: int + expected_content: set[str] + expected_structure: set[str] + expected_not_structure: set[str] + + +@pytest.mark.parametrize( + "pattern_scenario", + [ + pytest.param( + PatternScenario( + { + "include_patterns": {"file2.py", "dir2/file_dir2.txt"}, + "ignore_patterns": {*()}, + "expected_num_files": 2, + "expected_content": {"file2.py", "dir2/file_dir2.txt"}, + "expected_structure": {"test_repo/", "dir2/"}, + "expected_not_structure": {"src/", "subdir/", "dir1/"}, + } + ), + id="include-explicit-files", + ), + pytest.param( + PatternScenario( + { + "include_patterns": { + "file1.txt", + "file2.py", + "file_dir1.txt", + "*/file_dir2.txt", + }, + "ignore_patterns": {*()}, + "expected_num_files": 3, + "expected_content": {"file1.txt", "file2.py", "dir2/file_dir2.txt"}, + "expected_structure": {"test_repo/", "dir2/"}, + "expected_not_structure": {"src/", "subdir/", "dir1/"}, + } + ), + id="include-wildcard-directory", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {"*.py"}, + "ignore_patterns": {*()}, + "expected_num_files": 3, + "expected_content": { + "file2.py", + "src/subfile2.py", + "src/subdir/file_subdir.py", + }, + "expected_structure": {"test_repo/", "src/", "subdir/"}, + "expected_not_structure": {"dir1/", "dir2/"}, + } + ), + id="include-wildcard-files", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {"**/file_dir2.txt", "src/**/*.py"}, + "ignore_patterns": {*()}, + "expected_num_files": 2, + "expected_content": { + "dir2/file_dir2.txt", + "src/subdir/file_subdir.py", + }, + "expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"}, + "expected_not_structure": {"dir1/"}, + } + ), + id="include-recursive-wildcard", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {*()}, + "ignore_patterns": {"file2.py", "dir2/file_dir2.txt"}, + "expected_num_files": 6, + "expected_content": { + "file1.txt", + "src/subfile1.txt", + "src/subfile2.py", + "src/subdir/file_subdir.txt", + "src/subdir/file_subdir.py", + "dir1/file_dir1.txt", + }, + "expected_structure": {"test_repo/", "src/", "subdir/", "dir1/"}, + "expected_not_structure": {"dir2/"}, + } + ), + id="exclude-explicit-files", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {*()}, + "ignore_patterns": {"file1.txt", "file2.py", "*/file_dir1.txt"}, + "expected_num_files": 5, + "expected_content": { + "src/subfile1.txt", + "src/subfile2.py", + "src/subdir/file_subdir.txt", + "src/subdir/file_subdir.py", + "dir2/file_dir2.txt", + }, + "expected_structure": {"test_repo/", "src/", "subdir/", "dir2/"}, + "expected_not_structure": {"dir1/"}, + } + ), + id="exclude-wildcard-directory", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {*()}, + "ignore_patterns": {"src/**/*.py"}, + "expected_num_files": 7, + "expected_content": { + "file1.txt", + "file2.py", + "src/subfile1.txt", + "src/subfile2.py", + "src/subdir/file_subdir.txt", + "dir1/file_dir1.txt", + "dir2/file_dir2.txt", + }, + "expected_structure": { + "test_repo/", + "dir1/", + "dir2/", + "src/", + "subdir/", + }, + "expected_not_structure": {*()}, + } + ), + id="exclude-recursive-wildcard", + ), + ], +) +def test_include_ignore_patterns( + temp_directory: Path, + sample_query: IngestionQuery, + pattern_scenario: PatternScenario, +) -> None: + """ + Test `ingest_query` to ensure included and ignored paths are included and ignored respectively. + + Given a directory with .txt and .py files, and a set of include patterns or a set of ignore patterns: + When `ingest_query` is invoked, + Then it should produce a summary string listing the files analyzed and a combined content string. + """ + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.include_patterns = pattern_scenario["include_patterns"] or None + sample_query.ignore_patterns = pattern_scenario["ignore_patterns"] or None + + summary, structure, content = ingest_query(sample_query) + + assert "Repository: test_user/test_repo" in summary + assert f"Files analyzed: {pattern_scenario["expected_num_files"]}" in summary + + # Check presence of key files in the content + for expected_content_item in pattern_scenario["expected_content"]: + assert expected_content_item in content + + # check presence of included directories in structure + for expected_structure_item in pattern_scenario["expected_structure"]: + assert expected_structure_item in structure + + # check non-presence of non-included directories in structure + for expected_not_structure_item in pattern_scenario["expected_not_structure"]: + assert expected_not_structure_item not in structure From 68e1375ed5b37e8481f2821fe92097d8e2956662 Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 17 Apr 2025 23:24:06 -0600 Subject: [PATCH 2/5] typo --- src/gitingest/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 78ddf4f1..d7c93839 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -37,7 +37,7 @@ "--include-pattern", "-i", multiple=True, - help="""Patterns to include. Handles python's arbitrary subset of Unit + help="""Patterns to include. Handles python's arbitrary subset of Unix shell-style wildcards. See: https://docs.python.org/3/library/fnmatch.html""", ) From 051620d1c6a836d402073b42969e081e3b16a817 Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 17 Apr 2025 23:38:29 -0600 Subject: [PATCH 3/5] use regex instead of formatted strings --- tests/test_ingestion.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index 1ddac966..6ef8ce45 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -5,6 +5,7 @@ including filtering patterns and subpaths. """ +import re from pathlib import Path from typing import TypedDict @@ -214,7 +215,9 @@ def test_include_ignore_patterns( summary, structure, content = ingest_query(sample_query) assert "Repository: test_user/test_repo" in summary - assert f"Files analyzed: {pattern_scenario["expected_num_files"]}" in summary + num_files_regex = re.compile(r"^Files analyzed: (\d+)$", re.MULTILINE) + assert (num_files_match := num_files_regex.search(summary)) is not None + assert int(num_files_match.group(1)) == pattern_scenario["expected_num_files"] # Check presence of key files in the content for expected_content_item in pattern_scenario["expected_content"]: From 80a48dd02f69f515c1f2ca3ff28df64f8f83bb2c Mon Sep 17 00:00:00 2001 From: Aaron Date: Fri, 18 Apr 2025 09:40:27 -0600 Subject: [PATCH 4/5] fix new style typing syntax --- tests/test_ingestion.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index 6ef8ce45..3d829b4a 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -7,7 +7,7 @@ import re from pathlib import Path -from typing import TypedDict +from typing import Set, TypedDict import pytest @@ -50,12 +50,12 @@ def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) -> class PatternScenario(TypedDict): - include_patterns: set[str] - ignore_patterns: set[str] + include_patterns: Set[str] + ignore_patterns: Set[str] expected_num_files: int - expected_content: set[str] - expected_structure: set[str] - expected_not_structure: set[str] + expected_content: Set[str] + expected_structure: Set[str] + expected_not_structure: Set[str] @pytest.mark.parametrize( From 1a8650cdee092b12a702cd90d366cd8465b07535 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Fri, 13 Jun 2025 17:21:38 +0200 Subject: [PATCH 5/5] Update src/gitingest/cli.py --- src/gitingest/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index d7c93839..c7f07d9b 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -55,7 +55,6 @@ def main( It calls the async main function to run the command. - \b Parameters ---------- source : str