diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index b691fd7f..c7f07d9b 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -13,10 +13,34 @@ @click.command() @click.argument("source", type=str, default=".") -@click.option("--output", "-o", default=None, help="Output file path (default: .txt in current directory)") -@click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes") -@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude") -@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include") +@click.option( + "--output", + "-o", + default=None, + help="Output file path (default: .txt in current directory)", +) +@click.option( + "--max-size", + "-s", + default=MAX_FILE_SIZE, + help="Maximum file size to process in bytes", +) +@click.option( + "--exclude-pattern", + "-e", + multiple=True, + help="""Patterns to exclude. Handles python's arbitrary subset of Unix + shell-style wildcards. See: + https://docs.python.org/3/library/fnmatch.html""", +) +@click.option( + "--include-pattern", + "-i", + multiple=True, + help="""Patterns to include. Handles python's arbitrary subset of Unix + shell-style wildcards. See: + https://docs.python.org/3/library/fnmatch.html""", +) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") def main( source: str, @@ -27,7 +51,7 @@ def main( branch: Optional[str], ): """ - Main entry point for the CLI. This function is called when the CLI is run as a script. + Main entry point for the CLI. This function is called when the CLI is run as a script. It calls the async main function to run the command. 
diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index d3005250..ec378978 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -202,6 +202,10 @@ def _process_node( query=query, stats=stats, ) + + if not child_directory_node.children: + continue + node.children.append(child_directory_node) node.size += child_directory_node.size node.file_count += child_directory_node.file_count diff --git a/src/gitingest/utils/ingestion_utils.py b/src/gitingest/utils/ingestion_utils.py index b4bb552c..9ce2ae72 100644 --- a/src/gitingest/utils/ingestion_utils.py +++ b/src/gitingest/utils/ingestion_utils.py @@ -33,8 +33,10 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> return False rel_str = str(rel_path) + + # if path is a directory, include it by default if path.is_dir(): - rel_str += "/" + return True for pattern in include_patterns: if fnmatch(rel_str, pattern): diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index 3e991f8f..3d829b4a 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -5,7 +5,11 @@ including filtering patterns and subpaths. """ +import re from pathlib import Path +from typing import Set, TypedDict + +import pytest from gitingest.ingestion import ingest_query from gitingest.query_parsing import IngestionQuery @@ -42,5 +46,187 @@ def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) -> # TODO: Additional tests: # - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"]. # - Edge cases with weird file names or deep subdirectory structures. 
# TODO : def test_include_nonexistent_extension


class PatternScenario(TypedDict):
    """Inputs and expectations for one case of `test_include_ignore_patterns`."""

    # Assigned to the query's `include_patterns`; an empty set is passed on as None.
    include_patterns: Set[str]
    # Assigned to the query's `ignore_patterns`; an empty set is passed on as None.
    ignore_patterns: Set[str]
    # Number that must follow "Files analyzed: " in the summary.
    expected_num_files: int
    # Substrings that must each occur in the returned content string.
    expected_content: Set[str]
    # Substrings that must each occur in the returned structure string.
    expected_structure: Set[str]
    # Substrings that must NOT occur in the returned structure string.
    expected_not_structure: Set[str]


@pytest.mark.parametrize(
    "pattern_scenario",
    [
        pytest.param(
            PatternScenario(
                include_patterns={"file2.py", "dir2/file_dir2.txt"},
                ignore_patterns=set(),
                expected_num_files=2,
                expected_content={"file2.py", "dir2/file_dir2.txt"},
                expected_structure={"test_repo/", "dir2/"},
                expected_not_structure={"src/", "subdir/", "dir1/"},
            ),
            id="include-explicit-files",
        ),
        pytest.param(
            PatternScenario(
                include_patterns={
                    "file1.txt",
                    "file2.py",
                    "file_dir1.txt",
                    "*/file_dir2.txt",
                },
                ignore_patterns=set(),
                expected_num_files=3,
                expected_content={"file1.txt", "file2.py", "dir2/file_dir2.txt"},
                expected_structure={"test_repo/", "dir2/"},
                expected_not_structure={"src/", "subdir/", "dir1/"},
            ),
            id="include-wildcard-directory",
        ),
        pytest.param(
            PatternScenario(
                include_patterns={"*.py"},
                ignore_patterns=set(),
                expected_num_files=3,
                expected_content={
                    "file2.py",
                    "src/subfile2.py",
                    "src/subdir/file_subdir.py",
                },
                expected_structure={"test_repo/", "src/", "subdir/"},
                expected_not_structure={"dir1/", "dir2/"},
            ),
            id="include-wildcard-files",
        ),
        pytest.param(
            PatternScenario(
                include_patterns={"**/file_dir2.txt", "src/**/*.py"},
                ignore_patterns=set(),
                expected_num_files=2,
                expected_content={
                    "dir2/file_dir2.txt",
                    "src/subdir/file_subdir.py",
                },
                expected_structure={"test_repo/", "dir2/", "src/", "subdir/"},
                expected_not_structure={"dir1/"},
            ),
            id="include-recursive-wildcard",
        ),
        pytest.param(
            PatternScenario(
                include_patterns=set(),
                ignore_patterns={"file2.py", "dir2/file_dir2.txt"},
                expected_num_files=6,
                expected_content={
                    "file1.txt",
                    "src/subfile1.txt",
                    "src/subfile2.py",
                    "src/subdir/file_subdir.txt",
                    "src/subdir/file_subdir.py",
                    "dir1/file_dir1.txt",
                },
                expected_structure={"test_repo/", "src/", "subdir/", "dir1/"},
                expected_not_structure={"dir2/"},
            ),
            id="exclude-explicit-files",
        ),
        pytest.param(
            PatternScenario(
                include_patterns=set(),
                ignore_patterns={"file1.txt", "file2.py", "*/file_dir1.txt"},
                expected_num_files=5,
                expected_content={
                    "src/subfile1.txt",
                    "src/subfile2.py",
                    "src/subdir/file_subdir.txt",
                    "src/subdir/file_subdir.py",
                    "dir2/file_dir2.txt",
                },
                expected_structure={"test_repo/", "src/", "subdir/", "dir2/"},
                expected_not_structure={"dir1/"},
            ),
            id="exclude-wildcard-directory",
        ),
        pytest.param(
            PatternScenario(
                include_patterns=set(),
                ignore_patterns={"src/**/*.py"},
                expected_num_files=7,
                expected_content={
                    "file1.txt",
                    "file2.py",
                    "src/subfile1.txt",
                    "src/subfile2.py",
                    "src/subdir/file_subdir.txt",
                    "dir1/file_dir1.txt",
                    "dir2/file_dir2.txt",
                },
                expected_structure={
                    "test_repo/",
                    "dir1/",
                    "dir2/",
                    "src/",
                    "subdir/",
                },
                expected_not_structure=set(),
            ),
            id="exclude-recursive-wildcard",
        ),
    ],
)
def test_include_ignore_patterns(
    temp_directory: Path,
    sample_query: IngestionQuery,
    pattern_scenario: PatternScenario,
) -> None:
    """
    Test `ingest_query` with include patterns or ignore patterns applied.

    Given the `temp_directory` fixture and a scenario carrying include or ignore patterns:
    When `ingest_query` is invoked with those patterns set on the query,
    Then the summary should report the expected file count, the content should mention every
    expected file, and the structure should list the expected directories and none of the
    unexpected ones.
    """

    # An empty pattern set means "no filter", which the query represents as None.
    include = pattern_scenario["include_patterns"] or None
    ignore = pattern_scenario["ignore_patterns"] or None

    sample_query.local_path = temp_directory
    sample_query.subpath = "/"
    sample_query.type = None
    sample_query.include_patterns = include
    sample_query.ignore_patterns = ignore

    summary, structure, content = ingest_query(sample_query)

    assert "Repository: test_user/test_repo" in summary

    # Pull the reported file count out of the "Files analyzed: N" summary line.
    count_match = re.search(r"^Files analyzed: (\d+)$", summary, re.MULTILINE)
    assert count_match is not None
    assert int(count_match.group(1)) == pattern_scenario["expected_num_files"]

    # Every expected file must be mentioned in the content.
    for wanted_file in pattern_scenario["expected_content"]:
        assert wanted_file in content

    # Every expected directory must be listed in the structure.
    for wanted_dir in pattern_scenario["expected_structure"]:
        assert wanted_dir in structure

    # Directories left without any included file must be absent from the structure.
    for unwanted_dir in pattern_scenario["expected_not_structure"]:
        assert unwanted_dir not in structure