Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 29 additions & 5 deletions src/gitingest/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,34 @@

@click.command()
@click.argument("source", type=str, default=".")
@click.option("--output", "-o", default=None, help="Output file path (default: <repo_name>.txt in current directory)")
@click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes")
@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude")
@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include")
@click.option(
"--output",
"-o",
default=None,
help="Output file path (default: <repo_name>.txt in current directory)",
)
@click.option(
"--max-size",
"-s",
default=MAX_FILE_SIZE,
help="Maximum file size to process in bytes",
)
@click.option(
"--exclude-pattern",
"-e",
multiple=True,
help="""Patterns to exclude. Handles python's arbitrary subset of Unix
shell-style wildcards. See:
https://docs.python.org/3/library/fnmatch.html""",
)
@click.option(
"--include-pattern",
"-i",
multiple=True,
help="""Patterns to include. Handles python's arbitrary subset of Unix
shell-style wildcards. See:
https://docs.python.org/3/library/fnmatch.html""",
)
@click.option("--branch", "-b", default=None, help="Branch to clone and ingest")
def main(
source: str,
Expand All @@ -27,7 +51,7 @@ def main(
branch: Optional[str],
):
"""
Main entry point for the CLI. This function is called when the CLI is run as a script.
Main entry point for the CLI. This function is called when the CLI is run as a script.

It calls the async main function to run the command.

Expand Down
4 changes: 4 additions & 0 deletions src/gitingest/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,10 @@ def _process_node(
query=query,
stats=stats,
)

if not child_directory_node.children:
continue

node.children.append(child_directory_node)
node.size += child_directory_node.size
node.file_count += child_directory_node.file_count
Expand Down
4 changes: 3 additions & 1 deletion src/gitingest/utils/ingestion_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,10 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) ->
return False

rel_str = str(rel_path)

# if path is a directory, include it by default
if path.is_dir():
rel_str += "/"
return True

for pattern in include_patterns:
if fnmatch(rel_str, pattern):
Expand Down
188 changes: 187 additions & 1 deletion tests/test_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
including filtering patterns and subpaths.
"""

import re
from pathlib import Path
from typing import Set, TypedDict

import pytest

from gitingest.ingestion import ingest_query
from gitingest.query_parsing import IngestionQuery
Expand Down Expand Up @@ -42,5 +46,187 @@ def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) ->
# TODO: Additional tests:
# - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"].
# - Edge cases with weird file names or deep subdirectory structures.
# TODO: def test_include_txt_pattern
# TODO: def test_include_nonexistent_extension


class PatternScenario(TypedDict):
    """
    One parametrized include/ignore scenario and the outcome it should produce.

    All keys are required. The two ``*_patterns`` sets are the inputs to a
    scenario; the ``expected_*`` entries are the values it is checked against.
    """

    # Inputs: glob-style patterns to include / to ignore (either may be empty).
    include_patterns: Set[str]
    ignore_patterns: Set[str]

    # Expected number of files reported for the scenario.
    expected_num_files: int

    # Strings that must be present in the generated content.
    expected_content: Set[str]

    # Strings that must / must not be present in the generated structure.
    expected_structure: Set[str]
    expected_not_structure: Set[str]


@pytest.mark.parametrize(
    "pattern_scenario",
    [
        pytest.param(
            PatternScenario(
                include_patterns={"file2.py", "dir2/file_dir2.txt"},
                ignore_patterns=set(),
                expected_num_files=2,
                expected_content={"file2.py", "dir2/file_dir2.txt"},
                expected_structure={"test_repo/", "dir2/"},
                expected_not_structure={"src/", "subdir/", "dir1/"},
            ),
            id="include-explicit-files",
        ),
        pytest.param(
            PatternScenario(
                include_patterns={
                    "file1.txt",
                    "file2.py",
                    "file_dir1.txt",
                    "*/file_dir2.txt",
                },
                ignore_patterns=set(),
                expected_num_files=3,
                expected_content={"file1.txt", "file2.py", "dir2/file_dir2.txt"},
                expected_structure={"test_repo/", "dir2/"},
                expected_not_structure={"src/", "subdir/", "dir1/"},
            ),
            id="include-wildcard-directory",
        ),
        pytest.param(
            PatternScenario(
                include_patterns={"*.py"},
                ignore_patterns=set(),
                expected_num_files=3,
                expected_content={
                    "file2.py",
                    "src/subfile2.py",
                    "src/subdir/file_subdir.py",
                },
                expected_structure={"test_repo/", "src/", "subdir/"},
                expected_not_structure={"dir1/", "dir2/"},
            ),
            id="include-wildcard-files",
        ),
        pytest.param(
            PatternScenario(
                include_patterns={"**/file_dir2.txt", "src/**/*.py"},
                ignore_patterns=set(),
                expected_num_files=2,
                expected_content={
                    "dir2/file_dir2.txt",
                    "src/subdir/file_subdir.py",
                },
                expected_structure={"test_repo/", "dir2/", "src/", "subdir/"},
                expected_not_structure={"dir1/"},
            ),
            id="include-recursive-wildcard",
        ),
        pytest.param(
            PatternScenario(
                include_patterns=set(),
                ignore_patterns={"file2.py", "dir2/file_dir2.txt"},
                expected_num_files=6,
                expected_content={
                    "file1.txt",
                    "src/subfile1.txt",
                    "src/subfile2.py",
                    "src/subdir/file_subdir.txt",
                    "src/subdir/file_subdir.py",
                    "dir1/file_dir1.txt",
                },
                expected_structure={"test_repo/", "src/", "subdir/", "dir1/"},
                expected_not_structure={"dir2/"},
            ),
            id="exclude-explicit-files",
        ),
        pytest.param(
            PatternScenario(
                include_patterns=set(),
                ignore_patterns={"file1.txt", "file2.py", "*/file_dir1.txt"},
                expected_num_files=5,
                expected_content={
                    "src/subfile1.txt",
                    "src/subfile2.py",
                    "src/subdir/file_subdir.txt",
                    "src/subdir/file_subdir.py",
                    "dir2/file_dir2.txt",
                },
                expected_structure={"test_repo/", "src/", "subdir/", "dir2/"},
                expected_not_structure={"dir1/"},
            ),
            id="exclude-wildcard-directory",
        ),
        pytest.param(
            PatternScenario(
                include_patterns=set(),
                ignore_patterns={"src/**/*.py"},
                expected_num_files=7,
                expected_content={
                    "file1.txt",
                    "file2.py",
                    "src/subfile1.txt",
                    "src/subfile2.py",
                    "src/subdir/file_subdir.txt",
                    "dir1/file_dir1.txt",
                    "dir2/file_dir2.txt",
                },
                expected_structure={
                    "test_repo/",
                    "dir1/",
                    "dir2/",
                    "src/",
                    "subdir/",
                },
                expected_not_structure=set(),
            ),
            id="exclude-recursive-wildcard",
        ),
    ],
)
def test_include_ignore_patterns(
    temp_directory: Path,
    sample_query: IngestionQuery,
    pattern_scenario: PatternScenario,
) -> None:
    """
    Test `ingest_query` with include patterns and with ignore patterns.

    Given a query pointed at ``temp_directory`` and configured with the scenario's include or ignore patterns:
    When `ingest_query` is invoked,
    Then the summary should report the expected number of analyzed files, the content should mention every
    expected file, and the structure should list the expected directories and none of the excluded ones.
    """
    sample_query.local_path = temp_directory
    sample_query.subpath = "/"
    sample_query.type = None
    # An empty set in the scenario means "no patterns"; the query is given None in that case.
    sample_query.include_patterns = pattern_scenario["include_patterns"] or None
    sample_query.ignore_patterns = pattern_scenario["ignore_patterns"] or None

    summary, structure, content = ingest_query(sample_query)

    assert "Repository: test_user/test_repo" in summary

    # The file count is read from its own "Files analyzed: N" line of the summary.
    num_files_match = re.search(r"^Files analyzed: (\d+)$", summary, re.MULTILINE)
    assert num_files_match is not None, "summary has no 'Files analyzed: N' line"
    assert int(num_files_match.group(1)) == pattern_scenario["expected_num_files"]

    # The checks below collect every offending item (sorted, so failures are
    # deterministic) instead of stopping at the first one in set order.

    # Check presence of key files in the content.
    # NOTE(review): `in` is presumably a substring check here, so "file2.py" is also
    # satisfied by "src/subfile2.py"; the file-count assertion above guards against that.
    missing_content = sorted(
        item for item in pattern_scenario["expected_content"] if item not in content
    )
    assert not missing_content, f"missing from content: {missing_content}"

    # Check presence of included directories in the structure.
    missing_structure = sorted(
        item for item in pattern_scenario["expected_structure"] if item not in structure
    )
    assert not missing_structure, f"missing from structure: {missing_structure}"

    # Check non-presence of non-included directories in the structure.
    unexpected_structure = sorted(
        item for item in pattern_scenario["expected_not_structure"] if item in structure
    )
    assert not unexpected_structure, f"unexpectedly in structure: {unexpected_structure}"