From 977fcba7f57610b909db0c833c66f28c7b1ad513 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:30:10 +0200 Subject: [PATCH] utils: honour recursive glob patterns in _should_include (#275) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, `atomics/**/*.md` and similar patterns never matched, because `_should_include` treated all patterns as fixed-depth globs. This patch adds dedicated logic that distinguishes recursive (`**`) from non-recursive patterns and introduces directory-aware checks so we only descend into directories that *could* contain matches. ✅ Fixes #275 (pattern matching) ⚠️ Known issue the matcher now collects the correct `.md` files, but the directory-structure output still lists the directories themselves even when they contain no matched files (e.g. `atomics/Indexes/Attack-Navigator-Layers/`, `atomics/T1003.003/src/`). A follow-up PR will tackle this. --- .pre-commit-config.yaml | 6 +-- CONTRIBUTING.md | 2 +- src/gitingest/utils/ingestion_utils.py | 74 +++++++++++++++++++++++--- 3 files changed, 72 insertions(+), 10 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1a70d007..b8b3f228 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,7 +44,7 @@ repos: - id: black - repo: https://github.com/asottile/pyupgrade - rev: v3.19.1 + rev: v3.20.0 hooks: - id: pyupgrade description: "Automatically upgrade syntax for newer versions." @@ -73,7 +73,7 @@ repos: - id: djlint-reformat-jinja - repo: https://github.com/igorshubovych/markdownlint-cli - rev: v0.44.0 + rev: v0.45.0 hooks: - id: markdownlint description: "Lint markdown files." @@ -88,7 +88,7 @@ repos: files: ^src/ - repo: https://github.com/pycqa/pylint - rev: v3.3.6 + rev: v3.3.7 hooks: - id: pylint name: pylint for source diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3ece5d35..11ac19b1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -59,7 +59,7 @@ Thanks for your interest in contributing to Gitingest! 🚀 Gitingest aims to be 2. Run the local web server: ``` bash - uvicorn server.main:app + uvicorn server.main:app --reload ``` 3. Open your browser and navigate to `http://localhost:8000` to see the app running. diff --git a/src/gitingest/utils/ingestion_utils.py b/src/gitingest/utils/ingestion_utils.py index b4bb552c..f950e564 100644 --- a/src/gitingest/utils/ingestion_utils.py +++ b/src/gitingest/utils/ingestion_utils.py @@ -12,6 +12,16 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> This function checks whether the relative path of a file or directory matches any of the specified patterns. If a match is found, it returns `True`, indicating that the file or directory should be included in further processing. + The function handles both recursive (**) and non-recursive patterns differently: + - For non-recursive patterns (e.g. "src/*.py"), files must match the exact pattern depth + - For recursive patterns (e.g. "src/**/*.py"), files can match at any depth under the pattern prefix + + Directory matching has special handling: + - For directories, we check if they could contain matching files based on the pattern + - Patterns ending in /* are treated as matching any files in that directory + - For non-recursive patterns, directories must match the exact pattern depth + - For recursive patterns with **, directories are checked against the pattern prefix + Parameters ---------- path : Path @@ -19,12 +29,34 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> base_path : Path The base directory from which the relative path is calculated. include_patterns : Set[str] - A set of patterns to check against the relative path. + A set of patterns to check against the relative path. Patterns can include: + - * to match any characters except / + - ** to match any characters including / + - /* at the end to match any files in a directory Returns ------- bool `True` if the path matches any of the include patterns, `False` otherwise. + + Raises + ------ + ValueError + If a non-recursive pattern is used and the directory depth exceeds the pattern depth. + This indicates a traversal error since parent directories should have been filtered. + + Examples + -------- + >>> _should_include(Path("/root/src/file.py"), Path("/root"), include_patterns={"src/*.py"}) + True + >>> _should_include(Path("/root/src/nested/file.py"), Path("/root"), include_patterns={"src/**/*.py"}) + True + >>> _should_include(Path("/root/src"), Path("/root"), include_patterns={"src/*"}) + True # Directory matches as it could contain matching files + + # TODO: Fix bug where directories are included in the directory structure output when they should not be, + # e.g. atomics/**/Indexes-Markdown/*.md should not include atomics/Indexes/Attack-Navigator-Layers/ + # or atomics/T1003.003/src/ for the repository https://github.com/redcanaryco/atomic-red-team. """ try: rel_path = path.relative_to(base_path) @@ -33,12 +65,40 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> return False rel_str = str(rel_path) - if path.is_dir(): - rel_str += "/" - for pattern in include_patterns: - if fnmatch(rel_str, pattern): - return True + for pattern in include_patterns - {""}: # ignore empty pattern + if path.is_dir(): + # For directory traversal, check if the directory is part of the path that leads to matching files + if pattern.endswith("/*"): + # If the pattern ends with *, add a trailing * to the pattern to match any files in the directory + pattern += "*" + + pattern_parts = pattern.split("/") + dir_parts = rel_str.split("/") + + # For non-recursive patterns (no **), validate directory depth matches pattern depth + # Recursive patterns can match directories at any depth + if all(["**" not in pattern, len(pattern_parts) > 1, len(dir_parts) > len(pattern_parts)]): + raise ValueError( + f"Directory '{rel_str}' has {len(dir_parts)} path segments but pattern '{pattern}' " + f"only has {len(pattern_parts)} segments. This indicates a traversal error since " + f"parent directories should have been filtered out by pattern matching." + ) + + relevant_pattern_length = ( + min(len(dir_parts), pattern_parts.index("**")) if "**" in pattern_parts else len(dir_parts) + ) + pattern_prefix = "/".join(pattern_parts[:relevant_pattern_length]) + + if "**" in pattern_parts: + pattern_prefix += "*" + + if fnmatch(rel_str, pattern_prefix): + return True + else: + if fnmatch(rel_str, pattern): + return True + return False @@ -50,6 +110,8 @@ def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> b any of the specified ignore patterns. If a match is found, it returns `True`, indicating that the file or directory should be excluded from further processing. + TODO: Check if we need to handle exclude patterns with **, and if so, how. + Parameters ---------- path : Path