From 93a610ea4a9b38af39111fd4e5cef3a7a866d270 Mon Sep 17 00:00:00 2001 From: e2720pjk Date: Tue, 20 Jan 2026 10:59:51 +0800 Subject: [PATCH 1/2] feat: add .gitignore pattern support for repository analysis - Implement hybrid .gitignore processing using git check-ignore with pathspec fallback - Add --respect-gitignore CLI option to both config and generate commands - Update configuration models to store gitignore preference persistently - Enhance RepoAnalyzer with gitignore pattern matching and priority logic - Add comprehensive test suite for gitignore verification including negation patterns - Update documentation with detailed pattern behavior and processing logic - Add pathspec dependency for robust gitignore pattern matching The feature respects .gitignore patterns during file analysis while maintaining proper priority: 1. Git ignore patterns are checked first 2. User CLI exclude patterns override git tracking 3. Default ignore patterns are applied last 4. Include patterns filter the remaining files --- README.md | 21 +++- codewiki/cli/adapters/doc_generator.py | 1 + codewiki/cli/commands/config.py | 23 +++- codewiki/cli/commands/generate.py | 16 ++- codewiki/cli/config_manager.py | 8 +- codewiki/cli/models/config.py | 7 +- codewiki/cli/models/job.py | 1 + .../analysis/analysis_service.py | 50 ++++++-- .../analysis/repo_analyzer.py | 110 +++++++++++++++++- codewiki/src/config.py | 23 ++-- pyproject.toml | 3 +- tests/test_gitignore_verification.py | 70 +++++++++++ 12 files changed, 295 insertions(+), 38 deletions(-) create mode 100644 tests/test_gitignore_verification.py diff --git a/README.md b/README.md index ce04740..b089eb1 100644 --- a/README.md +++ b/README.md @@ -108,8 +108,8 @@ codewiki config set \ # Configure max token settings codewiki config set --max-tokens 32768 --max-token-per-module 36369 --max-token-per-leaf-module 16000 -# Configure max depth for hierarchical decomposition -codewiki config set --max-depth 3 +# Configure max depth for hierarchical decomposition and .gitignore support +codewiki config set --max-depth 3 --respect-gitignore # Show current configuration codewiki config show @@ -137,7 +137,7 @@ codewiki generate --github-pages codewiki generate --verbose # Full-featured generation -codewiki generate --create-branch --github-pages --verbose +codewiki generate --create-branch --github-pages --verbose --respect-gitignore ``` ### Customization Options @@ -145,8 +145,8 @@ codewiki generate --create-branch --github-pages --verbose CodeWiki supports customization for language-specific projects and documentation styles: ```bash -# C# project: only analyze .cs files, exclude test directories -codewiki generate --include "*.cs" --exclude "Tests,Specs,*.test.cs" +# C# project: only analyze .cs files, exclude test directories, respect .gitignore +codewiki generate --include "*.cs" --exclude "Tests,Specs,*.test.cs" --respect-gitignore # Focus on specific modules with architecture-style docs codewiki generate --focus "src/core,src/api" --doc-type architecture @@ -157,7 +157,7 @@ codewiki generate --instructions "Focus on public APIs and include usage example #### Pattern Behavior (Important!) -- **`--include`**: When specified, **ONLY** these patterns are used (replaces defaults completely) +- **`--include`**: When specified, **ONLY** these patterns are included from the remaining files (applied after exclusion) - Example: `--include "*.cs"` will analyze ONLY `.cs` files - If omitted, all supported file types are analyzed - Supports glob patterns: `*.py`, `src/**/*.ts`, `*.{js,jsx}` @@ -170,6 +170,14 @@ codewiki generate --instructions "Focus on public APIs and include usage example - Glob patterns: `*.test.js`, `*_test.py`, `*.min.*` - Directory patterns: `build/`, `dist/`, `coverage/` +- **`--respect-gitignore`**: Respect `.gitignore` patterns + - **Hybrid**: Uses `git check-ignore` for full recursive accuracy, falls back to pathspec if git unavailable + - **Processing Logic**: + 1. **Git Check**: If matched by `.gitignore` → **Excluded** + 2. **User Exclude**: If matched by CLI `--exclude` → **Excluded** (Overrides Git tracking) + 3. **Defaults**: If no match above → Check default ignore patterns + 4. **Inclusion**: Final check against `--include` patterns (if specified) + #### Setting Persistent Defaults Save your preferred settings as defaults: @@ -202,6 +210,7 @@ codewiki config agent --clear | `--doc-type` | Documentation style | Standalone option | `api`, `architecture`, `user-guide`, `developer` | | `--instructions` | Custom agent instructions | Standalone option | Free-form text | + ### Token Settings CodeWiki allows you to configure maximum token limits for LLM calls. This is useful for: diff --git a/codewiki/cli/adapters/doc_generator.py b/codewiki/cli/adapters/doc_generator.py index 826b60c..bacf7be 100644 --- a/codewiki/cli/adapters/doc_generator.py +++ b/codewiki/cli/adapters/doc_generator.py @@ -141,6 +141,7 @@ def generate(self) -> DocumentationJob: max_token_per_module=self.config.get('max_token_per_module', 36369), max_token_per_leaf_module=self.config.get('max_token_per_leaf_module', 16000), max_depth=self.config.get('max_depth', 2), + respect_gitignore=self.config.get('respect_gitignore', False), agent_instructions=self.config.get('agent_instructions') ) diff --git a/codewiki/cli/commands/config.py b/codewiki/cli/commands/config.py index f776273..7793477 100644 --- a/codewiki/cli/commands/config.py +++ b/codewiki/cli/commands/config.py @@ -83,6 +83,12 @@ def config_group(): type=int, help="Maximum depth for hierarchical decomposition (default: 2)" ) +@click.option( + '--respect-gitignore', + is_flag=True, + default=None, + help='Respect .gitignore patterns during analysis' +) def config_set( api_key: Optional[str], base_url: Optional[str], @@ -92,7 +98,8 @@ def config_set( max_tokens: Optional[int], max_token_per_module: Optional[int], max_token_per_leaf_module: Optional[int], - max_depth: Optional[int] + max_depth: Optional[int], + respect_gitignore: Optional[bool] ): """ Set configuration values for CodeWiki. @@ -127,7 +134,7 @@ def config_set( """ try: # Check if at least one option is provided - if not any([api_key, base_url, main_model, cluster_model, fallback_model, max_tokens, max_token_per_module, max_token_per_leaf_module, max_depth]): + if not any([api_key, base_url, main_model, cluster_model, fallback_model, max_tokens, max_token_per_module, max_token_per_leaf_module, max_depth, respect_gitignore is not None]): click.echo("No options provided. Use --help for usage information.") sys.exit(EXIT_CONFIG_ERROR) @@ -169,6 +176,9 @@ def config_set( raise ConfigurationError("max_depth must be a positive integer") validated_data['max_depth'] = max_depth + if respect_gitignore is not None: + validated_data['respect_gitignore'] = respect_gitignore + # Create config manager and save manager = ConfigManager() manager.load() # Load existing config if present @@ -182,7 +192,8 @@ def config_set( max_tokens=validated_data.get('max_tokens'), max_token_per_module=validated_data.get('max_token_per_module'), max_token_per_leaf_module=validated_data.get('max_token_per_leaf_module'), - max_depth=validated_data.get('max_depth') + max_depth=validated_data.get('max_depth'), + respect_gitignore=validated_data.get('respect_gitignore') ) # Display success messages @@ -231,6 +242,9 @@ def config_set( if max_depth: click.secho(f"✓ Max depth: {max_depth}", fg="green") + if respect_gitignore is not None: + click.secho(f"✓ Respect gitignore: {respect_gitignore}", fg="green") + click.echo("\n" + click.style("Configuration updated successfully.", fg="green", bold=True)) except ConfigurationError as e: @@ -291,6 +305,7 @@ def config_show(output_json: bool): "max_token_per_module": config.max_token_per_module if config else 36369, "max_token_per_leaf_module": config.max_token_per_leaf_module if config else 16000, "max_depth": config.max_depth if config else 2, + "respect_gitignore": config.respect_gitignore if config else False, "agent_instructions": config.agent_instructions.to_dict() if config and config.agent_instructions else {}, "config_file": str(manager.config_file_path) } @@ -335,7 +350,7 @@ def config_show(output_json: bool): click.secho("Decomposition Settings", fg="cyan", bold=True) if config: click.echo(f" Max Depth: {config.max_depth}") - + click.echo(f" Respect Gitignore: {config.respect_gitignore}") click.echo() click.secho("Agent Instructions", fg="cyan", bold=True) if config and config.agent_instructions and not config.agent_instructions.is_empty(): diff --git a/codewiki/cli/commands/generate.py b/codewiki/cli/commands/generate.py index 8512f73..529e845 100644 --- a/codewiki/cli/commands/generate.py +++ b/codewiki/cli/commands/generate.py @@ -126,6 +126,12 @@ def parse_patterns(patterns_str: str) -> List[str]: default=None, help="Maximum depth for hierarchical decomposition (overrides config)", ) +@click.option( + '--respect-gitignore', + is_flag=True, + default=None, + help='Respect .gitignore patterns during analysis' +) @click.pass_context def generate_command( ctx, @@ -142,7 +148,8 @@ def generate_command( max_tokens: Optional[int], max_token_per_module: Optional[int], max_token_per_leaf_module: Optional[int], - max_depth: Optional[int] + max_depth: Optional[int], + respect_gitignore: Optional[bool] ): """ Generate comprehensive documentation for a code repository. @@ -290,7 +297,8 @@ def generate_command( create_branch=create_branch, github_pages=github_pages, no_cache=no_cache, - custom_output=output if output != "docs" else None + custom_output=output if output != "docs" else None, + respect_gitignore=respect_gitignore if respect_gitignore is not None else config.respect_gitignore ) # Create runtime agent instructions from CLI options @@ -322,10 +330,12 @@ def generate_command( effective_max_token_per_module = max_token_per_module if max_token_per_module is not None else config.max_token_per_module effective_max_token_per_leaf = max_token_per_leaf_module if max_token_per_leaf_module is not None else config.max_token_per_leaf_module effective_max_depth = max_depth if max_depth is not None else config.max_depth + effective_respect_gitignore = respect_gitignore if respect_gitignore is not None else config.respect_gitignore logger.debug(f"Max tokens: {effective_max_tokens}") logger.debug(f"Max token/module: {effective_max_token_per_module}") logger.debug(f"Max token/leaf module: {effective_max_token_per_leaf}") logger.debug(f"Max depth: {effective_max_depth}") + logger.debug(f"Respect gitignore: {effective_respect_gitignore}") # Get agent instructions (merge runtime with persistent) agent_instructions_dict = None @@ -359,6 +369,8 @@ def generate_command( 'max_token_per_leaf_module': max_token_per_leaf_module if max_token_per_leaf_module is not None else config.max_token_per_leaf_module, # Max depth setting (runtime override takes precedence) 'max_depth': max_depth if max_depth is not None else config.max_depth, + # Gitignore setting (runtime override takes precedence) + 'respect_gitignore': respect_gitignore if respect_gitignore is not None else config.respect_gitignore, }, verbose=verbose, generate_html=github_pages diff --git a/codewiki/cli/config_manager.py b/codewiki/cli/config_manager.py index f1f86b2..409281e 100644 --- a/codewiki/cli/config_manager.py +++ b/codewiki/cli/config_manager.py @@ -92,7 +92,8 @@ def save( max_tokens: Optional[int] = None, max_token_per_module: Optional[int] = None, max_token_per_leaf_module: Optional[int] = None, - max_depth: Optional[int] = None + max_depth: Optional[int] = None, + respect_gitignore: Optional[bool] = None, ): """ Save configuration to file and keyring. @@ -108,6 +109,7 @@ def save( max_token_per_module: Maximum tokens per module for clustering max_token_per_leaf_module: Maximum tokens per leaf module max_depth: Maximum depth for hierarchical decomposition + respect_gitignore: Respect .gitignore patterns during analysis """ # Ensure config directory exists try: @@ -149,7 +151,9 @@ def save( self._config.max_token_per_leaf_module = max_token_per_leaf_module if max_depth is not None: self._config.max_depth = max_depth - + if respect_gitignore is not None: + self._config.respect_gitignore = respect_gitignore + # Validate configuration (only if base fields are set) if self._config.base_url and self._config.main_model and self._config.cluster_model: self._config.validate() diff --git a/codewiki/cli/models/config.py b/codewiki/cli/models/config.py index 585b427..41613e9 100644 --- a/codewiki/cli/models/config.py +++ b/codewiki/cli/models/config.py @@ -118,6 +118,7 @@ class Configuration: max_token_per_leaf_module: Maximum tokens per leaf module (default: 16000) max_depth: Maximum depth for hierarchical decomposition (default: 2) agent_instructions: Custom agent instructions for documentation generation + respect_gitignore: Respect .gitignore patterns during analysis """ base_url: str main_model: str @@ -129,6 +130,7 @@ class Configuration: max_token_per_leaf_module: int = 16000 max_depth: int = 2 agent_instructions: AgentInstructions = field(default_factory=AgentInstructions) + respect_gitignore: bool = False def validate(self): """ @@ -153,6 +155,7 @@ def to_dict(self) -> dict: 'max_token_per_module': self.max_token_per_module, 'max_token_per_leaf_module': self.max_token_per_leaf_module, 'max_depth': self.max_depth, + 'respect_gitignore': self.respect_gitignore, } if self.agent_instructions and not self.agent_instructions.is_empty(): result['agent_instructions'] = self.agent_instructions.to_dict() @@ -184,6 +187,7 @@ def from_dict(cls, data: dict) -> 'Configuration': max_token_per_leaf_module=data.get('max_token_per_leaf_module', 16000), max_depth=data.get('max_depth', 2), agent_instructions=agent_instructions, + respect_gitignore=data.get('respect_gitignore', False), ) def is_complete(self) -> bool: @@ -237,6 +241,7 @@ def to_backend_config(self, repo_path: str, output_dir: str, api_key: str, runti max_token_per_module=self.max_token_per_module, max_token_per_leaf_module=self.max_token_per_leaf_module, max_depth=self.max_depth, - agent_instructions=final_instructions.to_dict() if final_instructions else None + agent_instructions=final_instructions.to_dict() if final_instructions else None, + respect_gitignore=self.respect_gitignore, ) diff --git a/codewiki/cli/models/job.py b/codewiki/cli/models/job.py index c0c49d1..ecc7910 100644 --- a/codewiki/cli/models/job.py +++ b/codewiki/cli/models/job.py @@ -25,6 +25,7 @@ class GenerationOptions: github_pages: bool = False no_cache: bool = False custom_output: Optional[str] = None + respect_gitignore: Optional[bool] = None @dataclass diff --git a/codewiki/src/be/dependency_analyzer/analysis/analysis_service.py b/codewiki/src/be/dependency_analyzer/analysis/analysis_service.py index aa3ba47..89c9304 100644 --- a/codewiki/src/be/dependency_analyzer/analysis/analysis_service.py +++ b/codewiki/src/be/dependency_analyzer/analysis/analysis_service.py @@ -42,7 +42,8 @@ def analyze_local_repository( self, repo_path: str, max_files: int = 100, - languages: Optional[List[str]] = None + languages: Optional[List[str]] = None, + respect_gitignore: bool = False, ) -> Dict[str, Any]: """ Analyze a local repository folder. @@ -51,7 +52,8 @@ def analyze_local_repository( repo_path: Path to local repository folder max_files: Maximum number of files to analyze languages: List of languages to include (e.g., ['python', 'javascript']) - + respect_gitignore: Whether to respect .gitignore patterns + Returns: Dict with analysis results including nodes and relationships """ @@ -59,9 +61,14 @@ def analyze_local_repository( logger.debug(f"Analyzing local repository at {repo_path}") # Get repo analyzer to find files - repo_analyzer = RepoAnalyzer() + repo_analyzer = RepoAnalyzer(respect_gitignore=respect_gitignore, repo_path=repo_path) structure_result = repo_analyzer.analyze_repository_structure(repo_path) - + if structure_result is None: + structure_result = { + "file_tree": {"type": "directory", "name": "", "path": ".", "children": []}, + "summary": {"total_files": 0, "total_size_kb": 0.0}, + } + # Extract code files code_files = self.call_graph_analyzer.extract_code_files(structure_result["file_tree"]) @@ -98,6 +105,7 @@ def analyze_repository_full( github_url: str, include_patterns: Optional[List[str]] = None, exclude_patterns: Optional[List[str]] = None, + respect_gitignore: bool = False, ) -> AnalysisResult: """ Perform complete repository analysis including call graph generation. @@ -106,6 +114,7 @@ def analyze_repository_full( github_url: GitHub repository URL to analyze include_patterns: File patterns to include (e.g., ['*.py', '*.js']) exclude_patterns: Additional patterns to exclude + respect_gitignore: Whether to respect .gitignore patterns Returns: AnalysisResult: Complete analysis with functions, relationships, and visualization @@ -122,7 +131,9 @@ def analyze_repository_full( repo_info = self._parse_repository_info(github_url) logger.debug("Analyzing repository file structure...") - structure_result = self._analyze_structure(temp_dir, include_patterns, exclude_patterns) + structure_result = self._analyze_structure( + temp_dir, include_patterns, exclude_patterns, respect_gitignore + ) logger.debug(f"Found {structure_result['summary']['total_files']} files to analyze.") logger.debug("Starting call graph analysis...") @@ -172,6 +183,7 @@ def analyze_repository_structure_only( github_url: str, include_patterns: Optional[List[str]] = None, exclude_patterns: Optional[List[str]] = None, + respect_gitignore: bool = False, ) -> Dict[str, Any]: """ Perform lightweight structure-only analysis without call graph generation. @@ -180,6 +192,7 @@ def analyze_repository_structure_only( github_url: GitHub repository URL to analyze include_patterns: File patterns to include exclude_patterns: Additional patterns to exclude + respect_gitignore: Whether to respect .gitignore patterns Returns: Dict: Repository structure with file tree and summary statistics @@ -191,7 +204,9 @@ def analyze_repository_structure_only( temp_dir = self._clone_repository(github_url) repo_info = self._parse_repository_info(github_url) - structure_result = self._analyze_structure(temp_dir, include_patterns, exclude_patterns) + structure_result = self._analyze_structure( + temp_dir, include_patterns, exclude_patterns, respect_gitignore + ) result = { "repository": repo_info, @@ -233,13 +248,26 @@ def _analyze_structure( repo_dir: str, include_patterns: Optional[List[str]], exclude_patterns: Optional[List[str]], + respect_gitignore: bool = False, ) -> Dict[str, Any]: """Analyze repository file structure with filtering.""" logger.debug( - f"Initializing RepoAnalyzer with include: {include_patterns}, exclude: {exclude_patterns}" + f"Initializing RepoAnalyzer with include: {include_patterns}, exclude: {exclude_patterns}, " + f"respect_gitignore: {respect_gitignore}" + ) + repo_analyzer = RepoAnalyzer( + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + respect_gitignore=respect_gitignore, + repo_path=repo_dir, ) - repo_analyzer = RepoAnalyzer(include_patterns, exclude_patterns) - return repo_analyzer.analyze_repository_structure(repo_dir) + result = repo_analyzer.analyze_repository_structure(repo_dir) + if result is None: + result = { + "file_tree": {"type": "directory", "name": "", "path": ".", "children": []}, + "summary": {"total_files": 0, "total_size_kb": 0.0}, + } + return result def _read_readme_file(self, repo_dir: str) -> Optional[str]: """Find and read the README file from the repository root.""" @@ -354,7 +382,7 @@ def analyze_repository( def analyze_repository_structure_only( - github_url: str, include_patterns=None, exclude_patterns=None + github_url: str, include_patterns=None, exclude_patterns=None, respect_gitignore=False ) -> tuple[Dict, None]: """ Backward compatibility function. @@ -364,6 +392,6 @@ def analyze_repository_structure_only( """ service = AnalysisService() result = service.analyze_repository_structure_only( - github_url, include_patterns, exclude_patterns + github_url, include_patterns, exclude_patterns, respect_gitignore ) return result, None diff --git a/codewiki/src/be/dependency_analyzer/analysis/repo_analyzer.py b/codewiki/src/be/dependency_analyzer/analysis/repo_analyzer.py index 458ac14..02c6921 100644 --- a/codewiki/src/be/dependency_analyzer/analysis/repo_analyzer.py +++ b/codewiki/src/be/dependency_analyzer/analysis/repo_analyzer.py @@ -8,9 +8,16 @@ import os import fnmatch import json +import shutil +import subprocess from pathlib import Path from typing import Dict, List, Optional, Union -from codewiki.src.be.dependency_analyzer.utils.patterns import DEFAULT_IGNORE_PATTERNS, DEFAULT_INCLUDE_PATTERNS +from codewiki.src.be.dependency_analyzer.utils.patterns import ( + DEFAULT_IGNORE_PATTERNS, + DEFAULT_INCLUDE_PATTERNS, +) +from pathspec import PathSpec +from pathspec.patterns.gitwildmatch import GitWildMatchPattern class RepoAnalyzer: @@ -18,6 +25,8 @@ def __init__( self, include_patterns: Optional[List[str]] = None, exclude_patterns: Optional[List[str]] = None, + respect_gitignore: bool = False, + repo_path: Optional[str] = None, ) -> None: # Include patterns: if specified, use ONLY those patterns (replaces defaults) self.include_patterns = ( @@ -29,9 +38,28 @@ def __init__( if exclude_patterns is not None else list(DEFAULT_IGNORE_PATTERNS) ) + # User-specified exclude patterns only (separated from defaults) + self._user_exclude_patterns = exclude_patterns if exclude_patterns is not None else [] + self.respect_gitignore = respect_gitignore + self.repo_path = repo_path + self.gitignore_spec = None + self.include_spec = None + self._git_path = shutil.which("git") + self._git_available = self._check_git_availability() + + if include_patterns: + try: + self.include_spec = PathSpec.from_lines(GitWildMatchPattern, self.include_patterns) + except ImportError: + pass + + if self.respect_gitignore: + self._load_gitignore_patterns() - def analyze_repository_structure(self, repo_dir: str) -> Dict: + def analyze_repository_structure(self, repo_dir: str) -> Optional[Dict]: file_tree = self._build_file_tree(repo_dir) + if file_tree is None: + return None return { "file_tree": file_tree, "summary": { @@ -40,7 +68,50 @@ def analyze_repository_structure(self, repo_dir: str) -> Dict: }, } - def _build_file_tree(self, repo_dir: str) -> Dict: + def _load_gitignore_patterns(self): + if not self.repo_path: + self.gitignore_spec = None + return + + gitignore_file = os.path.join(self.repo_path, ".gitignore") + if os.path.exists(gitignore_file): + with open(gitignore_file, "r", encoding="utf-8") as f: + lines = f.readlines() + self.gitignore_spec = PathSpec.from_lines(GitWildMatchPattern, lines) + else: + self.gitignore_spec = None + + def _check_git_availability(self) -> bool: + if self._git_path is None: + return False + try: + result = subprocess.run([self._git_path, "--version"], capture_output=True, timeout=5) + return result.returncode == 0 + except (FileNotFoundError, subprocess.TimeoutExpired, subprocess.SubprocessError): + return False + + def _is_ignored_by_git(self, path: str) -> Optional[bool]: + if not self.repo_path: + return None + + if not self._git_available: + return None + + assert self._git_path is not None + + try: + full_path = os.path.join(self.repo_path, path) + result = subprocess.run( + [self._git_path, "check-ignore", "--quiet", full_path], + cwd=self.repo_path, + capture_output=True, + timeout=5, + ) + return result.returncode == 0 + except (subprocess.SubprocessError, subprocess.TimeoutExpired, FileNotFoundError): + return None + + def _build_file_tree(self, repo_dir: str) -> Optional[Dict]: def build_tree(path: Path, base_path: Path) -> Optional[Dict]: relative_path = path.relative_to(base_path) relative_path_str = str(relative_path) @@ -98,6 +169,34 @@ def build_tree(path: Path, base_path: Path) -> Optional[Dict]: return build_tree(Path(repo_dir), Path(repo_dir)) def _should_exclude_path(self, path: str, filename: str) -> bool: + if self.respect_gitignore: + git_ignored = self._is_ignored_by_git(path) + + if git_ignored is True: + return True + + if git_ignored is False: + for pattern in self._user_exclude_patterns: + if fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(filename, pattern): + return True + if pattern.endswith("/") and path.startswith(pattern.rstrip("/")): + return True + if path.startswith(pattern + "/") or path == pattern: + return True + if pattern in path.split("/"): + return True + return False + + if ( + git_ignored is None + and hasattr(self, "gitignore_spec") + and self.gitignore_spec is not None + ): + if self.gitignore_spec.match_file(path): + return True + if not path.endswith("/") and self.gitignore_spec.match_file(path + "/"): + return True + for pattern in self.exclude_patterns: if fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(filename, pattern): return True @@ -107,11 +206,16 @@ def _should_exclude_path(self, path: str, filename: str) -> bool: return True if pattern in path.split("/"): return True + return False def _should_include_file(self, path: str, filename: str) -> bool: if not self.include_patterns: return True + + if self.include_spec and self.include_spec.match_file(path): + return True + for pattern in self.include_patterns: if fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(filename, pattern): return True diff --git a/codewiki/src/config.py b/codewiki/src/config.py index 420d1ea..63a712d 100644 --- a/codewiki/src/config.py +++ b/codewiki/src/config.py @@ -63,7 +63,9 @@ class Config: max_token_per_leaf_module: int = DEFAULT_MAX_TOKEN_PER_LEAF_MODULE # Agent instructions for customization agent_instructions: Optional[Dict[str, Any]] = None - + # Git integration + respect_gitignore: bool = False + @property def include_patterns(self) -> Optional[List[str]]: """Get file include patterns from agent instructions.""" @@ -131,7 +133,8 @@ def from_args(cls, args: argparse.Namespace) -> 'Config': """Create configuration from parsed arguments.""" repo_name = os.path.basename(os.path.normpath(args.repo_path)) sanitized_repo_name = ''.join(c if c.isalnum() else '_' for c in repo_name) - + respect_gitignore = getattr(args, 'respect_gitignore', False) + return cls( repo_path=args.repo_path, output_dir=OUTPUT_BASE_DIR, @@ -142,7 +145,8 @@ def from_args(cls, args: argparse.Namespace) -> 'Config': llm_api_key=LLM_API_KEY, main_model=MAIN_MODEL, cluster_model=CLUSTER_MODEL, - fallback_model=FALLBACK_MODEL_1 + fallback_model=FALLBACK_MODEL_1, + respect_gitignore=respect_gitignore, ) @classmethod @@ -159,7 +163,8 @@ def from_cli( max_token_per_module: int = DEFAULT_MAX_TOKEN_PER_MODULE, max_token_per_leaf_module: int = DEFAULT_MAX_TOKEN_PER_LEAF_MODULE, max_depth: int = MAX_DEPTH, - agent_instructions: Optional[Dict[str, Any]] = None + agent_instructions: Optional[Dict[str, Any]] = None, + respect_gitignore: bool = False, ) -> 'Config': """ Create configuration for CLI context. @@ -177,13 +182,14 @@ def from_cli( max_token_per_leaf_module: Maximum tokens per leaf module max_depth: Maximum depth for hierarchical decomposition agent_instructions: Custom agent instructions dict - + respect_gitignore: Respect .gitignore patterns during analysis + Returns: Config instance """ repo_name = os.path.basename(os.path.normpath(repo_path)) base_output_dir = os.path.join(output_dir, "temp") - + return cls( repo_path=repo_path, output_dir=base_output_dir, @@ -198,5 +204,6 @@ def from_cli( max_tokens=max_tokens, max_token_per_module=max_token_per_module, max_token_per_leaf_module=max_token_per_leaf_module, - agent_instructions=agent_instructions - ) \ No newline at end of file + agent_instructions=agent_instructions, + respect_gitignore=respect_gitignore, + ) diff --git a/pyproject.toml b/pyproject.toml index 00c3e01..eadaa13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,8 @@ dependencies = [ "psutil>=7.0.0", "PyYAML>=6.0.2", "mermaid-parser-py>=0.0.2", - "mermaid-py>=0.8.0" + "mermaid-py>=0.8.0", + "pathspec>=0.12.0" ] [external] diff --git a/tests/test_gitignore_verification.py b/tests/test_gitignore_verification.py new file mode 100644 index 0000000..d423a89 --- /dev/null +++ b/tests/test_gitignore_verification.py @@ -0,0 +1,70 @@ +import tempfile +import subprocess +import shutil +import sys +import os +from pathlib import Path + + +def main() -> None: + if not Path("codewiki").exists(): + print("Please run this script from the project root directory.") + sys.exit(1) + sys.path.insert(0, os.getcwd()) + + from codewiki.src.be.dependency_analyzer.analysis.repo_analyzer import RepoAnalyzer + + git_cmd = shutil.which("git") + if git_cmd is None: + print("Error: git command not found. Please install git.") + sys.exit(1) + + temp_dir = Path(tempfile.mkdtemp(prefix="codewiki_verification_")) + print(f"Creating test fixtures in: {temp_dir}") + + try: + subprocess.run([git_cmd, "init", "-q"], cwd=temp_dir, check=True) + (temp_dir / ".gitignore").write_text("node_modules/\n*.log\n!important.log") + (temp_dir / "backend").mkdir() + (temp_dir / "backend" / ".gitignore").write_text("secrets.py") + + (temp_dir / "app.log").touch() + (temp_dir / "important.log").touch() + (temp_dir / "backend" / "secrets.py").touch() + (temp_dir / "backend" / "api.py").touch() + (temp_dir / "force_exclude.py").touch() + + print("-" * 60) + print("TEST 1: Gitignore Logic (Negation & Nested)") + analyzer = RepoAnalyzer(respect_gitignore=True, repo_path=str(temp_dir)) + + check1 = analyzer._should_exclude_path("app.log", "app.log") is True + check2 = analyzer._should_exclude_path("important.log", "important.log") is False + check3 = analyzer._should_exclude_path("backend/secrets.py", "secrets.py") is True + + print(f" [{'✅' if check1 else '❌'}] Basic pattern (*.log)") + print(f" [{'✅' if check2 else '❌'}] Negation pattern (!important.log)") + print(f" [{'✅' if check3 else '❌'}] Nested .gitignore") + + print("\nTEST 2: Priority Logic (CLI Override > Git)") + analyzer_override = RepoAnalyzer( + respect_gitignore=True, repo_path=str(temp_dir), exclude_patterns=["force_exclude.py"] + ) + check4 = ( + analyzer_override._should_exclude_path("force_exclude.py", "force_exclude.py") is True + ) + print(f" [{'✅' if check4 else '❌'}] CLI --exclude overrides Git tracking") + + if all([check1, check2, check3, check4]): + print("\n✨ ALL CHECKS PASSED") + sys.exit(0) + else: + print("\n🚫 SOME CHECKS FAILED") + sys.exit(1) + + finally: + shutil.rmtree(temp_dir, ignore_errors=True) + + +if __name__ == "__main__": + main() From cbc64bd0a73bf3c0d1257fe09e54b6c10057d119 Mon Sep 17 00:00:00 2001 From: e2720pjk Date: Thu, 22 Jan 2026 13:44:28 +0800 Subject: [PATCH 2/2] fix: improve gitignore pattern handling and add configurable respect option - Remove early return in repo_analyzer to allow default ignore patterns to be checked - Add respect_gitignore parameter to DependencyParser constructor for configurable gitignore handling - Update DependencyGraphBuilder to pass respect_gitignore configuration to parser - Enhance test coverage to verify gitignore behavior when both enabled and disabled - Modify test patterns to avoid conflicts with default ignore patterns for more accurate testing --- .../analysis/repo_analyzer.py | 2 +- .../src/be/dependency_analyzer/ast_parser.py | 14 +++++--- .../dependency_graphs_builder.py | 3 +- tests/test_gitignore_verification.py | 36 ++++++++++++------- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/codewiki/src/be/dependency_analyzer/analysis/repo_analyzer.py b/codewiki/src/be/dependency_analyzer/analysis/repo_analyzer.py index 02c6921..92d4078 100644 --- a/codewiki/src/be/dependency_analyzer/analysis/repo_analyzer.py +++ b/codewiki/src/be/dependency_analyzer/analysis/repo_analyzer.py @@ -185,7 +185,7 @@ def _should_exclude_path(self, path: str, filename: str) -> bool: return True if pattern in path.split("/"): return True - return False + # Allow default ignore patterns to be checked (remove early return) if ( git_ignored is None diff --git a/codewiki/src/be/dependency_analyzer/ast_parser.py b/codewiki/src/be/dependency_analyzer/ast_parser.py index 3323ed7..8ed2ecf 100644 --- a/codewiki/src/be/dependency_analyzer/ast_parser.py +++ b/codewiki/src/be/dependency_analyzer/ast_parser.py @@ -18,7 +18,7 @@ class DependencyParser: """Parser for extracting code components from multi-language repositories.""" - def __init__(self, repo_path: str, include_patterns: List[str] = None, exclude_patterns: List[str] = None): + def __init__(self, repo_path: str, include_patterns: List[str] = None, exclude_patterns: List[str] = None,respect_gitignore: bool = False): """ Initialize the dependency parser. @@ -26,13 +26,14 @@ def __init__(self, repo_path: str, include_patterns: List[str] = None, exclude_p repo_path: Path to the repository include_patterns: File patterns to include (e.g., ["*.cs", "*.py"]) exclude_patterns: File/directory patterns to exclude (e.g., ["*Tests*"]) + respect_gitignore: Whether to respect .gitignore patterns """ self.repo_path = os.path.abspath(repo_path) self.components: Dict[str, Node] = {} self.modules: Set[str] = set() self.include_patterns = include_patterns self.exclude_patterns = exclude_patterns - + self.respect_gitignore = respect_gitignore self.analysis_service = AnalysisService() def parse_repository(self, filtered_folders: List[str] = None) -> Dict[str, Node]: @@ -43,11 +44,14 @@ def parse_repository(self, filtered_folders: List[str] = None) -> Dict[str, Node logger.info(f"Using custom include patterns: {self.include_patterns}") if self.exclude_patterns: logger.info(f"Using custom exclude patterns: {self.exclude_patterns}") - + if self.respect_gitignore: + logger.info(f"Respecting .gitignore patterns") + structure_result = self.analysis_service._analyze_structure( - self.repo_path, + self.repo_path, include_patterns=self.include_patterns, - exclude_patterns=self.exclude_patterns + exclude_patterns=self.exclude_patterns, + respect_gitignore=self.respect_gitignore, ) call_graph_result = self.analysis_service._analyze_call_graph( diff --git a/codewiki/src/be/dependency_analyzer/dependency_graphs_builder.py b/codewiki/src/be/dependency_analyzer/dependency_graphs_builder.py index f638f4c..3a20b3f 100644 --- a/codewiki/src/be/dependency_analyzer/dependency_graphs_builder.py +++ b/codewiki/src/be/dependency_analyzer/dependency_graphs_builder.py @@ -44,7 +44,8 @@ def build_dependency_graph(self) -> tuple[Dict[str, Any], List[str]]: parser = DependencyParser( self.config.repo_path, include_patterns=include_patterns, - exclude_patterns=exclude_patterns + exclude_patterns=exclude_patterns, + respect_gitignore=self.config.respect_gitignore, ) filtered_folders = None diff --git a/tests/test_gitignore_verification.py b/tests/test_gitignore_verification.py index d423a89..d8371b7 100644 --- a/tests/test_gitignore_verification.py +++ b/tests/test_gitignore_verification.py @@ -24,27 +24,24 @@ def main() -> None: try: subprocess.run([git_cmd, "init", "-q"], cwd=temp_dir, check=True) - (temp_dir / ".gitignore").write_text("node_modules/\n*.log\n!important.log") + # Use *.txt files instead of *.log since *.log is in DEFAULT_IGNORE_PATTERNS + (temp_dir / ".gitignore").write_text("node_modules/\n*.txt\nbackend/config.ini") (temp_dir / "backend").mkdir() (temp_dir / "backend" / ".gitignore").write_text("secrets.py") - (temp_dir / "app.log").touch() - (temp_dir / "important.log").touch() - (temp_dir / "backend" / "secrets.py").touch() - (temp_dir / "backend" / "api.py").touch() - (temp_dir / "force_exclude.py").touch() - print("-" * 60) - print("TEST 1: Gitignore Logic (Negation & Nested)") + print("TEST 1: Gitignore Logic (Basic & Nested) with respect_gitignore=True") analyzer = RepoAnalyzer(respect_gitignore=True, repo_path=str(temp_dir)) - check1 = analyzer._should_exclude_path("app.log", "app.log") is True - check2 = analyzer._should_exclude_path("important.log", "important.log") is False + check1 = analyzer._should_exclude_path("notes.txt", "notes.txt") is True + check2 = analyzer._should_exclude_path("readme.txt", "readme.txt") is True check3 = analyzer._should_exclude_path("backend/secrets.py", "secrets.py") is True + check4 = analyzer._should_exclude_path("backend/config.ini", "config.ini") is True - print(f" [{'✅' if check1 else '❌'}] Basic pattern (*.log)") - print(f" [{'✅' if check2 else '❌'}] Negation pattern (!important.log)") + print(f" [{'✅' if check1 else '❌'}] Basic pattern (*.txt)") + print(f" [{'✅' if check2 else '❌'}] Another txt file (*.txt)") print(f" [{'✅' if check3 else '❌'}] Nested .gitignore") + print(f" [{'✅' if check4 else '❌'}] Path pattern (backend/config.ini)") print("\nTEST 2: Priority Logic (CLI Override > Git)") analyzer_override = RepoAnalyzer( @@ -55,7 +52,20 @@ def main() -> None: ) print(f" [{'✅' if check4 else '❌'}] CLI --exclude overrides Git tracking") - if all([check1, check2, check3, check4]): + print("\nTEST 3: Gitignore Disabled (respect_gitignore=False)") + analyzer_no_gitignore = RepoAnalyzer(respect_gitignore=False, repo_path=str(temp_dir)) + + check5 = analyzer_no_gitignore._should_exclude_path("notes.txt", "notes.txt") is False + check6 = analyzer_no_gitignore._should_exclude_path("readme.txt", "readme.txt") is False + check7 = ( + analyzer_no_gitignore._should_exclude_path("backend/secrets.py", "secrets.py") is False + ) + + print(f" [{'✅' if check5 else '❌'}] *.txt files NOT excluded by gitignore") + print(f" [{'✅' if check6 else '❌'}] readme.txt NOT excluded by gitignore") + print(f" [{'✅' if check7 else '❌'}] nested .gitignore NOT respected") + + if all([check1, check2, check3, check4, check5, check6, check7]): print("\n✨ ALL CHECKS PASSED") sys.exit(0) else: