From 33ba02d4c0379c9efb7f404ca0327fe5389c7878 Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Fri, 6 Jun 2025 23:17:16 +0100 Subject: [PATCH 01/14] Refactor CodeTide initialization to improve clarity and consistency in logging and variable naming --- codetide/__init__.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/codetide/__init__.py b/codetide/__init__.py index dbb0342..8d051b0 100644 --- a/codetide/__init__.py +++ b/codetide/__init__.py @@ -65,29 +65,29 @@ async def from_path( Initialized CodeTide instance """ rootpath = Path(rootpath) - codebase = cls(rootpath=rootpath) - logger.info(f"Initializing CodeBase from path: {str(rootpath)}") + codeTide = cls(rootpath=rootpath) + logger.info(f"Initializing CodeTide from path: {str(rootpath)}") st = time.time() - codebase._find_code_files(rootpath, languages=languages) - if not codebase.file_list: + codeTide._find_code_files(rootpath, languages=languages) + if not codeTide.file_list: logger.warning("No code files found matching the criteria") - return codebase + return codeTide - language_files = codebase._organize_files_by_language() - await codebase._initialize_parsers(language_files.keys()) + language_files = codeTide._organize_files_by_language() + await codeTide._initialize_parsers(language_files.keys()) - results = await codebase._process_files_concurrently( + results = await codeTide._process_files_concurrently( language_files, max_concurrent_tasks, batch_size ) - codebase._add_results_to_codebase(results) - codebase._resolve_files_dependencies() - logger.info(f"CodeBase initialized with {len(results)} files processed in {time.time() - st:.2f}s") + codeTide._add_results_to_codebase(results) + codeTide._resolve_files_dependencies() + logger.info(f"CodeTide initialized with {len(results)} files processed in {time.time() - st:.2f}s") - return codebase + return codeTide def serialize(self, filepath :Optional[Union[str, Path]]=DEFAULT_SERIALIZATION_PATH, include_codebase_cached_elements :bool=False, include_cached_ids :bool=False): if not os.path.exists(filepath): From 37f6ee9f88350a47d30439f1c0f833c8d5226b3c Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sat, 7 Jun 2025 00:16:54 +0100 Subject: [PATCH 02/14] Enhance serialize method to manage .gitignore entries and improve cached elements handling --- .gitignore | 3 +++ codetide/__init__.py | 38 ++++++++++++++++++++++++++++---------- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 0a19790..9ac7e6f 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,6 @@ cython_debug/ # PyPI configuration file .pypirc + +storage/ + diff --git a/codetide/__init__.py b/codetide/__init__.py index 8d051b0..cac5bb7 100644 --- a/codetide/__init__.py +++ b/codetide/__init__.py @@ -89,19 +89,37 @@ async def from_path( return codeTide - def serialize(self, filepath :Optional[Union[str, Path]]=DEFAULT_SERIALIZATION_PATH, include_codebase_cached_elements :bool=False, include_cached_ids :bool=False): + def serialize(self, filepath: Optional[Union[str, Path]] = DEFAULT_SERIALIZATION_PATH, + include_codebase_cached_elements: bool = False, + include_cached_ids: bool = False): if not os.path.exists(filepath): os.makedirs(os.path.split(filepath)[0], exist_ok=True) + writeFile(self.model_dump_json(indent=4), filepath) - if include_codebase_cached_elements or include_cached_ids: - dir_path = Path(os.path.split(filepath)[0]) - if include_codebase_cached_elements: - cached_elements_path = dir_path / DEFAULT_CACHED_ELEMENTS_FILE - writeFile(self.codebase.serialize_cache_elements(), cached_elements_path) - - if include_cached_ids: - cached_ids_path = dir_path / DEFAULT_CACHED_IDS_FILE - writeFile(json.dumps(self.codebase.unique_ids, indent=4), cached_ids_path) + + dir_path = Path(os.path.split(filepath)[0]) + + current_path = dir_path + gitignore_path = None + for parent in current_path.parents: + potential_gitignore = parent / ".gitignore" + if potential_gitignore.exists(): + gitignore_path = potential_gitignore + break + + if gitignore_path: + with open(gitignore_path, 'r+') as f: + lines = f.read().splitlines() + if f"{dir_path.name}/" not in lines: + f.write(f"\n{dir_path.name}/\n") + + if include_codebase_cached_elements: + cached_elements_path = dir_path / DEFAULT_CACHED_ELEMENTS_FILE + writeFile(self.codebase.serialize_cache_elements(), cached_elements_path) + + if include_cached_ids: + cached_ids_path = dir_path / DEFAULT_CACHED_IDS_FILE + writeFile(json.dumps(self.codebase.unique_ids, indent=4), cached_ids_path) @classmethod def deserialize(cls, filepath :Optional[Union[str, Path]]=DEFAULT_SERIALIZATION_PATH)->"CodeTide": From 495d6feda581da61791a13c13c7793edc7ce4f3f Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sat, 7 Jun 2025 13:01:37 +0100 Subject: [PATCH 03/14] Fix file_list initialization to use dict and update file_list assignment to store timestamps --- codetide/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/codetide/__init__.py b/codetide/__init__.py index cac5bb7..8b61290 100644 --- a/codetide/__init__.py +++ b/codetide/__init__.py @@ -11,6 +11,7 @@ from pydantic import BaseModel, Field, field_validator from typing import Optional, List, Union, Dict +from datetime import datetime, timezone from pathspec import GitIgnoreSpec from pathlib import Path import logging @@ -29,7 +30,7 @@ class CodeTide(BaseModel): """Root model representing a complete codebase""" rootpath : Union[str, Path] codebase :CodeBase = Field(default_factory=CodeBase) - file_list :List[Path] = Field(default_factory=list) + file_list :List[Path] = Field(default_factory=dict) _instantiated_parsers :Dict[str, BaseParser] = {} _gitignore_cache :Dict[str, GitIgnoreSpec] = {} @@ -332,7 +333,7 @@ def _find_code_files(self, rootpath: Path, languages: Optional[List[str]] = None code_files.append(file_path) - self.file_list = code_files + self.file_list[code_files] = datetime.now(timezone.utc) return code_files @staticmethod From 34cb8e238ef6198fb5e470b04fcb3d303f4c815b Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sat, 7 Jun 2025 13:04:54 +0100 Subject: [PATCH 04/14] Enhance serialize and deserialize methods to support custom file paths based on rootpath --- codetide/__init__.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/codetide/__init__.py b/codetide/__init__.py index 8b61290..f9a90da 100644 --- a/codetide/__init__.py +++ b/codetide/__init__.py @@ -90,9 +90,15 @@ async def from_path( return codeTide - def serialize(self, filepath: Optional[Union[str, Path]] = DEFAULT_SERIALIZATION_PATH, - include_codebase_cached_elements: bool = False, - include_cached_ids: bool = False): + def serialize(self, + filepath: Optional[Union[str, Path]] = DEFAULT_SERIALIZATION_PATH, + include_codebase_cached_elements: bool = False, + include_cached_ids: bool = False, + store_in_project_root: bool=True): + + if store_in_project_root: + filepath = Path(self.rootpath) / filepath + if not os.path.exists(filepath): os.makedirs(os.path.split(filepath)[0], exist_ok=True) @@ -123,7 +129,10 @@ def serialize(self, filepath: Optional[Union[str, Path]] = DEFAULT_SERIALIZATION writeFile(json.dumps(self.codebase.unique_ids, indent=4), cached_ids_path) @classmethod - def deserialize(cls, filepath :Optional[Union[str, Path]]=DEFAULT_SERIALIZATION_PATH)->"CodeTide": + def deserialize(cls, filepath :Optional[Union[str, Path]]=DEFAULT_SERIALIZATION_PATH, rootpath :Optional[Union[str, Path]] = None)->"CodeTide": + if rootpath is not None: + filepath = Path(rootpath) / filepath + if not os.path.exists(filepath): raise FileNotFoundError(f"{filepath} is not a valid path") From 4bebcd4da7d20634bc1ba6379e333ee4d2e3a8de Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sat, 7 Jun 2025 14:03:51 +0100 Subject: [PATCH 05/14] Refactor file_list to use a dictionary for improved file tracking and timestamp management --- codetide/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/codetide/__init__.py b/codetide/__init__.py index f9a90da..954dea6 100644 --- a/codetide/__init__.py +++ b/codetide/__init__.py @@ -30,7 +30,7 @@ class CodeTide(BaseModel): """Root model representing a complete codebase""" rootpath : Union[str, Path] codebase :CodeBase = Field(default_factory=CodeBase) - file_list :List[Path] = Field(default_factory=dict) + file_list :Dict[Path, datetime]= Field(default_factory=dict) _instantiated_parsers :Dict[str, BaseParser] = {} _gitignore_cache :Dict[str, GitIgnoreSpec] = {} @@ -320,7 +320,7 @@ def _find_code_files(self, rootpath: Path, languages: Optional[List[str]] = None if lang in LANGUAGE_EXTENSIONS: extensions.extend(LANGUAGE_EXTENSIONS[lang]) - code_files = [] + code_files = dict() for file_path in rootpath.rglob('*'): if not file_path.is_file() or (extensions and file_path.suffix.lower() not in extensions): @@ -340,9 +340,9 @@ def _find_code_files(self, rootpath: Path, languages: Optional[List[str]] = None if gitignore_spec.match_file(rel_path): continue - code_files.append(file_path) + code_files[file_path] = datetime.now(timezone.utc) - self.file_list[code_files] = datetime.now(timezone.utc) + self.file_list = code_files return code_files @staticmethod From 801582b94eba2022a6198bff71ffc3915173714d Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sat, 7 Jun 2025 17:05:35 +0100 Subject: [PATCH 06/14] Refactor _organize_files_by_language method to accept file_list as an argument and update its usage in processing files --- codetide/__init__.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/codetide/__init__.py b/codetide/__init__.py index 954dea6..c0f6074 100644 --- a/codetide/__init__.py +++ b/codetide/__init__.py @@ -75,8 +75,8 @@ async def from_path( logger.warning("No code files found matching the criteria") return codeTide - language_files = codeTide._organize_files_by_language() - await codeTide._initialize_parsers(language_files.keys()) + language_files = codeTide._organize_files_by_language(codeTide.file_list) + codeTide._initialize_parsers(language_files.keys()) results = await codeTide._process_files_concurrently( language_files, @@ -147,19 +147,18 @@ def deserialize(cls, filepath :Optional[Union[str, Path]]=DEFAULT_SERIALIZATION_ return tideInstance - def _organize_files_by_language( - self, - ) -> Dict[str, List[Path]]: + @classmethod + def _organize_files_by_language(cls, file_list :Union[List, Dict[str, str]]) -> Dict[str, List[Path]]: """Organize files by their programming language.""" language_files = {} - for filepath in self.file_list: - language = self._get_language_from_extension(filepath) + for filepath in file_list: + language = cls._get_language_from_extension(filepath) if language not in language_files: language_files[language] = [] language_files[language].append(filepath) return language_files - async def _initialize_parsers( + def _initialize_parsers( self, languages: List[str] ) -> None: From d88cc56a18eb4cc6981868c27f834bb88efe4a97 Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sat, 7 Jun 2025 17:09:57 +0100 Subject: [PATCH 07/14] Refactor resolve_inter_files_dependencies and resolve_intra_file_dependencies methods to accept codeFiles parameter for improved dependency resolution --- codetide/parsers/base_parser.py | 7 ++++--- codetide/parsers/generic_parser.py | 6 +++--- codetide/parsers/python_parser.py | 27 +++++++++++++++------------ 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/codetide/parsers/base_parser.py b/codetide/parsers/base_parser.py index 7c2a9c6..ecd013f 100644 --- a/codetide/parsers/base_parser.py +++ b/codetide/parsers/base_parser.py @@ -1,7 +1,8 @@ from codetide.core.models import CodeBase, CodeFileModel, ImportStatement + +from typing import List, Optional, Union from abc import ABC, abstractmethod -from typing import Optional, Union from tree_sitter import Parser from pydantic import BaseModel from pathlib import Path @@ -47,11 +48,11 @@ async def parse_file(self, file_path: Union[str, Path], root_path: Optional[Unio pass @abstractmethod - def resolve_inter_files_dependencies(self, codeBase: CodeBase) -> None: + def resolve_inter_files_dependencies(self, codeBase: CodeBase, codeFiles :Optional[List[CodeFileModel]]=None) -> None: pass @abstractmethod - def resolve_intra_file_dependencies(self, codeBase: CodeBase) -> None: + def resolve_intra_file_dependencies(self, codeFiles: List[CodeFileModel]) -> None: pass # @abstractmethod diff --git a/codetide/parsers/generic_parser.py b/codetide/parsers/generic_parser.py index d06e058..3652c5c 100644 --- a/codetide/parsers/generic_parser.py +++ b/codetide/parsers/generic_parser.py @@ -2,7 +2,7 @@ from codetide.parsers.base_parser import BaseParser from concurrent.futures import ThreadPoolExecutor -from typing import Optional, Union +from typing import List, Optional, Union from pathlib import Path import asyncio @@ -59,8 +59,8 @@ def parse_code(self, file_path :Path): ) return codeFile - def resolve_inter_files_dependencies(self, codeBase: CodeBase) -> None: + def resolve_inter_files_dependencies(self, codeBase: CodeBase, codeFiles :Optional[List[CodeFileModel]]=None) -> None: pass - def resolve_intra_file_dependencies(self, codeBase: CodeBase) -> None: + def resolve_intra_file_dependencies(self, codeFiles: List[CodeFileModel]) -> None: pass \ No newline at end of file diff --git a/codetide/parsers/python_parser.py b/codetide/parsers/python_parser.py index 39e73a8..61df7e1 100644 --- a/codetide/parsers/python_parser.py +++ b/codetide/parsers/python_parser.py @@ -396,7 +396,8 @@ def _generate_unique_import_id(cls, importModel :ImportStatement): importModel.raw = cls.import_statement_template(importModel) - def resolve_inter_files_dependencies(self, codeBase: CodeBase) -> None: + @classmethod + def resolve_inter_files_dependencies(cls, codeBase: CodeBase, codeFiles :Optional[List[CodeFileModel]]=None) -> None: ### for codeFile in codeBase search through imports and if defition_id matches an id from a class, a function or a variable let it be ### otherwise check if it matches a unique_id from imports, if so map dfeiniton_id to import unique id ### othewise map to None and is a package @@ -417,7 +418,7 @@ def resolve_inter_files_dependencies(self, codeBase: CodeBase) -> None: continue importStatement.definition_id = None - importStatement.unique_id = self._default_unique_import_id(importStatement) + importStatement.unique_id = cls._default_unique_import_id(importStatement) @staticmethod def count_occurences_in_code(code: str, substring: str) -> int: @@ -431,8 +432,8 @@ def count_occurences_in_code(code: str, substring: str) -> int: matches = re.findall(pattern, code) return len(matches) - def resolve_intra_file_dependencies(self, codeBase: CodeBase) -> None: - for codeFile in codeBase.root: + def resolve_intra_file_dependencies(self, codeFiles: List[CodeFileModel]) -> None: + for codeFile in codeFiles: if not codeFile.file_path.endswith(self.extension): continue @@ -464,7 +465,8 @@ def resolve_intra_file_dependencies(self, codeBase: CodeBase) -> None: codeFile=codeFile ) - def _find_elements_references(self, + @classmethod + def _find_elements_references(cls, element_type :Literal["variables", "functions", "classes"], non_import_ids :List[str], raw_contents :List[str], @@ -473,12 +475,12 @@ def _find_elements_references(self, ### broken for class defintion as we need to search through methods and attributes if element_type == "classes": for classAttribute in element.attributes: - elementCounts = self._get_element_count(raw_contents, classAttribute) + elementCounts = cls._get_element_count(raw_contents, classAttribute) if elementCounts <= 0: continue - self._find_references( + cls._find_references( non_import_ids=non_import_ids, raw_contents=raw_contents, matches_count=elementCounts, @@ -489,12 +491,12 @@ def _find_elements_references(self, for classMethod in element.methods: # print(f"{classMethod.name=}") - elementCounts = self._get_element_count(raw_contents, classMethod) + elementCounts = cls._get_element_count(raw_contents, classMethod) if elementCounts <= 0: continue - self._find_references( + cls._find_references( non_import_ids=non_import_ids, raw_contents=raw_contents, matches_count=elementCounts, @@ -504,12 +506,12 @@ def _find_elements_references(self, ) else: - elementCounts = self._get_element_count(raw_contents, element) + elementCounts = cls._get_element_count(raw_contents, element) if elementCounts <= 0: continue - self._find_references( + cls._find_references( non_import_ids=non_import_ids, raw_contents=raw_contents, matches_count=elementCounts, @@ -524,7 +526,8 @@ def _get_element_count(cls, raw_contents :List[str], element): elementCounts -= 1 return elementCounts - def _find_references(self, + @staticmethod + def _find_references( non_import_ids :List[str], raw_contents :List[str], matches_count :int, From 20f84241b3a20016764b1e5cf50d2291f8cd0390 Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sat, 7 Jun 2025 19:15:51 +0100 Subject: [PATCH 08/14] Fix resolve_intra_file_dependencies to handle None codeFiles by defaulting to codeBase.root --- codetide/parsers/python_parser.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/codetide/parsers/python_parser.py b/codetide/parsers/python_parser.py index 61df7e1..cf11f42 100644 --- a/codetide/parsers/python_parser.py +++ b/codetide/parsers/python_parser.py @@ -402,9 +402,12 @@ def resolve_inter_files_dependencies(cls, codeBase: CodeBase, codeFiles :Optiona ### otherwise check if it matches a unique_id from imports, if so map dfeiniton_id to import unique id ### othewise map to None and is a package ### this should handle all imports across file + if codeFiles is None: + codeFiles = codeBase.root + all_imports = codeBase.all_imports() all_elements = codeBase.all_classes() + codeBase.all_functions() + codeBase.all_variables() - for codeFile in codeBase.root: + for codeFile in codeFiles: global_imports_minus_current = [ importId for importId in all_imports if importId not in codeFile.all_imports() From 18ca28eb9142eb99416ece2c55612fd801f5f626 Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sat, 7 Jun 2025 19:18:22 +0100 Subject: [PATCH 09/14] Add _reset method to reinitialize CodeTide from rootpath and update intra-file dependency resolution --- codetide/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/codetide/__init__.py b/codetide/__init__.py index c0f6074..af4865e 100644 --- a/codetide/__init__.py +++ b/codetide/__init__.py @@ -90,6 +90,9 @@ async def from_path( return codeTide + async def _reset(self): + self = await self.from_path(self.rootpath) + def serialize(self, filepath: Optional[Union[str, Path]] = DEFAULT_SERIALIZATION_PATH, include_codebase_cached_elements: bool = False, @@ -356,7 +359,7 @@ def _get_language_from_extension(filepath: Path) -> Optional[str]: Language name or None if not recognized """ - extension = filepath.suffix.lower() + extension = Path(filepath).suffix.lower() for language, extensions in LANGUAGE_EXTENSIONS.items(): if extension in extensions: @@ -367,5 +370,5 @@ def _get_language_from_extension(filepath: Path) -> Optional[str]: def _resolve_files_dependencies(self): for _, parser in self._instantiated_parsers.items(): parser.resolve_inter_files_dependencies(self.codebase) - parser.resolve_intra_file_dependencies(self.codebase) + parser.resolve_intra_file_dependencies(self.codebase.root) From 87f7f8a202689fe79c4c7db9c11480c9aa0c1688 Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sat, 7 Jun 2025 19:28:33 +0100 Subject: [PATCH 10/14] Refactor CodeTide methods to improve file handling and organization --- codetide/__init__.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/codetide/__init__.py b/codetide/__init__.py index af4865e..d8f9c42 100644 --- a/codetide/__init__.py +++ b/codetide/__init__.py @@ -70,12 +70,12 @@ async def from_path( logger.info(f"Initializing CodeTide from path: {str(rootpath)}") st = time.time() - codeTide._find_code_files(rootpath, languages=languages) - if not codeTide.file_list: + codeTide.files = codeTide._find_code_files(rootpath, languages=languages) + if not codeTide.files: logger.warning("No code files found matching the criteria") return codeTide - language_files = codeTide._organize_files_by_language(codeTide.file_list) + language_files = codeTide._organize_files_by_language(codeTide.files) codeTide._initialize_parsers(language_files.keys()) results = await codeTide._process_files_concurrently( @@ -151,10 +151,10 @@ def deserialize(cls, filepath :Optional[Union[str, Path]]=DEFAULT_SERIALIZATION_ return tideInstance @classmethod - def _organize_files_by_language(cls, file_list :Union[List, Dict[str, str]]) -> Dict[str, List[Path]]: + def _organize_files_by_language(cls, files :Union[List, Dict[str, str]]) -> Dict[str, List[Path]]: """Organize files by their programming language.""" language_files = {} - for filepath in file_list: + for filepath in files: language = cls._get_language_from_extension(filepath) if language not in language_files: language_files[language] = [] @@ -300,7 +300,8 @@ def _get_gitignore_for_path(self, path: Path) -> GitIgnoreSpec: return combined_spec - def _find_code_files(self, rootpath: Path, languages: Optional[List[str]] = None) -> List[Path]: + @classmethod + def _find_code_files(cls, rootpath: Path, languages: Optional[List[str]] = None) -> List[Path]: """ Find all code files in a directory tree, respecting .gitignore rules in each directory. @@ -329,7 +330,7 @@ def _find_code_files(self, rootpath: Path, languages: Optional[List[str]] = None continue # Get the combined gitignore spec for this path - gitignore_spec = self._get_gitignore_for_path(file_path) + gitignore_spec = cls._get_gitignore_for_path(file_path) # Convert path to relative path for gitignore matching try: @@ -344,7 +345,6 @@ def _find_code_files(self, rootpath: Path, languages: Optional[List[str]] = None code_files[file_path] = datetime.now(timezone.utc) - self.file_list = code_files return code_files @staticmethod From 331a44ff00c5c92f5426301c3bd3dcce5d7dd244 Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sat, 7 Jun 2025 19:33:09 +0100 Subject: [PATCH 11/14] Fix file_list attribute name and update _find_code_files method to use instance context --- codetide/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/codetide/__init__.py b/codetide/__init__.py index d8f9c42..cba6231 100644 --- a/codetide/__init__.py +++ b/codetide/__init__.py @@ -30,7 +30,7 @@ class CodeTide(BaseModel): """Root model representing a complete codebase""" rootpath : Union[str, Path] codebase :CodeBase = Field(default_factory=CodeBase) - file_list :Dict[Path, datetime]= Field(default_factory=dict) + files :Dict[Path, datetime]= Field(default_factory=dict) _instantiated_parsers :Dict[str, BaseParser] = {} _gitignore_cache :Dict[str, GitIgnoreSpec] = {} @@ -300,8 +300,7 @@ def _get_gitignore_for_path(self, path: Path) -> GitIgnoreSpec: return combined_spec - @classmethod - def _find_code_files(cls, rootpath: Path, languages: Optional[List[str]] = None) -> List[Path]: + def _find_code_files(self, rootpath: Path, languages: Optional[List[str]] = None) -> List[Path]: """ Find all code files in a directory tree, respecting .gitignore rules in each directory. @@ -330,7 +329,7 @@ def _find_code_files(cls, rootpath: Path, languages: Optional[List[str]] = None) continue # Get the combined gitignore spec for this path - gitignore_spec = cls._get_gitignore_for_path(file_path) + gitignore_spec = self._get_gitignore_for_path(file_path) # Convert path to relative path for gitignore matching try: From 58e8bd7fc9ca0902b09cf323ee604861f84dd389 Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sat, 7 Jun 2025 20:02:45 +0100 Subject: [PATCH 12/14] Enhance file tracking by updating modified timestamps and adding _get_changed_files method for change detection --- codetide/__init__.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/codetide/__init__.py b/codetide/__init__.py index cba6231..5c2ec10 100644 --- a/codetide/__init__.py +++ b/codetide/__init__.py @@ -10,7 +10,7 @@ from codetide import parsers from pydantic import BaseModel, Field, field_validator -from typing import Optional, List, Union, Dict +from typing import Optional, List, Tuple, Union, Dict from datetime import datetime, timezone from pathspec import GitIgnoreSpec from pathlib import Path @@ -342,7 +342,11 @@ def _find_code_files(self, rootpath: Path, languages: Optional[List[str]] = None if gitignore_spec.match_file(rel_path): continue - code_files[file_path] = datetime.now(timezone.utc) + # Get the last modified time and convert to UTC datetime + modified_timestamp = file_path.stat().st_mtime + modified_datetime = datetime.fromtimestamp(modified_timestamp, timezone.utc) + + code_files[file_path] = modified_datetime return code_files @@ -371,3 +375,30 @@ def _resolve_files_dependencies(self): parser.resolve_inter_files_dependencies(self.codebase) parser.resolve_intra_file_dependencies(self.codebase.root) + def _get_changed_files(self) -> Tuple[List[Path], bool]: + """ + this is a bit slow but works: need to optimize _find_code_files for speed + """ + file_deletion_detected = False + files = self._find_code_files(self.rootpath) # Dict[Path, datetime] + + changed_files = [] + + # Check for new files and modified files + for file_path, current_modified_time in files.items(): + if file_path not in self.files: + # New file + changed_files.append(file_path) + elif current_modified_time > self.files[file_path]: + # File has been modified since last scan + changed_files.append(file_path) + + # Check for deleted files + for stored_file_path in self.files: + if stored_file_path not in files: + file_deletion_detected = True + break + + self.files = files + return changed_files, file_deletion_detected + From 082b41e32d30fa7669f0a8bda54ff90d6b8a18cb Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sat, 7 Jun 2025 20:03:12 +0100 Subject: [PATCH 13/14] Add check_for_updates method to handle file changes and dependencies --- codetide/__init__.py | 61 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/codetide/__init__.py b/codetide/__init__.py index 5c2ec10..ba23322 100644 --- a/codetide/__init__.py +++ b/codetide/__init__.py @@ -402,3 +402,64 @@ def _get_changed_files(self) -> Tuple[List[Path], bool]: self.files = files return changed_files, file_deletion_detected + async def check_for_updates(self, + max_concurrent_tasks: int = DEFAULT_MAX_CONCURRENT_TASKS, + batch_size: int = DEFAULT_BATCH_SIZE): + + changed_files, deletion_detected = self._get_changed_files() + if deletion_detected: + logger.info("deletion operation detected reseting CodeTide [this is a temporary solution]") + await self._reset() + + changed_language_files = self._organize_files_by_language(changed_files) + self._initialize_parsers(changed_language_files.keys()) + + results :List[CodeFileModel] = await self._process_files_concurrently( + changed_language_files, + max_concurrent_tasks=max_concurrent_tasks, + batch_size=batch_size + ) + changedPaths = { + codeFile.file_path: None for codeFile in results + } + + for i, codeFile in enumerate(self.codebase.root): + if codeFile.file_path in changedPaths: + changedPaths[codeFile.file_path] = i + + newFiles :List[CodeFileModel] = [] + for codeFile in results: + i = changedPaths.get(codeFile.file_path) + if i is not None: ### is file update + ### TODO if new imports are found need to build inter and then intra + ### otherwise can just build intra and add directly + if codeFile.all_imports() == self.codebase.root[i].all_imports(): + language = self._get_language_from_extension(codeFile.file_path) + parser = self._instantiated_parsers.get(language) + self.codebase.root[i] = codeFile + logger.info(f"updating {codeFile.file_path} no new dependencies detected") + continue + + self.codebase.root[i] = codeFile + logger.info(f"updating {codeFile.file_path} with new dependencies") + + else: + self.codebase.root.append(codeFile) + changedPaths[codeFile.file_path] = len(self.codebase.root) - 1 + logger.info(f"adding new file {codeFile.file_path}") + + newFiles.append(codeFile) + + + for language, filepaths in changed_language_files.items(): + parser = self._instantiated_parsers.get(language) + filteredNewFiles = [ + newFile for newFile in newFiles + if self.rootpath / newFile.file_path in filepaths + ] + parser.resolve_inter_files_dependencies(self.codebase, filteredNewFiles) + parser.resolve_intra_file_dependencies(filteredNewFiles) + + for codeFile in filteredNewFiles: + i = changedPaths.get(codeFile.file_path) + self.codebase.root[i] = codeFile From d6d8999407c266ff7a8b8ff53c43da0bf36f4a4d Mon Sep 17 00:00:00 2001 From: BrunoV21 Date: Sun, 8 Jun 2025 00:39:57 +0100 Subject: [PATCH 14/14] Refactor CodeTide to utilize pygit2 for improved file tracking and remove unused gitignore handling --- codetide/__init__.py | 126 ++++++++++++++----------------------------- requirements.txt | 1 + 2 files changed, 41 insertions(+), 86 deletions(-) diff --git a/codetide/__init__.py b/codetide/__init__.py index ba23322..8348455 100644 --- a/codetide/__init__.py +++ b/codetide/__init__.py @@ -12,10 +12,10 @@ from pydantic import BaseModel, Field, field_validator from typing import Optional, List, Tuple, Union, Dict from datetime import datetime, timezone -from pathspec import GitIgnoreSpec from pathlib import Path import logging import asyncio +import pygit2 import time import json import os @@ -32,7 +32,6 @@ class CodeTide(BaseModel): codebase :CodeBase = Field(default_factory=CodeBase) files :Dict[Path, datetime]= Field(default_factory=dict) _instantiated_parsers :Dict[str, BaseParser] = {} - _gitignore_cache :Dict[str, GitIgnoreSpec] = {} @field_validator("rootpath", mode="after") @classmethod @@ -238,68 +237,6 @@ def _add_results_to_codebase( self.codebase.root.append(code_file) logger.debug(f"Added {len(results)} files to codebase") - @staticmethod - def _load_gitignore_spec(directory: Path) -> GitIgnoreSpec: - """ - Load and parse .gitignore file from a directory into a GitIgnoreSpec object. - - Args: - directory: Directory containing the .gitignore file - - Returns: - GitIgnoreSpec object with the patterns from the .gitignore file - """ - gitignore_path = directory / ".gitignore" - patterns = [".git/"] - - if gitignore_path.exists() and gitignore_path.is_file(): - try: - _gitignore = readFile(gitignore_path) - for line in _gitignore.splitlines(): - line = line.strip() - # Skip empty lines and comments - if line and not line.startswith('#'): - patterns.append(line) - except Exception as e: - logger.warning(f"Error reading .gitignore file {gitignore_path}: {e}") - - return GitIgnoreSpec.from_lines(patterns) - - def _get_gitignore_for_path(self, path: Path) -> GitIgnoreSpec: - """ - Get the combined GitIgnoreSpec for a path by checking all parent directories. - - Args: - path: The file path to check - - Returns: - Combined GitIgnoreSpec for all relevant .gitignore files - """ - # Check cache first - if path in self._gitignore_cache: - return self._gitignore_cache[path] - - # Collect all .gitignore specs from parent directories - specs = [] - - # Check the directory containing the file - parent_dir = path.parent if path.is_file() else path - - # Walk up the directory tree - for directory in [parent_dir, *parent_dir.parents]: - if directory not in self._gitignore_cache: - # Load and cache the spec for this directory - self._gitignore_cache[directory] = self._load_gitignore_spec(directory) - - specs.append(self._gitignore_cache[directory]) - - # Combine all specs into one - combined_spec = GitIgnoreSpec([]) - for spec in reversed(specs): # Apply from root to leaf - combined_spec += spec - - return combined_spec - def _find_code_files(self, rootpath: Path, languages: Optional[List[str]] = None) -> List[Path]: """ Find all code files in a directory tree, respecting .gitignore rules in each directory. @@ -309,11 +246,11 @@ def _find_code_files(self, rootpath: Path, languages: Optional[List[str]] = None languages: List of languages to include (None for all supported) Returns: - List of paths to code files + List of paths to code files with their last modified timestamps """ if not rootpath.exists() or not rootpath.is_dir(): logger.error(f"Root path does not exist or is not a directory: {rootpath}") - return [] + return {} # Determine valid extensions extensions = [] @@ -322,32 +259,48 @@ def _find_code_files(self, rootpath: Path, languages: Optional[List[str]] = None if lang in LANGUAGE_EXTENSIONS: extensions.extend(LANGUAGE_EXTENSIONS[lang]) - code_files = dict() - - for file_path in rootpath.rglob('*'): - if not file_path.is_file() or (extensions and file_path.suffix.lower() not in extensions): - continue - - # Get the combined gitignore spec for this path - gitignore_spec = self._get_gitignore_for_path(file_path) - - # Convert path to relative path for gitignore matching - try: - rel_path = file_path.relative_to(rootpath) - except ValueError: - # This shouldn't happen since we're scanning from rootpath + code_files = {} + + try: + # Try to open the repository + repo = pygit2.Repository(rootpath) + + # Get the repository's index (staging area) + index = repo.index + + # Convert all tracked files to Path objects + tracked_files = {Path(rootpath) / Path(entry.path) for entry in index} + + # Get status and filter files + status = repo.status() + + # Untracked files are those with status == pygit2.GIT_STATUS_WT_NEW + untracked_not_ignored = { + Path(rootpath) / Path(filepath) + for filepath, file_status in status.items() + if file_status == pygit2.GIT_STATUS_WT_NEW and not repo.path_is_ignored(filepath) + } + + all_files = tracked_files.union(untracked_not_ignored) + + except (pygit2.GitError, KeyError): + # Fallback to simple directory walk if not a git repo + all_files = set(rootpath.rglob('*')) + + for file_path in all_files: + if not file_path.is_file(): continue - - # Check if the file is ignored by any gitignore rules - if gitignore_spec.match_file(rel_path): + + # Check extension filter if languages were specified + if extensions and file_path.suffix.lower() not in extensions: continue - + # Get the last modified time and convert to UTC datetime modified_timestamp = file_path.stat().st_mtime modified_datetime = datetime.fromtimestamp(modified_timestamp, timezone.utc) code_files[file_path] = modified_datetime - + return code_files @staticmethod @@ -377,7 +330,8 @@ def _resolve_files_dependencies(self): def _get_changed_files(self) -> Tuple[List[Path], bool]: """ - this is a bit slow but works: need to optimize _find_code_files for speed + TODO consider if it is worth storing singular timestamp for latest fetch and then just use + pygit2 to changed files based on commit history + current repo status """ file_deletion_detected = False files = self._find_code_files(self.rootpath) # Dict[Path, datetime] diff --git a/requirements.txt b/requirements.txt index bca6fa6..17c960b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ pathspec==0.12.1 pydantic==2.10.3 +pygit2==1.18.0 pyyaml==6.0.2 tree-sitter==0.24.0 tree-sitter-python==0.23.6 \ No newline at end of file