From 979f85cdb7ad404f3515d46d7a28c607a5ed6244 Mon Sep 17 00:00:00 2001 From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com> Date: Fri, 25 Apr 2025 18:47:20 +0000 Subject: [PATCH 1/2] Fix language detection bug in monorepos with mixed languages --- src/codegen/git/utils/language.py | 5 +- src/codegen/sdk/codebase/config.py | 14 +++- src/codegen/sdk/core/codebase.py | 4 +- .../sdk/codebase/test_language_detection.py | 83 +++++++++++++++++++ 4 files changed, 99 insertions(+), 7 deletions(-) create mode 100644 tests/unit/codegen/sdk/codebase/test_language_detection.py diff --git a/src/codegen/git/utils/language.py b/src/codegen/git/utils/language.py index 551ac4212..4feef9448 100644 --- a/src/codegen/git/utils/language.py +++ b/src/codegen/git/utils/language.py @@ -132,8 +132,11 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua repo_config = RepoConfig.from_repo_path(repo_path=git_root) repo_operator = RepoOperator(repo_config=repo_config) + # Use the specified subfolder path for language detection if provided + subdirs = [base_path] if base_path else None + # Walk through the directory - for rel_path, _ in repo_operator.iter_files(subdirs=[base_path] if base_path else None, ignore_list=GLOBAL_FILE_IGNORE_LIST): + for rel_path, _ in repo_operator.iter_files(subdirs=subdirs, ignore_list=GLOBAL_FILE_IGNORE_LIST): # Convert to Path object file_path = Path(git_root) / Path(rel_path) diff --git a/src/codegen/sdk/codebase/config.py b/src/codegen/sdk/codebase/config.py index 25f3c2e0e..af3d4d6a6 100644 --- a/src/codegen/sdk/codebase/config.py +++ b/src/codegen/sdk/codebase/config.py @@ -46,23 +46,29 @@ def from_path(cls, path: str, programming_language: ProgrammingLanguage | None = repo_path = os.path.abspath(path) git_root, base_path = split_git_path(repo_path) subdirectories = [base_path] if base_path else None - programming_language = programming_language or determine_project_language(repo_path) + + # Only determine project language if not explicitly provided + detected_language = programming_language or determine_project_language(repo_path) + repo_config = RepoConfig.from_repo_path(repo_path=git_root) - repo_config.language = programming_language + repo_config.language = detected_language repo_config.subdirectories = subdirectories # Create main project return cls( repo_operator=RepoOperator(repo_config=repo_config), - programming_language=programming_language, + programming_language=detected_language, base_path=base_path, subdirectories=subdirectories, ) @classmethod def from_repo_operator(cls, repo_operator: RepoOperator, programming_language: ProgrammingLanguage | None = None, base_path: str | None = None) -> Self: + # Only determine project language if not explicitly provided + detected_language = programming_language or determine_project_language(repo_operator.repo_path) + return cls( repo_operator=repo_operator, - programming_language=programming_language or determine_project_language(repo_operator.repo_path), + programming_language=detected_language, base_path=base_path, subdirectories=[base_path] if base_path else None, ) diff --git a/src/codegen/sdk/core/codebase.py b/src/codegen/sdk/core/codebase.py index fc3e0557e..07701acf8 100644 --- a/src/codegen/sdk/core/codebase.py +++ b/src/codegen/sdk/core/codebase.py @@ -200,7 +200,7 @@ def __init__( if repo_path is not None: main_project = ProjectConfig.from_path( repo_path, - programming_language=ProgrammingLanguage(language.upper()) if language else None, + programming_language=ProgrammingLanguage(language.upper()) if isinstance(language, str) and language else language, ) projects = [main_project] else: @@ -1392,7 +1392,7 @@ def from_repo( logger.info("Initializing Codebase...") project = ProjectConfig.from_repo_operator( repo_operator=repo_operator, - programming_language=ProgrammingLanguage(language.upper()) if language else None, + programming_language=ProgrammingLanguage(language.upper()) if isinstance(language, str) and language else language, ) codebase = Codebase(projects=[project], config=config, secrets=secrets) logger.info("Codebase initialization complete") diff --git a/tests/unit/codegen/sdk/codebase/test_language_detection.py b/tests/unit/codegen/sdk/codebase/test_language_detection.py new file mode 100644 index 000000000..d381ce1d3 --- /dev/null +++ b/tests/unit/codegen/sdk/codebase/test_language_detection.py @@ -0,0 +1,83 @@ +import os +import tempfile +from pathlib import Path + +import pytest + +from codegen.sdk.codebase.config import ProjectConfig +from codegen.shared.enums.programming_language import ProgrammingLanguage + + +def test_explicit_language_respected(): + """Test that explicitly provided language is respected and not overridden by detection.""" + with tempfile.TemporaryDirectory() as tmp_dir: + # Create a temporary directory with more TypeScript files than Python files + ts_dir = Path(tmp_dir) / "ts" + py_dir = Path(tmp_dir) / "py" + ts_dir.mkdir() + py_dir.mkdir() + + # Create TypeScript files + for i in range(5): + with open(ts_dir / f"file{i}.ts", "w") as f: + f.write(f"// TypeScript file {i}") + + # Create fewer Python files + for i in range(2): + with open(py_dir / f"file{i}.py", "w") as f: + f.write(f"# Python file {i}") + + # Initialize git repo + os.system(f"cd {tmp_dir} && git init && git config user.email 'test@example.com' && git config user.name 'Test User' && git add . && git commit -m 'Initial commit'") + + # Test with explicit Python language + project_config = ProjectConfig.from_path( + path=str(tmp_dir), + programming_language=ProgrammingLanguage.PYTHON + ) + + # Verify that the language is Python, not TypeScript (which would be detected based on file count) + assert project_config.programming_language == ProgrammingLanguage.PYTHON + + # Test with explicit TypeScript language + project_config = ProjectConfig.from_path( + path=str(py_dir), # Use Python directory + programming_language=ProgrammingLanguage.TYPESCRIPT + ) + + # Verify that the language is TypeScript, not Python (which would be detected based on file count) + assert project_config.programming_language == ProgrammingLanguage.TYPESCRIPT + + +def test_subfolder_language_detection(): + """Test that language detection respects the specified subfolder.""" + with tempfile.TemporaryDirectory() as tmp_dir: + # Create a temporary directory with TypeScript files in root and Python files in subfolder + ts_dir = Path(tmp_dir) + py_dir = Path(tmp_dir) / "python_only" + py_dir.mkdir() + + # Create TypeScript files in root + for i in range(5): + with open(ts_dir / f"file{i}.ts", "w") as f: + f.write(f"// TypeScript file {i}") + + # Create Python files in subfolder + for i in range(3): + with open(py_dir / f"file{i}.py", "w") as f: + f.write(f"# Python file {i}") + + # Initialize git repo + os.system(f"cd {tmp_dir} && git init && git config user.email 'test@example.com' && git config user.name 'Test User' && git add . && git commit -m 'Initial commit'") + + # Test with root path - should detect TypeScript + project_config = ProjectConfig.from_path( + path=str(tmp_dir) + ) + assert project_config.programming_language == ProgrammingLanguage.TYPESCRIPT + + # Test with Python subfolder path - should detect Python + project_config = ProjectConfig.from_path( + path=str(py_dir) + ) + assert project_config.programming_language == ProgrammingLanguage.PYTHON From 4019b8f82edc5775fbc225e985e6c3683d7fe0cc Mon Sep 17 00:00:00 2001 From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com> Date: Fri, 25 Apr 2025 18:49:02 +0000 Subject: [PATCH 2/2] Automated pre-commit update --- src/codegen/git/utils/language.py | 2 +- src/codegen/sdk/codebase/config.py | 6 +-- .../sdk/codebase/test_language_detection.py | 41 ++++++++----------- 3 files changed, 20 insertions(+), 29 deletions(-) diff --git a/src/codegen/git/utils/language.py b/src/codegen/git/utils/language.py index 4feef9448..8c761e52b 100644 --- a/src/codegen/git/utils/language.py +++ b/src/codegen/git/utils/language.py @@ -134,7 +134,7 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua # Use the specified subfolder path for language detection if provided subdirs = [base_path] if base_path else None - + # Walk through the directory for rel_path, _ in repo_operator.iter_files(subdirs=subdirs, ignore_list=GLOBAL_FILE_IGNORE_LIST): # Convert to Path object diff --git a/src/codegen/sdk/codebase/config.py b/src/codegen/sdk/codebase/config.py index af3d4d6a6..0b9900ee2 100644 --- a/src/codegen/sdk/codebase/config.py +++ b/src/codegen/sdk/codebase/config.py @@ -46,10 +46,10 @@ def from_path(cls, path: str, programming_language: ProgrammingLanguage | None = repo_path = os.path.abspath(path) git_root, base_path = split_git_path(repo_path) subdirectories = [base_path] if base_path else None - + # Only determine project language if not explicitly provided detected_language = programming_language or determine_project_language(repo_path) - + repo_config = RepoConfig.from_repo_path(repo_path=git_root) repo_config.language = detected_language repo_config.subdirectories = subdirectories @@ -65,7 +65,7 @@ def from_path(cls, path: str, programming_language: ProgrammingLanguage | None = def from_repo_operator(cls, repo_operator: RepoOperator, programming_language: ProgrammingLanguage | None = None, base_path: str | None = None) -> Self: # Only determine project language if not explicitly provided detected_language = programming_language or determine_project_language(repo_operator.repo_path) - + return cls( repo_operator=repo_operator, programming_language=detected_language, diff --git a/tests/unit/codegen/sdk/codebase/test_language_detection.py b/tests/unit/codegen/sdk/codebase/test_language_detection.py index d381ce1d3..ae112f4c8 100644 --- a/tests/unit/codegen/sdk/codebase/test_language_detection.py +++ b/tests/unit/codegen/sdk/codebase/test_language_detection.py @@ -2,8 +2,6 @@ import tempfile from pathlib import Path -import pytest - from codegen.sdk.codebase.config import ProjectConfig from codegen.shared.enums.programming_language import ProgrammingLanguage @@ -16,35 +14,32 @@ def test_explicit_language_respected(): py_dir = Path(tmp_dir) / "py" ts_dir.mkdir() py_dir.mkdir() - + # Create TypeScript files for i in range(5): with open(ts_dir / f"file{i}.ts", "w") as f: f.write(f"// TypeScript file {i}") - + # Create fewer Python files for i in range(2): with open(py_dir / f"file{i}.py", "w") as f: f.write(f"# Python file {i}") - + # Initialize git repo os.system(f"cd {tmp_dir} && git init && git config user.email 'test@example.com' && git config user.name 'Test User' && git add . && git commit -m 'Initial commit'") - + # Test with explicit Python language - project_config = ProjectConfig.from_path( - path=str(tmp_dir), - programming_language=ProgrammingLanguage.PYTHON - ) - + project_config = ProjectConfig.from_path(path=str(tmp_dir), programming_language=ProgrammingLanguage.PYTHON) + # Verify that the language is Python, not TypeScript (which would be detected based on file count) assert project_config.programming_language == ProgrammingLanguage.PYTHON - + # Test with explicit TypeScript language project_config = ProjectConfig.from_path( path=str(py_dir), # Use Python directory - programming_language=ProgrammingLanguage.TYPESCRIPT + programming_language=ProgrammingLanguage.TYPESCRIPT, ) - + # Verify that the language is TypeScript, not Python (which would be detected based on file count) assert project_config.programming_language == ProgrammingLanguage.TYPESCRIPT @@ -56,28 +51,24 @@ def test_subfolder_language_detection(): ts_dir = Path(tmp_dir) py_dir = Path(tmp_dir) / "python_only" py_dir.mkdir() - + # Create TypeScript files in root for i in range(5): with open(ts_dir / f"file{i}.ts", "w") as f: f.write(f"// TypeScript file {i}") - + # Create Python files in subfolder for i in range(3): with open(py_dir / f"file{i}.py", "w") as f: f.write(f"# Python file {i}") - + # Initialize git repo os.system(f"cd {tmp_dir} && git init && git config user.email 'test@example.com' && git config user.name 'Test User' && git add . && git commit -m 'Initial commit'") - + # Test with root path - should detect TypeScript - project_config = ProjectConfig.from_path( - path=str(tmp_dir) - ) + project_config = ProjectConfig.from_path(path=str(tmp_dir)) assert project_config.programming_language == ProgrammingLanguage.TYPESCRIPT - + # Test with Python subfolder path - should detect Python - project_config = ProjectConfig.from_path( - path=str(py_dir) - ) + project_config = ProjectConfig.from_path(path=str(py_dir)) assert project_config.programming_language == ProgrammingLanguage.PYTHON