From 795842780fbcbff8e0e9acb33f29d10128e7ff1a Mon Sep 17 00:00:00 2001 From: jpotw <105954991+jpotw@users.noreply.github.com> Date: Thu, 3 Jul 2025 16:26:15 +0900 Subject: [PATCH 1/3] feat: add optional --include-submodules flag to CLI and ingestion - Adds --include-submodules CLI flag to control submodule analysis - Propagates include_submodules through ingestion, schemas, and clone logic - Updates tests to cover submodule inclusion - Adds a helper function (_checkout_partial_clone) to avoid repetition - Web UI for this option is not implemented for now (https://github.com/cyclotruc/gitingest/pull/313#issuecomment-3019912523) --- src/gitingest/cli.py | 12 ++++++ src/gitingest/clone.py | 39 ++++++++++++++++---- src/gitingest/entrypoint.py | 9 +++++ src/gitingest/schemas/ingestion.py | 9 ++++- tests/query_parser/test_git_host_agnostic.py | 1 + tests/test_cli.py | 10 +++++ tests/test_clone.py | 22 +++++++++++ 7 files changed, 93 insertions(+), 9 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 64ef463c..422be57f 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -20,6 +20,7 @@ class _CLIArgs(TypedDict): include_pattern: tuple[str, ...] branch: str | None include_gitignored: bool + include_submodules: bool token: str | None output: str | None @@ -47,6 +48,12 @@ class _CLIArgs(TypedDict): default=False, help="Include files matched by .gitignore and .gitingestignore", ) +@click.option( + "--include-submodules", + is_flag=True, + help="Include repository's submodules in the analysis", + default=False, +) @click.option( "--token", "-t", @@ -106,6 +113,7 @@ async def _async_main( include_pattern: tuple[str, ...] | None = None, branch: str | None = None, include_gitignored: bool = False, + include_submodules: bool = False, token: str | None = None, output: str | None = None, ) -> None: @@ -129,6 +137,9 @@ async def _async_main( Git branch to ingest. If ``None``, the repository's default branch is used. 
include_gitignored : bool If ``True``, also ingest files matched by ``.gitignore`` or ``.gitingestignore`` (default: ``False``). + include_submodules : bool + If ``True``, recursively include and analyze all Git submodules within the repository. + Set to ``False`` to ignore submodules during analysis (default is ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. @@ -160,6 +171,7 @@ async def _async_main( include_patterns=include_patterns, exclude_patterns=exclude_patterns, branch=branch, + include_submodules=include_submodules, output=output_target, include_gitignored=include_gitignored, token=token, diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index a65c3046..d46002ef 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -49,6 +49,7 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None: branch: str | None = config.branch tag: str | None = config.tag partial_clone: bool = config.subpath != "/" + include_submodules: bool = config.include_submodules # Create parent directory if it doesn't exist await ensure_directory(Path(local_path).parent) @@ -63,7 +64,8 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None: clone_cmd += ["-c", create_git_auth_header(token, url=url)] clone_cmd += ["clone", "--single-branch"] - # TODO: Re-enable --recurse-submodules when submodule support is needed + if include_submodules: + clone_cmd += ["--recurse-submodules"] if partial_clone: clone_cmd += ["--filter=blob:none", "--sparse"] @@ -86,15 +88,36 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None: # Checkout the subpath if it is a partial clone if partial_clone: - subpath = config.subpath.lstrip("/") - if config.blob: - # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name. 
- subpath = str(Path(subpath).parent.as_posix()) - - checkout_cmd = create_git_command(["git"], local_path, url, token) - await run_command(*checkout_cmd, "sparse-checkout", "set", subpath) + await _checkout_partial_clone(config, token) # Checkout the commit if it is provided if commit: checkout_cmd = create_git_command(["git"], local_path, url, token) await run_command(*checkout_cmd, "checkout", commit) + + +def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None: + """Handle sparse-checkout for partial clones. + + This helper function sets the sparse-checkout configuration for a partial clone, + optionally adjusting the subpath if ingesting from a file URL. + + Parameters + ---------- + config : CloneConfig + The configuration for cloning the repository, including subpath and blob flag. + token : str | None + GitHub personal access token (PAT) for accessing private repositories. + Can also be set via the ``GITHUB_TOKEN`` environment variable. + + Returns + ------- + None + + """ + subpath = config.subpath.lstrip("/") + if config.blob: + # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name. + subpath = str(Path(subpath).parent.as_posix()) + checkout_cmd = create_git_command(["git"], config.local_path, config.url, token) + return run_command(*checkout_cmd, "sparse-checkout", "set", subpath) diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 9c04d65b..97c8f2e6 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -27,6 +27,7 @@ async def ingest_async( branch: str | None = None, tag: str | None = None, include_gitignored: bool = False, + include_submodules: bool = False, token: str | None = None, output: str | None = None, ) -> tuple[str, str, str]: @@ -52,6 +53,8 @@ async def ingest_async( The tag to clone and ingest. If ``None``, no tag is used. 
include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). + include_submodules : bool + If ``True``, recursively include and analyze all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. @@ -86,6 +89,8 @@ async def ingest_async( if query.url: _override_branch_and_tag(query, branch=branch, tag=tag) + query.include_submodules = include_submodules + async with _clone_repo_if_remote(query, token=token): summary, tree, content = ingest_query(query) await _write_output(tree, content=content, target=output) @@ -101,6 +106,7 @@ def ingest( branch: str | None = None, tag: str | None = None, include_gitignored: bool = False, + include_submodules: bool = False, token: str | None = None, output: str | None = None, ) -> tuple[str, str, str]: @@ -126,6 +132,8 @@ def ingest( The tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). + include_submodules : bool + If ``True``, recursively include and analyze all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. 
@@ -156,6 +164,7 @@ def ingest( branch=branch, tag=tag, include_gitignored=include_gitignored, + include_submodules=include_submodules, token=token, output=output, ), diff --git a/src/gitingest/schemas/ingestion.py b/src/gitingest/schemas/ingestion.py index 3e1c5e81..93e2f3ed 100644 --- a/src/gitingest/schemas/ingestion.py +++ b/src/gitingest/schemas/ingestion.py @@ -11,7 +11,7 @@ @dataclass -class CloneConfig: +class CloneConfig: # pylint: disable=too-many-instance-attributes, too-many-arguments """Configuration for cloning a Git repository. This class holds the necessary parameters for cloning a repository to a local path, including @@ -33,6 +33,8 @@ class CloneConfig: The subpath to clone from the repository (default: ``"/"``). blob: bool Whether the repository is a blob (default: ``False``). + include_submodules: bool + Whether to clone submodules (default: ``False``). """ @@ -43,6 +45,7 @@ class CloneConfig: tag: str | None = None subpath: str = "/" blob: bool = False + include_submodules: bool = False class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes @@ -78,6 +81,8 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes The patterns to ignore (default: ``set()``). include_patterns : set[str] | None The patterns to include. + include_submodules : bool + The flag whether to include Git submodules in the analysis. (default: ``False``) """ @@ -95,6 +100,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes max_file_size: int = Field(default=MAX_FILE_SIZE) ignore_patterns: set[str] = set() # TODO: ignore_patterns and include_patterns have the same type include_patterns: set[str] | None = None + include_submodules: bool = False def extract_clone_config(self) -> CloneConfig: """Extract the relevant fields for the CloneConfig object. 
@@ -122,6 +128,7 @@ def extract_clone_config(self) -> CloneConfig: tag=self.tag, subpath=self.subpath, blob=self.type == "blob", + include_submodules=self.include_submodules, ) def ensure_url(self) -> None: diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py index 710330d7..d3d2542a 100644 --- a/tests/query_parser/test_git_host_agnostic.py +++ b/tests/query_parser/test_git_host_agnostic.py @@ -68,6 +68,7 @@ async def test_parse_query_without_host( "commit": None, "max_file_size": 50, "include_patterns": None, + "include_submodules": False, } assert actual == expected diff --git a/tests/test_cli.py b/tests/test_cli.py index f001b84a..010703ef 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -31,6 +31,16 @@ True, id="custom-options", ), + pytest.param( + [ + "./", + "--output", + str(OUTPUT_FILE_NAME), + "--include-submodules", + ], + True, + id="with-include-submodules", + ), ], ) def test_cli_writes_file( diff --git a/tests/test_clone.py b/tests/test_clone.py index fed456b9..0e0f7b98 100644 --- a/tests/test_clone.py +++ b/tests/test_clone.py @@ -414,3 +414,25 @@ async def test_clone_with_commit_and_subpath(run_command_mock: AsyncMock) -> Non ) assert run_command_mock.call_count == expected_call_count + + +@pytest.mark.asyncio +async def test_clone_with_include_submodules(run_command_mock: AsyncMock) -> None: + """Test cloning a repository with submodules included. + + Given a valid URL and include_submodules=True: + When `clone_repo` is called, + Then the repository should be cloned with --recurse-submodules in the git command. 
+ """ + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, branch="main", include_submodules=True) + + await clone_repo(clone_config) + + # Check that --recurse-submodules is in the clone command + found = False + for call in run_command_mock.call_args_list: + args = call[0] + if "clone" in args and "--recurse-submodules" in args: + found = True + break + assert found, "--recurse-submodules not found in git clone command when include_submodules=True" From 8575f475f0bde1e4b320a4475510d1e5aaac4f8f Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Thu, 3 Jul 2025 18:58:56 +0200 Subject: [PATCH 2/3] docs & tests maintenance: polish submodule flag docs, async helper, tidy tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * **CLI** * Add example using `--include-submodules` in help text * Shorten `include_submodules` param description in `_async_main` * Re-order args in `ingest_async` call for readability * **Clone** * Convert `_checkout_partial_clone` to **async** and update docstring accordingly * **Schemas / Entrypoint** * Tighten wording in docstrings for `include_submodules` * **Tests** * Drop redundant `test_cli` case for options mix * Rename `test_clone_commit_without_branch` → `test_clone_commit` * Strengthen `test_clone_with_include_submodules` assertions --- src/gitingest/cli.py | 12 +++++++----- src/gitingest/clone.py | 20 ++++++------------- src/gitingest/entrypoint.py | 4 ++-- src/gitingest/schemas/ingestion.py | 4 ++-- tests/test_cli.py | 11 +---------- tests/test_clone.py | 31 ++++++++++++++++-------------- 6 files changed, 35 insertions(+), 47 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 422be57f..e14ed681 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -101,6 +101,9 @@ def main(**cli_kwargs: Unpack[_CLIArgs]) -> None: $ gitingest https://github.com/user/private-repo -t ghp_token $ 
GITHUB_TOKEN=ghp_token gitingest https://github.com/user/private-repo + Include submodules: + $ gitingest https://github.com/user/repo --include-submodules + """ asyncio.run(_async_main(**cli_kwargs)) @@ -138,8 +141,7 @@ async def _async_main( include_gitignored : bool If ``True``, also ingest files matched by ``.gitignore`` or ``.gitingestignore`` (default: ``False``). include_submodules : bool - If ``True``, recursively include and analyze all Git submodules within the repository. - Set to ``False`` to ignore submodules during analysis (default is ``False``). + If ``True``, recursively include all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. @@ -166,15 +168,15 @@ async def _async_main( click.echo(f"Analyzing source, output will be written to '{output_target}'...", err=True) summary, _, _ = await ingest_async( - source=source, + source, max_file_size=max_size, include_patterns=include_patterns, exclude_patterns=exclude_patterns, branch=branch, - include_submodules=include_submodules, - output=output_target, include_gitignored=include_gitignored, + include_submodules=include_submodules, token=token, + output=output_target, ) except Exception as exc: # Convert any exception into Click.Abort so that exit status is non-zero diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index d46002ef..1f091486 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -49,7 +49,6 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None: branch: str | None = config.branch tag: str | None = config.tag partial_clone: bool = config.subpath != "/" - include_submodules: bool = config.include_submodules # Create parent directory if it doesn't exist await ensure_directory(Path(local_path).parent) @@ -64,7 +63,8 @@ async def clone_repo(config: CloneConfig, *, token: str | None = 
None) -> None: clone_cmd += ["-c", create_git_auth_header(token, url=url)] clone_cmd += ["clone", "--single-branch"] - if include_submodules: + + if config.include_submodules: clone_cmd += ["--recurse-submodules"] if partial_clone: @@ -96,11 +96,8 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None: await run_command(*checkout_cmd, "checkout", commit) -def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None: - """Handle sparse-checkout for partial clones. - - This helper function sets the sparse-checkout configuration for a partial clone, - optionally adjusting the subpath if ingesting from a file URL. +async def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None: + """Configure sparse-checkout for a partially cloned repository. Parameters ---------- @@ -108,16 +105,11 @@ def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None: The configuration for cloning the repository, including subpath and blob flag. token : str | None GitHub personal access token (PAT) for accessing private repositories. - Can also be set via the ``GITHUB_TOKEN`` environment variable. - - Returns - ------- - None """ subpath = config.subpath.lstrip("/") if config.blob: - # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name. + # Remove the file name from the subpath when ingesting from a file url (e.g. 
blob/branch/path/file.txt) subpath = str(Path(subpath).parent.as_posix()) checkout_cmd = create_git_command(["git"], config.local_path, config.url, token) - return run_command(*checkout_cmd, "sparse-checkout", "set", subpath) + await run_command(*checkout_cmd, "sparse-checkout", "set", subpath) diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 97c8f2e6..f64dec08 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -54,7 +54,7 @@ async def ingest_async( include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). include_submodules : bool - If ``True``, recursively include and analyze all Git submodules within the repository (default: ``False``). + If ``True``, recursively include all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. @@ -133,7 +133,7 @@ def ingest( include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). include_submodules : bool - If ``True``, recursively include and analyze all Git submodules within the repository (default: ``False``). + If ``True``, recursively include all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. diff --git a/src/gitingest/schemas/ingestion.py b/src/gitingest/schemas/ingestion.py index 93e2f3ed..c40e11d6 100644 --- a/src/gitingest/schemas/ingestion.py +++ b/src/gitingest/schemas/ingestion.py @@ -11,7 +11,7 @@ @dataclass -class CloneConfig: # pylint: disable=too-many-instance-attributes, too-many-arguments +class CloneConfig: # pylint: disable=too-many-instance-attributes """Configuration for cloning a Git repository. 
This class holds the necessary parameters for cloning a repository to a local path, including @@ -82,7 +82,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes include_patterns : set[str] | None The patterns to include. include_submodules : bool - The flag whether to include Git submodules in the analysis. (default: ``False``) + Whether to include all Git submodules within the repository (default: ``False``). """ diff --git a/tests/test_cli.py b/tests/test_cli.py index 010703ef..f9bbde85 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -27,19 +27,10 @@ "tests/", "--include-pattern", "src/", - ], - True, - id="custom-options", - ), - pytest.param( - [ - "./", - "--output", - str(OUTPUT_FILE_NAME), "--include-submodules", ], True, - id="with-include-submodules", + id="custom-options", ), ], ) diff --git a/tests/test_clone.py b/tests/test_clone.py index 0e0f7b98..9ffaa376 100644 --- a/tests/test_clone.py +++ b/tests/test_clone.py @@ -181,10 +181,10 @@ async def test_clone_default_shallow_clone(run_command_mock: AsyncMock) -> None: @pytest.mark.asyncio -async def test_clone_commit_without_branch(run_command_mock: AsyncMock) -> None: - """Test cloning when a commit hash is provided but no branch is specified. +async def test_clone_commit(run_command_mock: AsyncMock) -> None: + """Test cloning when a commit hash is provided. - Given a valid URL and a commit hash (but no branch): + Given a valid URL and a commit hash: When ``clone_repo`` is called, Then the repository should be cloned and checked out at that commit. """ @@ -420,19 +420,22 @@ async def test_clone_with_commit_and_subpath(run_command_mock: AsyncMock) -> Non async def test_clone_with_include_submodules(run_command_mock: AsyncMock) -> None: """Test cloning a repository with submodules included. - Given a valid URL and include_submodules=True: - When `clone_repo` is called, - Then the repository should be cloned with --recurse-submodules in the git command.
+ Given a valid URL and ``include_submodules=True``: + When ``clone_repo`` is called, + Then the repository should be cloned with ``--recurse-submodules`` in the git command. """ + expected_call_count = 1 # No commit and no partial clone clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, branch="main", include_submodules=True) await clone_repo(clone_config) - # Check that --recurse-submodules is in the clone command - found = False - for call in run_command_mock.call_args_list: - args = call[0] - if "clone" in args and "--recurse-submodules" in args: - found = True - break - assert found, "--recurse-submodules not found in git clone command when include_submodules=True" + assert run_command_mock.call_count == expected_call_count + run_command_mock.assert_called_once_with( + "git", + "clone", + "--single-branch", + "--recurse-submodules", + "--depth=1", + clone_config.url, + clone_config.local_path, + ) From 1bd543bca525f5d6cec4f1dd1ebb37d52c7837f4 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Thu, 3 Jul 2025 19:09:59 +0200 Subject: [PATCH 3/3] docs: add include_submodules example in README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 84b80e31..aab92204 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,9 @@ gitingest https://github.com/username/private-repo --token github_pat_... # Or set it as an environment variable export GITHUB_TOKEN=github_pat_... gitingest https://github.com/username/private-repo + +# Include repository submodules +gitingest https://github.com/username/repo-with-submodules --include-submodules ``` By default, files listed in `.gitignore` are skipped. Use `--include-gitignored` if you @@ -163,6 +166,9 @@ summary, tree, content = ingest("https://github.com/username/private-repo", toke import os os.environ["GITHUB_TOKEN"] = "github_pat_..." 
summary, tree, content = ingest("https://github.com/username/private-repo") + +# Include repository submodules +summary, tree, content = ingest("https://github.com/username/repo-with-submodules", include_submodules=True) ``` By default, this won't write a file but can be enabled with the `output` argument.