Skip to content

Commit 3edb86f

Browse files
resolve commit
1 parent e023af3 commit 3edb86f

File tree

5 files changed

+170
-47
lines changed

5 files changed

+170
-47
lines changed

src/gitingest/clone.py

Lines changed: 17 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
from gitingest.config import DEFAULT_TIMEOUT
99
from gitingest.utils.git_utils import (
1010
check_repo_exists,
11+
checkout_partial_clone,
1112
create_git_auth_header,
1213
create_git_command,
1314
ensure_git_installed,
1415
is_github_host,
16+
resolve_commit,
1517
run_command,
1618
)
17-
from gitingest.utils.os_utils import ensure_directory
19+
from gitingest.utils.os_utils import ensure_directory_exists_or_create
1820
from gitingest.utils.timeout_wrapper import async_timeout
1921

2022
if TYPE_CHECKING:
@@ -45,71 +47,42 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
4547
# Extract and validate query parameters
4648
url: str = config.url
4749
local_path: str = config.local_path
48-
commit: str | None = config.commit
49-
branch: str | None = config.branch
50-
tag: str | None = config.tag
5150
partial_clone: bool = config.subpath != "/"
5251

53-
# Create parent directory if it doesn't exist
54-
await ensure_directory(Path(local_path).parent)
52+
await ensure_git_installed()
53+
await ensure_directory_exists_or_create(Path(local_path).parent)
5554

56-
# Check if the repository exists
5755
if not await check_repo_exists(url, token=token):
5856
msg = "Repository not found. Make sure it is public or that you have provided a valid token."
5957
raise ValueError(msg)
6058

59+
commit = await resolve_commit(config, url=url, token=token)
60+
6161
clone_cmd = ["git"]
6262
if token and is_github_host(url):
6363
clone_cmd += ["-c", create_git_auth_header(token, url=url)]
6464

65-
clone_cmd += ["clone", "--single-branch"]
66-
67-
if config.include_submodules:
68-
clone_cmd += ["--recurse-submodules"]
69-
65+
clone_cmd += ["clone", "--single-branch", "--no-checkout", "--depth=1"]
7066
if partial_clone:
7167
clone_cmd += ["--filter=blob:none", "--sparse"]
7268

73-
# Shallow clone unless a specific commit is requested
74-
if not commit:
75-
clone_cmd += ["--depth=1"]
76-
77-
# Prefer tag over branch when both are provided
78-
if tag:
79-
clone_cmd += ["--branch", tag]
80-
elif branch and branch.lower() not in ("main", "master"):
81-
clone_cmd += ["--branch", branch]
82-
8369
clone_cmd += [url, local_path]
8470

8571
# Clone the repository
86-
await ensure_git_installed()
8772
await run_command(*clone_cmd)
8873

8974
# Checkout the subpath if it is a partial clone
9075
if partial_clone:
91-
await _checkout_partial_clone(config, token)
76+
await checkout_partial_clone(config, token)
9277

93-
# Checkout the commit if it is provided
94-
if commit:
95-
checkout_cmd = create_git_command(["git"], local_path, url, token)
96-
await run_command(*checkout_cmd, "checkout", commit)
78+
git = create_git_command(["git"], local_path, url, token)
9779

80+
# Ensure the commit is locally available
81+
await run_command(*git, "fetch", "--depth=1", "origin", commit)
9882

99-
async def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None:
100-
"""Configure sparse-checkout for a partially cloned repository.
83+
# Write the work-tree at that commit
84+
await run_command(*git, "checkout", commit)
10185

102-
Parameters
103-
----------
104-
config : CloneConfig
105-
The configuration for cloning the repository, including subpath and blob flag.
106-
token : str | None
107-
GitHub personal access token (PAT) for accessing private repositories.
108-
109-
"""
110-
subpath = config.subpath.lstrip("/")
111-
if config.blob:
112-
# Remove the file name from the subpath when ingesting from a file url (e.g. blob/branch/path/file.txt)
113-
subpath = str(Path(subpath).parent.as_posix())
114-
checkout_cmd = create_git_command(["git"], config.local_path, config.url, token)
115-
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
86+
# Update submodules
87+
if config.include_submodules:
88+
await run_command(*git, "submodule", "update", "--init", "--recursive", "--depth=1")

src/gitingest/query_parser.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ async def parse_query(
5555
A dataclass object containing the parsed details of the repository or file path.
5656
5757
"""
58+
if source.endswith(".git"):
59+
source = source[:-4]
60+
5861
# Determine the parsing method based on the source type
5962
if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
6063
# We either have a full URL or a domain-less slug

src/gitingest/utils/git_utils.py

Lines changed: 148 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
import asyncio
66
import base64
77
import re
8-
from typing import Final
8+
from pathlib import Path
9+
from typing import TYPE_CHECKING, Final, Iterable, Literal
910
from urllib.parse import urlparse
1011

1112
import httpx
@@ -14,6 +15,9 @@
1415
from gitingest.utils.compat_func import removesuffix
1516
from gitingest.utils.exceptions import InvalidGitHubTokenError
1617

18+
if TYPE_CHECKING:
19+
from gitingest.schemas import CloneConfig
20+
1721
# GitHub Personal-Access tokens (classic + fine-grained).
1822
# - ghp_ / gho_ / ghu_ / ghs_ / ghr_ → 36 alphanumerics
1923
# - github_pat_ → 22 alphanumerics + "_" + 59 alphanumerics
@@ -303,3 +307,146 @@ def validate_github_token(token: str) -> None:
303307
"""
304308
if not re.fullmatch(_GITHUB_PAT_PATTERN, token):
305309
raise InvalidGitHubTokenError
310+
311+
312+
async def checkout_partial_clone(config: CloneConfig, token: str | None) -> None:
    """Restrict the work-tree of a partial clone to the requested subpath.

    Runs ``git sparse-checkout set <subpath>`` through the command built by ``create_git_command``
    for ``config.local_path`` / ``config.url``.

    Parameters
    ----------
    config : CloneConfig
        The configuration for cloning the repository; ``subpath``, ``blob``, ``local_path`` and ``url`` are read.
    token : str | None
        GitHub personal access token (PAT) for accessing private repositories.

    """
    target = config.subpath.lstrip("/")

    # A file URL (e.g. blob/branch/path/file.txt) names a file, so sparse-checkout its parent directory instead
    if config.blob:
        target = str(Path(target).parent.as_posix())

    git_cmd = create_git_command(["git"], config.local_path, config.url, token)
    sparse_args = ("sparse-checkout", "set", target)
    await run_command(*git_cmd, *sparse_args)
329+
330+
331+
async def resolve_commit(config: CloneConfig, url: str, token: str | None) -> str:
    """Pick the commit that the clone should be checked out at.

    Precedence: an explicit ``config.commit`` is returned as-is (no remote lookup); otherwise
    ``config.tag``, then ``config.branch``, and finally the remote ``HEAD`` is resolved via
    ``_resolve_ref_to_sha``.

    Parameters
    ----------
    config : CloneConfig
        The configuration for cloning the repository; ``commit``, ``tag`` and ``branch`` are read.
    url : str
        The URL of the remote repository.
    token : str | None
        GitHub personal access token (PAT) for accessing private repositories.

    Returns
    -------
    str
        ``config.commit`` when it is set, otherwise the SHA resolved from the tag, branch or ``HEAD``.

    """
    # An explicit commit wins over every symbolic ref
    if config.commit:
        return config.commit

    # Tags take precedence over branches
    if config.tag:
        return await _resolve_ref_to_sha(url, ref=config.tag, kind="tag", token=token)

    # No branch given -> fall back to the remote's HEAD
    branch_or_head = config.branch or "HEAD"
    return await _resolve_ref_to_sha(url, ref=branch_or_head, kind="branch", token=token)
357+
358+
359+
async def _resolve_ref_to_sha(
    url: str,
    ref: str,
    kind: Literal["branch", "tag"],
    *,
    token: str | None = None,
) -> str:
    """Return the commit SHA that <kind>/<ref> points to in <url>.

    The remote is queried with ``git ls-remote``. Only output lines whose ref name is exactly the
    requested ref (or its peeled ``^{}`` form) are considered, so a tag lookup for ``v1`` can never
    return the commit of ``v1.0`` even though both match the ``refs/tags/v1*`` glob.

    * Branch / ``HEAD`` → the SHA of the matching line.
    * Tag → if annotated, prefer the peeled ``^{}`` line (commit).

    Parameters
    ----------
    url : str
        The URL of the remote repository.
    ref : str
        The reference to resolve to a commit SHA (``"HEAD"`` is queried as-is, regardless of ``kind``).
    kind : Literal["branch", "tag"]
        The kind of reference to resolve to a commit SHA.
    token : str | None
        GitHub personal access token (PAT) for accessing private repositories.

    Returns
    -------
    str
        The commit SHA.

    Raises
    ------
    ValueError
        If the ref does not exist in the remote repository.

    """
    await ensure_git_installed()

    # Build: git [-c http.<host>/.extraheader=Auth...] ls-remote <url> <pattern>
    cmd: list[str] = ["git"]
    if token and is_github_host(url):
        cmd += ["-c", create_git_auth_header(token, url=url)]

    if ref == "HEAD":
        full_ref = "HEAD"
        pattern = full_ref
    elif kind == "branch":
        full_ref = f"refs/heads/{ref}"
        pattern = full_ref
    else:  # tag
        full_ref = f"refs/tags/{ref}"
        # Glob kept from the original query; presumably needed so the peeled "<tag>^{}" entry is listed too
        pattern = f"{full_ref}*"

    cmd += ["ls-remote", url, pattern]
    stdout, _ = await run_command(*cmd)

    # The pattern can match more refs than the one asked for (e.g. "v1*" also lists "v1.0"),
    # so keep only the exact ref and its peeled form before picking a SHA.
    wanted = {full_ref, f"{full_ref}^{{}}"}
    lines: list[str] = []
    for line in stdout.decode().splitlines():
        fields = line.split(maxsplit=1)
        if len(fields) == 2 and fields[1].strip() in wanted:
            lines.append(line)

    sha = _pick_commit_sha(lines)
    if not sha:
        msg = f"{kind} {ref!r} not found in {url}"
        raise ValueError(msg)

    return sha
418+
419+
420+
def _pick_commit_sha(lines: Iterable[str]) -> str | None:
    """Return a commit SHA from ``git ls-remote`` output.

    • Annotated tag            → prefer the peeled line (<sha> refs/tags/x^{})
    • Branch / lightweight tag → first non-peeled line

    Blank lines and lines that do not consist of ``<sha> <ref>`` are ignored.

    Parameters
    ----------
    lines : Iterable[str]
        The lines of a ``git ls-remote`` output.

    Returns
    -------
    str | None
        The commit SHA, or ``None`` if no commit SHA is found.

    """
    first_non_peeled: str | None = None

    for ln in lines:
        fields = ln.split(maxsplit=1)
        if len(fields) != 2:
            # Blank line, or a line without a ref name: skip it instead of failing on the unpack below
            continue

        sha, ref = fields

        # split(maxsplit=1) keeps trailing whitespace on the last field, so strip before testing
        if ref.rstrip().endswith("^{}"):  # peeled commit of annotated tag
            return sha  # best match, done

        if first_non_peeled is None:  # remember the first ordinary line
            first_non_peeled = sha

    return first_non_peeled  # branch or lightweight tag (or None)

src/gitingest/utils/os_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from pathlib import Path
44

55

6-
async def ensure_directory(path: Path) -> None:
6+
async def ensure_directory_exists_or_create(path: Path) -> None:
77
"""Ensure the directory exists, creating it if necessary.
88
99
Parameters

tests/test_clone.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ async def test_clone_with_commit(repo_exists_true: AsyncMock, run_command_mock:
3333
When ``clone_repo`` is called,
3434
Then the repository should be cloned and checked out at that commit.
3535
"""
36-
expected_call_count = 2
36+
expected_call_count = 3 # clone + fetch + checkout
3737
clone_config = CloneConfig(
3838
url=DEMO_URL,
3939
local_path=LOCAL_REPO_PATH,

0 commit comments

Comments
 (0)