Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 95 additions & 66 deletions src/gitingest/clone.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,89 @@ class CloneConfig:
local_path: str
commit: str | None = None
branch: str | None = None
pat: str | None = None


async def _check_repo_exists(url: str, pat: str | None = None) -> bool:
    """
    Check if a repository exists at the given URL using an HTTP HEAD request.

    The request is issued with ``curl -I``. For hosts containing ``github.com`` the
    repository URL itself is probed; for any other host the
    ``/api/v1/repos/{username}/{repo}`` endpoint on that host is probed instead.

    Parameters
    ----------
    url : str
        The URL of the repository, expected in the form
        ``<scheme>://<host>/<username>/<repo>``.
    pat : str | None
        Personal Access Token for authentication, optional. When given it is sent
        as an ``Authorization: token <pat>`` header.

    Returns
    -------
    bool
        True if the repository exists, False otherwise. A URL without host,
        username and repository segments, a failing ``curl`` invocation, and a
        404 response are all reported as False.
    """
    # Expected layout after splitting: [scheme, "", host, username, repo, ...]
    parts = url.split("/")
    if len(parts) < 5:
        return False

    host, username, repo = parts[2:5]
    # "https://host/user/" still yields five parts (the last one empty), so the
    # length check alone does not guarantee that every segment is present.
    if not (host and username and repo):
        return False

    # Construct API URL based on host
    if "github.com" in host:
        api_url = url
    else:
        # For custom Git servers, use API v1 endpoint
        api_url = f"https://{host}/api/v1/repos/{username}/{repo}"

    cmd = ["curl", "-I"]
    if pat:
        # NOTE(review): the token travels on curl's command line, where process
        # arguments are commonly readable by other local users; consider feeding
        # the header through a curl config file or stdin instead.
        cmd.extend(["-H", f"Authorization: token {pat}"])
    cmd.append(api_url)

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, _ = await proc.communicate()
    if proc.returncode != 0:
        return False

    # Inspect real status lines only ("HTTP/<version> <code> ..."), so that every
    # protocol version is recognised and header values cannot cause false matches.
    # Undecodable bytes are replaced rather than raising, since only the ASCII
    # status line matters here.
    for line in stdout.decode(errors="replace").splitlines():
        fields = line.split()
        if len(fields) >= 2 and fields[0].startswith("HTTP/") and fields[1] == "404":
            return False
    return True


async def _run_git_command(*args: str) -> tuple[bytes, bytes]:
    """
    Executes a git command asynchronously and captures its output.

    Parameters
    ----------
    *args : str
        The git command and its arguments to execute.

    Returns
    -------
    tuple[bytes, bytes]
        A tuple containing the stdout and stderr of the git command.

    Raises
    ------
    RuntimeError
        If the git command exits with a non-zero status. Credentials embedded in
        URL arguments (``scheme://user:secret@host/...``) are masked in the
        message so that access tokens do not end up in logs or responses.
    """
    proc = await asyncio.create_subprocess_exec(
        *args,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        command = " ".join(args)
        # Replace undecodable bytes instead of raising, so the original failure
        # is not hidden behind a UnicodeDecodeError.
        error_message = stderr.decode(errors="replace").strip()

        # Mask the userinfo part of any URL argument, both in the echoed command
        # and in the captured stderr, which may repeat the URL.
        for arg in args:
            _scheme, sep, rest = arg.partition("://")
            if not sep:
                continue
            userinfo, at, _host = rest.split("/", 1)[0].rpartition("@")
            if not (at and userinfo):
                continue
            password = userinfo.partition(":")[2]
            for secret, mask in ((f"{userinfo}@", "***@"), (password, "***")):
                if secret:
                    command = command.replace(secret, mask)
                    error_message = error_message.replace(secret, mask)

        raise RuntimeError(f"Git command failed: {command}\nError: {error_message}")

    return stdout, stderr


@async_timeout(CLONE_TIMEOUT)
Expand All @@ -45,11 +128,12 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
Parameters
----------
config : CloneConfig
A dictionary containing the following keys:
Configuration object containing:
- url (str): The URL of the repository.
- local_path (str): The local path to clone the repository to.
- commit (Optional[str]): The specific commit hash to checkout.
- branch (Optional[str]): The branch to clone. Defaults to 'main' or 'master' if not provided.
- pat (Optional[str]): Personal Access Token for authentication.

Returns
-------
Expand All @@ -65,11 +149,12 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
AsyncTimeoutError
If the cloning process exceeds the specified timeout.
"""
# Extract and validate query parameters
# Extract and validate parameters
url: str = config.url
local_path: str = config.local_path
commit: str | None = config.commit
branch: str | None = config.branch
pat: str | None = config.pat

if not url:
raise ValueError("The 'url' parameter is required.")
Expand All @@ -78,13 +163,15 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
raise ValueError("The 'local_path' parameter is required.")

# Check if the repository exists
if not await _check_repo_exists(url):
raise ValueError("Repository not found, make sure it is public")
if not await _check_repo_exists(url, pat):
raise ValueError("Repository not found, make sure it is public or provide valid PAT")

try:
if commit:
# Scenario 1: Clone and checkout a specific commit
# Clone the repository without depth to ensure full history for checkout
if pat:
url = url.replace("https://", f"https://oauth2:{pat}@")
clone_cmd = ["git", "clone", "--single-branch", url, local_path]
await _run_git_command(*clone_cmd)

Expand All @@ -93,75 +180,17 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
return await _run_git_command(*checkout_cmd)

if branch and branch.lower() not in ("main", "master"):

# Scenario 2: Clone a specific branch with shallow depth
if pat:
url = url.replace("https://", f"https://oauth2:{pat}@")
clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path]
return await _run_git_command(*clone_cmd)

# Scenario 3: Clone the default branch with shallow depth
if pat:
url = url.replace("https://", f"https://oauth2:{pat}@")
clone_cmd = ["git", "clone", "--depth=1", "--single-branch", url, local_path]
return await _run_git_command(*clone_cmd)

except (RuntimeError, asyncio.TimeoutError, AsyncTimeoutError):
raise # Re-raise the exception


async def _check_repo_exists(url: str) -> bool:
    """
    Check if a repository exists at the given URL using an HTTP HEAD request.

    The request is issued with ``curl -I`` against the URL as given.

    Parameters
    ----------
    url : str
        The URL of the repository.

    Returns
    -------
    bool
        True if the repository exists, False otherwise. A failing ``curl``
        invocation and a 404 response are both reported as False.
    """
    proc = await asyncio.create_subprocess_exec(
        "curl",
        "-I",
        url,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, _ = await proc.communicate()
    if proc.returncode != 0:
        return False

    # Inspect real status lines only ("HTTP/<version> <code> ..."), so that every
    # protocol version is recognised and header values cannot cause false matches.
    # Undecodable bytes are replaced rather than raising, since only the ASCII
    # status line matters here.
    for line in stdout.decode(errors="replace").splitlines():
        fields = line.split()
        if len(fields) >= 2 and fields[0].startswith("HTTP/") and fields[1] == "404":
            return False
    return True


async def _run_git_command(*args: str) -> tuple[bytes, bytes]:
    """
    Executes a git command asynchronously and captures its output.

    Parameters
    ----------
    *args : str
        The git command and its arguments to execute.

    Returns
    -------
    tuple[bytes, bytes]
        A tuple containing the stdout and stderr of the git command.

    Raises
    ------
    RuntimeError
        If the git command exits with a non-zero status. Credentials embedded in
        URL arguments (``scheme://user:secret@host/...``) are masked in the
        message so that access tokens do not end up in logs or responses.
    """
    proc = await asyncio.create_subprocess_exec(
        *args,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        command = " ".join(args)
        # Replace undecodable bytes instead of raising, so the original failure
        # is not hidden behind a UnicodeDecodeError.
        error_message = stderr.decode(errors="replace").strip()

        # Mask the userinfo part of any URL argument, both in the echoed command
        # and in the captured stderr, which may repeat the URL.
        for arg in args:
            _scheme, sep, rest = arg.partition("://")
            if not sep:
                continue
            userinfo, at, _host = rest.split("/", 1)[0].rpartition("@")
            if not (at and userinfo):
                continue
            password = userinfo.partition(":")[2]
            for secret, mask in ((f"{userinfo}@", "***@"), (password, "***")):
                if secret:
                    command = command.replace(secret, mask)
                    error_message = error_message.replace(secret, mask)

        raise RuntimeError(f"Git command failed: {command}\nError: {error_message}")

    return stdout, stderr
89 changes: 83 additions & 6 deletions src/gitingest/tests/test_clone.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,24 @@ async def test_clone_repo_with_commit() -> None:
mock_process = AsyncMock()
mock_process.communicate.return_value = (b"output", b"error")
mock_exec.return_value = mock_process

await clone_repo(clone_config)
mock_check.assert_called_once_with(clone_config.url)
mock_check.assert_called_once_with(clone_config.url, None)
assert mock_exec.call_count == 2 # Clone and checkout calls


@pytest.mark.asyncio
async def test_clone_repo_without_commit() -> None:
    """With no commit and branch ``main``, cloning issues exactly one git command."""
    clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo", commit=None, branch="main")

    with patch("gitingest.clone._check_repo_exists", return_value=True) as mock_check:
        with patch("gitingest.clone._run_git_command", new_callable=AsyncMock) as mock_exec:
            mock_process = AsyncMock()
            mock_process.communicate.return_value = (b"output", b"error")
            mock_exec.return_value = mock_process

            await clone_repo(clone_config)
            # No PAT is configured, so the existence check receives ``None`` for it.
            mock_check.assert_called_once_with(clone_config.url, None)
            assert mock_exec.call_count == 1  # Only clone call


Expand All @@ -50,7 +51,7 @@ async def test_clone_repo_nonexistent_repository() -> None:
with patch("gitingest.clone._check_repo_exists", return_value=False) as mock_check:
with pytest.raises(ValueError, match="Repository not found"):
await clone_repo(clone_config)
mock_check.assert_called_once_with(clone_config.url)
mock_check.assert_called_once_with(clone_config.url, None)


@pytest.mark.asyncio
Expand Down Expand Up @@ -167,5 +168,81 @@ async def test_check_repo_exists_with_redirect() -> None:
mock_process.communicate.return_value = (b"HTTP/1.1 302 Found\n", b"")
mock_process.returncode = 0 # Simulate successful request
mock_exec.return_value = mock_process

assert await _check_repo_exists(url)


@pytest.mark.asyncio
async def test_check_repo_exists_with_pat() -> None:
    """A PAT is forwarded to curl as an ``Authorization: token`` header for a github.com URL."""
    url = "https://github.com/user/repo"
    pat = "test_token_123"

    with patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) as mock_exec:
        mock_process = AsyncMock()
        mock_process.communicate.return_value = (b"HTTP/1.1 200 OK\n", b"")
        mock_process.returncode = 0
        mock_exec.return_value = mock_process

        # A 200 response must be reported as an existing repository; asserting the
        # result keeps a broken status check from passing unnoticed.
        assert await _check_repo_exists(url, pat)

        # Verify curl was invoked exactly once and includes the authorization header
        mock_exec.assert_called_once_with(
            "curl",
            "-I",
            "-H",
            f"Authorization: token {pat}",
            url,
            stdout=-1,  # asyncio.subprocess.PIPE
            stderr=-1,  # asyncio.subprocess.PIPE
        )


@pytest.mark.asyncio
async def test_check_repo_exists_custom_git_server() -> None:
    """For a non-GitHub host the ``/api/v1/repos/<user>/<repo>`` endpoint is probed with the PAT header."""
    url = "https://git.custom.com/user/repo"
    pat = "test_token_123"

    with patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) as mock_exec:
        mock_process = AsyncMock()
        mock_process.communicate.return_value = (b"HTTP/1.1 200 OK\n", b"")
        mock_process.returncode = 0
        mock_exec.return_value = mock_process

        # A 200 response must be reported as an existing repository; asserting the
        # result keeps a broken status check from passing unnoticed.
        assert await _check_repo_exists(url, pat)

        # Verify curl was invoked exactly once, against the API endpoint, with the authorization header
        mock_exec.assert_called_once_with(
            "curl",
            "-I",
            "-H",
            f"Authorization: token {pat}",
            "https://git.custom.com/api/v1/repos/user/repo",
            stdout=-1,  # asyncio.subprocess.PIPE
            stderr=-1,  # asyncio.subprocess.PIPE
        )


@pytest.mark.asyncio
async def test_clone_repo_with_pat() -> None:
    """Cloning with a PAT passes it to the existence check and embeds it in the clone URL."""
    config = CloneConfig(
        url="https://git.custom.com/user/repo",
        local_path="/tmp/repo",
        commit=None,
        branch="main",
        pat="test_token_123",
    )
    # The clone URL is expected to carry the token as "oauth2:<pat>@" userinfo.
    authenticated_url = config.url.replace("https://", f"https://oauth2:{config.pat}@")

    with patch("gitingest.clone._check_repo_exists", return_value=True) as repo_check, patch(
        "gitingest.clone._run_git_command", new_callable=AsyncMock
    ) as git_runner:
        fake_process = AsyncMock()
        fake_process.communicate.return_value = (b"output", b"error")
        git_runner.return_value = fake_process

        await clone_repo(config)

        # The existence check receives both the plain URL and the token.
        repo_check.assert_called_once_with(config.url, config.pat)
        # Branch "main" takes the shallow default-branch clone path.
        git_runner.assert_called_with(
            "git", "clone", "--depth=1", "--single-branch", authenticated_url, config.local_path
        )
2 changes: 2 additions & 0 deletions src/process_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ async def process_query(
slider_position: int,
pattern_type: str = "exclude",
pattern: str = "",
pat: str | None = None,
is_index: bool = False,
) -> _TemplateResponse:
"""
Expand Down Expand Up @@ -68,6 +69,7 @@ async def process_query(
local_path=query["local_path"],
commit=query.get("commit"),
branch=query.get("branch"),
pat=pat,
)
await clone_repo(clone_config)
summary, tree, content = ingest_from_query(query)
Expand Down
Loading
Loading