
Commit 07f3c62

Merge branch 'main' into feat/json-logger
2 parents: ea991d9 + 5fbb445

File tree

6 files changed: +444, -64 lines

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ jobs:
   test:
     runs-on: ${{ matrix.os }}
     strategy:
-      fail-fast: true
+      fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
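
Note: with fail-fast set to false, GitHub Actions no longer cancels the remaining matrix jobs when a single job fails, so one flaky OS/Python combination (out of the 18 spanned by 3 operating systems x 6 Python versions) reports its failure without discarding the results of the others.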

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ repos:
 
       - id: trailing-whitespace
         description: 'Trim trailing whitespace.'
+        exclude: CHANGELOG.md
 
       - id: check-docstring-first
         description: 'Check a common error of defining a docstring after code.'
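
Note: in pre-commit, exclude is a Python regular expression matched against the file path, so this keeps the trailing-whitespace hook from rewriting CHANGELOG.md, where trailing double spaces can be intentional Markdown hard line breaks.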

src/gitingest/output_formatter.py

Lines changed: 9 additions & 2 deletions
@@ -3,9 +3,12 @@
 from __future__ import annotations
 
 import logging
+import warnings
+from ssl import SSLError
 from typing import TYPE_CHECKING
 
 import tiktoken
+from requests.exceptions import RequestException
 
 from gitingest.schemas import FileSystemNode, FileSystemNodeType
 from gitingest.utils.compat_func import readlink
@@ -192,8 +195,12 @@ def _format_token_count(text: str) -> str | None:
     try:
         encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
         total_tokens = len(encoding.encode(text, disallowed_special=()))
-    except (ValueError, UnicodeEncodeError):
-        logger.exception("Failed to estimate token size.")
+    except (ValueError, UnicodeEncodeError) as exc:
+        warnings.warn(f"Failed to estimate token size: {exc}", RuntimeWarning, stacklevel=3)
+        return None
+    except (RequestException, SSLError) as exc:
+        # On network errors, skip token count estimation instead of erroring out
+        warnings.warn(f"Failed to download tiktoken model: {exc}", RuntimeWarning, stacklevel=3)
+        return None
 
     for threshold, suffix in _TOKEN_THRESHOLDS:
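
With this change, _format_token_count degrades gracefully when tiktoken has to fetch the o200k_base encoding and the network is unavailable: the failure surfaces as a RuntimeWarning (attributed to the caller via stacklevel=3) and the function returns None instead of raising or logging a traceback. A minimal sketch of how a caller could observe the new behavior, assuming the private helper is imported directly from gitingest.output_formatter as named in the hunk header:

import warnings

from gitingest.output_formatter import _format_token_count  # private helper; name taken from this diff

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    token_str = _format_token_count("def hello() -> None: ...")

if token_str is None and caught:
    # Encoding lookup or download failure: the reason rides on the RuntimeWarning.
    for w in caught:
        print(w.category.__name__, w.message)
else:
    print("Estimated tokens:", token_str)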

src/server/models.py

Lines changed: 19 additions & 0 deletions
@@ -116,6 +116,25 @@ class IngestErrorResponse(BaseModel):
 IngestResponse = Union[IngestSuccessResponse, IngestErrorResponse]
 
 
+class S3Metadata(BaseModel):
+    """Model for S3 metadata structure.
+
+    Attributes
+    ----------
+    summary : str
+        Summary of the ingestion process including token estimates.
+    tree : str
+        File tree structure of the repository.
+    content : str
+        Processed content from the repository files.
+
+    """
+
+    summary: str = Field(..., description="Ingestion summary with token estimates")
+    tree: str = Field(..., description="File tree structure")
+    content: str = Field(..., description="Processed file content")
+
+
 class QueryForm(BaseModel):
     """Form data for the query.
 
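S3Metadata packages the three rendered parts of a digest (summary, tree, content) so they can be cached next to the digest file itself. A sketch of the intended round trip, assuming Pydantic v2 (model_dump_json / model_validate_json; under v1 these would be .json() and .parse_raw()) and illustrative field values:

from server.models import S3Metadata

meta = S3Metadata(
    summary="Repository: octocat/Hello-World\nEstimated tokens: 1.2k",  # illustrative values
    tree="Hello-World/\n    README",
    content="=== README ===\nHello World!",
)

payload = meta.model_dump_json()  # a JSON form that upload_metadata_to_s3 could persist
restored = S3Metadata.model_validate_json(payload)
assert restored == meta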
src/server/query_processor.py

Lines changed: 196 additions & 39 deletions
@@ -4,19 +4,196 @@
 
 import logging
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
 from gitingest.clone import clone_repo
 from gitingest.ingestion import ingest_query
 from gitingest.query_parser import parse_remote_repo
-from gitingest.utils.git_utils import validate_github_token
+from gitingest.utils.git_utils import resolve_commit, validate_github_token
 from gitingest.utils.pattern_utils import process_patterns
-from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
-from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
+from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata
+from server.s3_utils import (
+    _build_s3_url,
+    check_s3_object_exists,
+    generate_s3_file_path,
+    get_metadata_from_s3,
+    is_s3_enabled,
+    upload_metadata_to_s3,
+    upload_to_s3,
+)
 from server.server_config import MAX_DISPLAY_SIZE
 
 logger = logging.getLogger(__name__)
 
+if TYPE_CHECKING:
+    from gitingest.schemas.cloning import CloneConfig
+    from gitingest.schemas.ingestion import IngestionQuery
+
+logger = logging.getLogger(__name__)
+
+
+async def _check_s3_cache(
+    query: IngestionQuery,
+    input_text: str,
+    max_file_size: int,
+    pattern_type: str,
+    pattern: str,
+    token: str | None,
+) -> IngestSuccessResponse | None:
+    """Check if digest already exists on S3 and return response if found.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The parsed query object.
+    input_text : str
+        Original input text.
+    max_file_size : int
+        Maximum file size in KB.
+    pattern_type : str
+        Pattern type (include/exclude).
+    pattern : str
+        Pattern string.
+    token : str | None
+        GitHub token.
+
+    Returns
+    -------
+    IngestSuccessResponse | None
+        Response if file exists on S3, None otherwise.
+
+    """
+    if not is_s3_enabled():
+        return None
+
+    try:
+        # Use git ls-remote to get the commit SHA without cloning
+        clone_config = query.extract_clone_config()
+        query.commit = await resolve_commit(clone_config, token=token)
+        # Generate the S3 file path using the resolved commit
+        s3_file_path = generate_s3_file_path(query)
+
+        # Check if the file exists on S3
+        if check_s3_object_exists(s3_file_path):
+            # File exists on S3, serve it directly without cloning
+            s3_url = _build_s3_url(s3_file_path)
+            query.s3_url = s3_url
+
+            short_repo_url = f"{query.user_name}/{query.repo_name}"
+
+            # Try to get cached metadata
+            metadata = get_metadata_from_s3(s3_file_path)
+
+            if metadata:
+                # Use cached metadata if available
+                summary = metadata.summary
+                tree = metadata.tree
+                content = metadata.content
+            else:
+                # Fall back to placeholder messages if metadata is not available
+                summary = "Digest served from cache (S3). Download the full digest to see content details."
+                tree = "Digest served from cache. Download the full digest to see the file tree."
+                content = "Digest served from cache. Download the full digest to see the content."
+
+            return IngestSuccessResponse(
+                repo_url=input_text,
+                short_repo_url=short_repo_url,
+                summary=summary,
+                digest_url=s3_url,
+                tree=tree,
+                content=content,
+                default_max_file_size=max_file_size,
+                pattern_type=pattern_type,
+                pattern=pattern,
+            )
+    except Exception as exc:
+        # Log the exception but don't fail the entire request
+        logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc)
+
+    return None
+
+
+def _store_digest_content(
+    query: IngestionQuery,
+    clone_config: CloneConfig,
+    digest_content: str,
+    summary: str,
+    tree: str,
+    content: str,
+) -> None:
+    """Store digest content either on S3 or locally, based on configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+    clone_config : CloneConfig
+        The clone configuration object.
+    digest_content : str
+        The complete digest content to store.
+    summary : str
+        The summary content for metadata.
+    tree : str
+        The tree content for metadata.
+    content : str
+        The file content for metadata.
+
+    """
+    if is_s3_enabled():
+        # Upload to S3 instead of storing locally
+        s3_file_path = generate_s3_file_path(query)
+        s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
+
+        # Also upload metadata JSON for caching
+        metadata = S3Metadata(
+            summary=summary,
+            tree=tree,
+            content=content,
+        )
+        try:
+            upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id)
+            logger.debug("Successfully uploaded metadata to S3")
+        except Exception as metadata_exc:
+            # Log the error but don't fail the entire request
+            logger.warning("Failed to upload metadata to S3: %s", metadata_exc)
+
+        # Store the S3 URL in the query for later use
+        query.s3_url = s3_url
+    else:
+        # Store locally
+        local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
+        with local_txt_file.open("w", encoding="utf-8") as f:
+            f.write(digest_content)
+
+
+def _generate_digest_url(query: IngestionQuery) -> str:
+    """Generate the digest URL based on S3 configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+
+    Returns
+    -------
+    str
+        The digest URL.
+
+    Raises
+    ------
+    RuntimeError
+        If S3 is enabled but no S3 URL was generated.
+
+    """
+    if is_s3_enabled():
+        digest_url = getattr(query, "s3_url", None)
+        if not digest_url:
+            # This should not happen if the S3 upload was successful
+            msg = "S3 is enabled but no S3 URL was generated"
+            raise RuntimeError(msg)
+        return digest_url
+    return f"/api/download/file/{query.id}"
+
 
 async def process_query(
     input_text: str,
@@ -79,11 +256,23 @@ async def process_query(
         include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
     )
 
+    # Check whether the digest already exists on S3 before cloning
+    s3_response = await _check_s3_cache(
+        query=query,
+        input_text=input_text,
+        max_file_size=max_file_size,
+        pattern_type=pattern_type.value,
+        pattern=pattern,
+        token=token,
+    )
+    if s3_response:
+        return s3_response
+
     clone_config = query.extract_clone_config()
    logger.debug("Cloning repo with config: %r", clone_config)
     await clone_repo(clone_config, token=token)
 
-    short_repo_url = f"{query.user_name}/{query.repo_name}"  # Sets the "<user>/<repo>" for the page title
+    short_repo_url = f"{query.user_name}/{query.repo_name}"
 
     # The commit hash should always be available at this point
     if not query.commit:
@@ -92,32 +281,8 @@
 
     try:
         summary, tree, content = ingest_query(query)
-
-        # Prepare the digest content (tree + content)
         digest_content = tree + "\n" + content
-
-        # Store digest based on S3 configuration
-        if is_s3_enabled():
-            # Upload to S3 instead of storing locally
-            s3_file_path = generate_s3_file_path(
-                source=query.url,
-                user_name=cast("str", query.user_name),
-                repo_name=cast("str", query.repo_name),
-                commit=query.commit,
-                include_patterns=query.include_patterns,
-                ignore_patterns=query.ignore_patterns,
-            )
-            s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
-            # Store S3 URL in query for later use
-            query.s3_url = s3_url
-        else:
-            # Store locally
-            logger.debug("Ingest query complete. Writing tree and content to file.")
-            local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
-            with local_txt_file.open("w", encoding="utf-8") as f:
-                f.write(digest_content)
-            logger.debug("Wrote output to %s", local_txt_file)
-
+        _store_digest_content(query, clone_config, digest_content, summary, tree, content)
     except Exception as exc:
         logger.exception(
             "Error processing query for URL %s (max_file_size=%s, pattern_type=%s, pattern=%s).",
@@ -151,15 +316,7 @@ async def process_query(
         estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :]
         logger.info("Estimated tokens: %s", estimated_tokens)  # Important: token estimation
 
-    # Generate digest_url based on S3 configuration
-    if is_s3_enabled():
-        digest_url = getattr(query, "s3_url", None)
-        if not digest_url:
-            # This should not happen if S3 upload was successful
-            msg = "S3 is enabled but no S3 URL was generated"
-            raise RuntimeError(msg)
-    else:
-        digest_url = f"/api/download/file/{query.id}"
+    digest_url = _generate_digest_url(query)
 
     return IngestSuccessResponse(
         repo_url=input_text,
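
The refactor reduces process_query to a pipeline over three helpers: _check_s3_cache (resolve the commit with git ls-remote via resolve_commit and serve a previously uploaded digest without cloning), _store_digest_content (S3 upload plus metadata sidecar, or a local .txt fallback), and _generate_digest_url (the S3 URL or the /api/download/file/{id} route). A sketch of the resulting caller-visible behavior; the repository URL is a placeholder and the keyword arguments are assumed from the parameters visible in this diff:

import asyncio

from server.models import IngestSuccessResponse, PatternType
from server.query_processor import process_query

async def main() -> None:
    # First call: cache miss -> clone, ingest, upload digest + metadata.
    # Second identical call: _check_s3_cache finds the digest on S3 and
    # returns the cached summary/tree/content without cloning (when S3
    # storage is enabled; otherwise both calls take the local path).
    for attempt in (1, 2):
        response = await process_query(
            input_text="https://github.com/octocat/Hello-World",  # placeholder repo
            max_file_size=50,  # KB, per the _check_s3_cache docstring
            pattern_type=PatternType.EXCLUDE,
            pattern="",
            token=None,
        )
        if isinstance(response, IngestSuccessResponse):
            print(attempt, response.digest_url)

asyncio.run(main())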
