 
 import logging
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
 from gitingest.clone import clone_repo
 from gitingest.ingestion import ingest_query
 from gitingest.query_parser import parse_remote_repo
-from gitingest.utils.git_utils import validate_github_token
+from gitingest.utils.git_utils import resolve_commit, validate_github_token
 from gitingest.utils.pattern_utils import process_patterns
-from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
-from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
+from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata
+from server.s3_utils import (
+    _build_s3_url,
+    check_s3_object_exists,
+    generate_s3_file_path,
+    get_metadata_from_s3,
+    is_s3_enabled,
+    upload_metadata_to_s3,
+    upload_to_s3,
+)
 from server.server_config import MAX_DISPLAY_SIZE
 
 logger = logging.getLogger(__name__)
 
+if TYPE_CHECKING:
+    from gitingest.schemas.cloning import CloneConfig
+    from gitingest.schemas.ingestion import IngestionQuery
+
+
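Note: the TYPE_CHECKING guard keeps CloneConfig and IngestionQuery out of the runtime import graph; they exist only for annotations (this relies on postponed annotation evaluation, presumably via from __future__ import annotations earlier in the file, which this diff does not show). A tiny sketch of the pattern:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by type checkers only; no runtime cost or import-cycle risk
    from gitingest.schemas.ingestion import IngestionQuery


def describe(query: IngestionQuery) -> str:
    # The annotation is never evaluated at runtime
    return f"{query.user_name}/{query.repo_name}"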
+async def _check_s3_cache(
+    query: IngestionQuery,
+    input_text: str,
+    max_file_size: int,
+    pattern_type: str,
+    pattern: str,
+    token: str | None,
+) -> IngestSuccessResponse | None:
+    """Check if the digest already exists on S3 and return a response if found.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The parsed query object.
+    input_text : str
+        Original input text.
+    max_file_size : int
+        Maximum file size in KB.
+    pattern_type : str
+        Pattern type (include/exclude).
+    pattern : str
+        Pattern string.
+    token : str | None
+        GitHub token.
+
+    Returns
+    -------
+    IngestSuccessResponse | None
+        Response if the file exists on S3, None otherwise.
+
+    """
+    if not is_s3_enabled():
+        return None
+
+    try:
+        # Use git ls-remote to get the commit SHA without cloning
+        clone_config = query.extract_clone_config()
+        query.commit = await resolve_commit(clone_config, token=token)
+        # Generate the S3 file path using the resolved commit
+        s3_file_path = generate_s3_file_path(query)
+
+        # Check if the file exists on S3
+        if check_s3_object_exists(s3_file_path):
+            # The file exists on S3; serve it directly without cloning
+            s3_url = _build_s3_url(s3_file_path)
+            query.s3_url = s3_url
+
+            short_repo_url = f"{query.user_name}/{query.repo_name}"
+
+            # Try to get cached metadata
+            metadata = get_metadata_from_s3(s3_file_path)
+
+            if metadata:
+                # Use cached metadata if available
+                summary = metadata.summary
+                tree = metadata.tree
+                content = metadata.content
+            else:
+                # Fall back to placeholder messages if metadata is not available
+                summary = "Digest served from cache (S3). Download the full digest to see content details."
+                tree = "Digest served from cache. Download the full digest to see the file tree."
+                content = "Digest served from cache. Download the full digest to see the content."
+
+            return IngestSuccessResponse(
+                repo_url=input_text,
+                short_repo_url=short_repo_url,
+                summary=summary,
+                digest_url=s3_url,
+                tree=tree,
+                content=content,
+                default_max_file_size=max_file_size,
+                pattern_type=pattern_type,
+                pattern=pattern,
+            )
+    except Exception as exc:
+        # Log the exception but don't fail the entire request
+        logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc)
+
+    return None
+
+
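The fast path above never touches git clone: resolve_commit asks the remote for the SHA and check_s3_object_exists probes the bucket. Below is a minimal sketch of what those two helpers plausibly do; the real implementations live in gitingest.utils.git_utils and server.s3_utils, and the names and signatures here are assumptions, not the project's code.

import asyncio

import boto3
from botocore.exceptions import ClientError


async def resolve_commit_sketch(repo_url: str, ref: str = "HEAD") -> str:
    """Resolve a ref to a commit SHA via `git ls-remote` (token handling omitted)."""
    proc = await asyncio.create_subprocess_exec(
        "git", "ls-remote", repo_url, ref,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(stderr.decode().strip())
    # Output looks like "<sha>\t<ref>"; the SHA is the first field
    return stdout.decode().split()[0]


def check_s3_object_exists_sketch(bucket: str, key: str) -> bool:
    """HEAD the object: a cheap existence probe that downloads nothing."""
    try:
        boto3.client("s3").head_object(Bucket=bucket, Key=key)
    except ClientError as exc:
        if exc.response["Error"]["Code"] == "404":
            return False
        raise
    return True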
+def _store_digest_content(
+    query: IngestionQuery,
+    clone_config: CloneConfig,
+    digest_content: str,
+    summary: str,
+    tree: str,
+    content: str,
+) -> None:
+    """Store digest content either to S3 or locally, based on configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+    clone_config : CloneConfig
+        The clone configuration object.
+    digest_content : str
+        The complete digest content to store.
+    summary : str
+        The summary content for metadata.
+    tree : str
+        The tree content for metadata.
+    content : str
+        The file content for metadata.
+
+    """
+    if is_s3_enabled():
+        # Upload to S3 instead of storing locally
+        s3_file_path = generate_s3_file_path(query)
+        s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
+
+        # Also upload metadata JSON for caching
+        metadata = S3Metadata(
+            summary=summary,
+            tree=tree,
+            content=content,
+        )
+        try:
+            upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id)
+            logger.debug("Successfully uploaded metadata to S3")
+        except Exception as metadata_exc:
+            # Log the error but don't fail the entire request
+            logger.warning("Failed to upload metadata to S3: %s", metadata_exc)
+
+        # Store S3 URL in query for later use
+        query.s3_url = s3_url
+    else:
+        # Store locally
+        local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
+        with local_txt_file.open("w", encoding="utf-8") as f:
+            f.write(digest_content)
+
+
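upload_metadata_to_s3 stores a small JSON side file next to the digest so later cache hits can return the real summary, tree, and content instead of placeholders. A sketch of the idea, assuming S3Metadata is a Pydantic model and assuming a ".meta.json" key convention (both are guesses, not the project's actual scheme):

import boto3
from pydantic import BaseModel


class S3MetadataSketch(BaseModel):
    summary: str
    tree: str
    content: str


def upload_metadata_sketch(metadata: S3MetadataSketch, s3_file_path: str, bucket: str) -> None:
    """Write the metadata JSON next to the digest object."""
    boto3.client("s3").put_object(
        Bucket=bucket,
        Key=f"{s3_file_path}.meta.json",  # assumed key convention
        Body=metadata.model_dump_json().encode("utf-8"),
        ContentType="application/json",
    )

get_metadata_from_s3 would then GET and parse the same key, and a miss stays non-fatal, mirroring the warning-only error handling in the function above.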
+def _generate_digest_url(query: IngestionQuery) -> str:
+    """Generate the digest URL based on S3 configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+
+    Returns
+    -------
+    str
+        The digest URL.
+
+    Raises
+    ------
+    RuntimeError
+        If S3 is enabled but no S3 URL was generated.
+
+    """
+    if is_s3_enabled():
+        digest_url = getattr(query, "s3_url", None)
+        if not digest_url:
+            # This should not happen if the S3 upload was successful
+            msg = "S3 is enabled but no S3 URL was generated"
+            raise RuntimeError(msg)
+        return digest_url
+    return f"/api/download/file/{query.id}"
+
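_generate_digest_url therefore returns one of two shapes: the URL that _build_s3_url produced at upload time, or the local download route. Assuming _build_s3_url simply joins a public endpoint with the object key (an assumption; the helper is private to server.s3_utils):

def build_s3_url_sketch(endpoint: str, s3_file_path: str) -> str:
    """Hypothetical join of a bucket endpoint and an object key."""
    return f"{endpoint.rstrip('/')}/{s3_file_path}"


# S3 enabled:   e.g. "https://cdn.example.com/ingest/<user>/<repo>/<commit>.txt"
# S3 disabled:  f"/api/download/file/{query.id}"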
 
 async def process_query(
     input_text: str,
@@ -79,11 +256,23 @@ async def process_query(
         include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
     )
 
+    # Check if digest already exists on S3 before cloning
+    s3_response = await _check_s3_cache(
+        query=query,
+        input_text=input_text,
+        max_file_size=max_file_size,
+        pattern_type=pattern_type.value,
+        pattern=pattern,
+        token=token,
+    )
+    if s3_response:
+        return s3_response
+
     clone_config = query.extract_clone_config()
     logger.debug("Cloning repo with config: %r", clone_config)
     await clone_repo(clone_config, token=token)
 
-    short_repo_url = f"{query.user_name}/{query.repo_name}"  # Sets the "<user>/<repo>" for the page title
+    short_repo_url = f"{query.user_name}/{query.repo_name}"
 
     # The commit hash should always be available at this point
     if not query.commit:
@@ -92,32 +281,8 @@ async def process_query(
 
     try:
         summary, tree, content = ingest_query(query)
-
-        # Prepare the digest content (tree + content)
         digest_content = tree + "\n" + content
-
-        # Store digest based on S3 configuration
-        if is_s3_enabled():
-            # Upload to S3 instead of storing locally
-            s3_file_path = generate_s3_file_path(
-                source=query.url,
-                user_name=cast("str", query.user_name),
-                repo_name=cast("str", query.repo_name),
-                commit=query.commit,
-                include_patterns=query.include_patterns,
-                ignore_patterns=query.ignore_patterns,
-            )
-            s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
-            # Store S3 URL in query for later use
-            query.s3_url = s3_url
-        else:
-            # Store locally
-            logger.debug("Ingest query complete. Writing tree and content to file.")
-            local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
-            with local_txt_file.open("w", encoding="utf-8") as f:
-                f.write(digest_content)
-            logger.debug("Wrote output to %s", local_txt_file)
-
+        _store_digest_content(query, clone_config, digest_content, summary, tree, content)
     except Exception as exc:
         logger.exception(
             "Error processing query for URL %s (max_file_size=%s, pattern_type=%s, pattern=%s).",
@@ -151,15 +316,7 @@ async def process_query(
     estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :]
     logger.info("Estimated tokens: %s", estimated_tokens)  # Important: token estimation
 
-    # Generate digest_url based on S3 configuration
-    if is_s3_enabled():
-        digest_url = getattr(query, "s3_url", None)
-        if not digest_url:
-            # This should not happen if S3 upload was successful
-            msg = "S3 is enabled but no S3 URL was generated"
-            raise RuntimeError(msg)
-    else:
-        digest_url = f"/api/download/file/{query.id}"
+    digest_url = _generate_digest_url(query)
 
     return IngestSuccessResponse(
         repo_url=input_text,
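Taken together, the new control flow in process_query reads roughly like the sketch below (condensed and illustrative; parse_remote_repo's exact signature and the pattern arguments are assumptions based on the imports and hunks above):

async def process_query_flow_sketch(input_text: str, max_file_size: int, token: str | None) -> IngestResponse:
    """Condensed view of the cache-first flow; not the actual implementation."""
    query = await parse_remote_repo(input_text, token=token)

    # Fast path: digest already cached on S3, so skip clone and ingest
    cached = await _check_s3_cache(
        query=query,
        input_text=input_text,
        max_file_size=max_file_size,
        pattern_type="exclude",  # illustrative
        pattern="",              # illustrative
        token=token,
    )
    if cached:
        return cached

    # Slow path: clone and ingest, then persist and build the download URL
    clone_config = query.extract_clone_config()
    await clone_repo(clone_config, token=token)
    summary, tree, content = ingest_query(query)
    _store_digest_content(query, clone_config, tree + "\n" + content, summary, tree, content)
    digest_url = _generate_digest_url(query)
    # ...then build IngestSuccessResponse(digest_url=digest_url, ...) as in the final hunk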