Skip to content

Commit 6a65cb7

Browse files
committed
feat: update S3 file path format to include hostname and structured naming
1 parent b9a42bc commit 6a65cb7

File tree

1 file changed

+4
-10
lines changed

1 file changed

+4
-10
lines changed

src/server/s3_utils.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ def generate_s3_file_path(
6060
"""Generate S3 file path with proper naming convention.
6161
6262
The file path is formatted as:
63-
[<S3_DIRECTORY_PREFIX>/]ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<exclude&include hash>.txt
63+
[<S3_DIRECTORY_PREFIX>/]ingest/<hostname>/<repo-owner>/<repo-name>/<commit-ID>/
64+
<exclude&include hash>/<repo-owner>-<repo-name>.txt
6465
6566
If S3_DIRECTORY_PREFIX environment variable is set, it will be prefixed to the path.
6667
The commit-ID is always included in the URL.
@@ -98,20 +99,13 @@ def generate_s3_file_path(
9899
logger.error(msg)
99100
raise ValueError(msg)
100101

101-
# Extract source from URL or default to "unknown"
102-
git_source = {
103-
"github.com": "github",
104-
"gitlab.com": "gitlab",
105-
"bitbucket.org": "bitbucket",
106-
}.get(hostname, "unknown")
107-
108102
# Create hash of exclude/include patterns for uniqueness
109103
patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}"
110104
patterns_str += f"exclude:{sorted(ignore_patterns)}"
111105
patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16]
112106

113-
# Build the base path
114-
base_path = f"ingest/{git_source}/{user_name}/{repo_name}/{commit}/{patterns_hash}.txt"
107+
# Build the base path using hostname directly
108+
base_path = f"ingest/{hostname}/{user_name}/{repo_name}/{commit}/{patterns_hash}/{user_name}-{repo_name}.txt"
115109

116110
# Check for S3_DIRECTORY_PREFIX environment variable
117111
s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX")

0 commit comments

Comments
 (0)