Skip to content

Commit f401a20

Browse files
Merge pull request #1400 from datajoint/fix/skip-s3-reupload-1397
fix: Skip redundant S3 upload when file already exists after rollback
2 parents 06f44b3 + 6f627c7 commit f401a20

File tree

2 files changed

+40
-12
lines changed

2 files changed

+40
-12
lines changed

datajoint/external.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -300,11 +300,36 @@ def upload_filepath(self, local_filepath):
300300
)
301301
else:
302302
# upload the file and create its tracking entry
303-
self._upload_file(
304-
local_filepath,
305-
self._make_external_filepath(relative_filepath),
306-
metadata={"contents_hash": str(contents_hash) if contents_hash else ""},
307-
)
303+
external_path = self._make_external_filepath(relative_filepath)
304+
already_uploaded = False
305+
if self.spec["protocol"] == "s3":
306+
stat = self.s3.stat(str(external_path))
307+
if stat is not None and stat.size == file_size:
308+
# Verify contents_hash from S3 metadata when available
309+
if skip_checksum:
310+
already_uploaded = True
311+
else:
312+
remote_meta = {
313+
k.lower().lstrip("x-amz-meta-"): v
314+
for k, v in (stat.metadata or {}).items()
315+
}
316+
remote_hash = remote_meta.get("contents_hash", "")
317+
if remote_hash == str(contents_hash):
318+
already_uploaded = True
319+
if already_uploaded:
320+
logger.info(
321+
f"File already exists on S3 with matching size"
322+
f"{'' if skip_checksum else ' and checksum'}"
323+
f", skipping upload: '{relative_filepath}'"
324+
)
325+
if not already_uploaded:
326+
self._upload_file(
327+
local_filepath,
328+
external_path,
329+
metadata={
330+
"contents_hash": str(contents_hash) if contents_hash else ""
331+
},
332+
)
308333
self.connection.query(
309334
"INSERT INTO {tab} (hash, size, filepath, contents_hash) VALUES (%s, {size}, '{filepath}', %s)".format(
310335
tab=self.full_table_name,

datajoint/s3.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -95,16 +95,19 @@ def fget(self, name, local_filepath):
9595
if "contents_hash" in meta:
9696
return uuid.UUID(meta["contents_hash"])
9797

98-
def exists(self, name):
99-
logger.debug("exists: {}:{}".format(self.bucket, name))
98+
def stat(self, name):
99+
"""Return stat result for an object, or None if it does not exist."""
100+
logger.debug("stat: {}:{}".format(self.bucket, name))
100101
try:
101-
self.client.stat_object(self.bucket, str(name))
102+
return self.client.stat_object(self.bucket, str(name))
102103
except minio.error.S3Error as e:
103104
if e.code == "NoSuchKey":
104-
return False
105-
else:
106-
raise e
107-
return True
105+
return None
106+
raise e
107+
108+
def exists(self, name):
109+
logger.debug("exists: {}:{}".format(self.bucket, name))
110+
return self.stat(name) is not None
108111

109112
def get_size(self, name):
110113
logger.debug("get_size: {}:{}".format(self.bucket, name))

0 commit comments

Comments (0)