Skip to content

Commit 60a8f05

Browse files
authored
Merge pull request #1387 from datajoint/fix/filepath-checksum-bypass
fix: Add config option to skip filepath checksum on insert
2 parents 5d644c9 + 6d02652 commit 60a8f05

File tree

2 files changed

+21
-6
lines changed

2 files changed

+21
-6
lines changed

datajoint/external.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -276,13 +276,25 @@ def upload_filepath(self, local_filepath):
276276
uuid = uuid_from_buffer(
277277
init_string=relative_filepath
278278
) # hash relative path, not contents
279-
contents_hash = uuid_from_file(local_filepath)
279+
280+
# Check if checksum should be skipped based on file size limit
281+
file_size = Path(local_filepath).stat().st_size
282+
size_limit = config.get("filepath_checksum_size_limit_insert")
283+
skip_checksum = size_limit is not None and file_size > size_limit
284+
285+
if skip_checksum:
286+
contents_hash = None
287+
logger.warning(
288+
f"Skipping checksum for '{relative_filepath}' ({file_size} bytes > {size_limit} byte limit)"
289+
)
290+
else:
291+
contents_hash = uuid_from_file(local_filepath)
280292

281293
# check if the remote file already exists and verify that it matches
282294
check_hash = (self & {"hash": uuid}).fetch("contents_hash")
283295
if check_hash.size:
284296
# the tracking entry exists, check that it's the same file as before
285-
if contents_hash != check_hash[0]:
297+
if not skip_checksum and contents_hash != check_hash[0]:
286298
raise DataJointError(
287299
f"A different version of '{relative_filepath}' has already been placed."
288300
)
@@ -291,15 +303,15 @@ def upload_filepath(self, local_filepath):
291303
self._upload_file(
292304
local_filepath,
293305
self._make_external_filepath(relative_filepath),
294-
metadata={"contents_hash": str(contents_hash)},
306+
metadata={"contents_hash": str(contents_hash) if contents_hash else ""},
295307
)
296308
self.connection.query(
297309
"INSERT INTO {tab} (hash, size, filepath, contents_hash) VALUES (%s, {size}, '{filepath}', %s)".format(
298310
tab=self.full_table_name,
299-
size=Path(local_filepath).stat().st_size,
311+
size=file_size,
300312
filepath=relative_filepath,
301313
),
302-
args=(uuid.bytes, contents_hash.bytes),
314+
args=(uuid.bytes, contents_hash.bytes if contents_hash else None),
303315
)
304316
return uuid
305317

datajoint/settings.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,11 @@
4949
"database.use_tls": None,
5050
"enable_python_native_blobs": True, # python-native/dj0 encoding support
5151
"add_hidden_timestamp": False,
52-
# file size limit for when to disable checksums
52+
# file size limits for when to disable checksums (in bytes)
53+
# filepath_checksum_size_limit: skip checksum verification on fetch for large files
5354
"filepath_checksum_size_limit": None,
55+
# filepath_checksum_size_limit_insert: on insert, skip checksum computation for files larger than this many bytes (None = always compute the checksum)
56+
"filepath_checksum_size_limit_insert": None,
5457
}
5558
)
5659

0 commit comments

Comments
 (0)