Skip to content

Commit 83ade69

Browse files
fix: Add separate config options for filepath checksum on insert vs fetch
Adds two new config options with fallback to the existing one:

- `filepath_checksum_size_limit_insert`: skip checksum computation on insert
- `filepath_checksum_size_limit_fetch`: skip checksum verification on fetch
- `filepath_checksum_size_limit`: fallback for both if specific ones not set

This allows teams to:

- Compute checksums on insert (for auditing) but skip verification on fetch
- Skip checksums entirely for large files
- Configure different size limits for insert vs fetch operations

When checksum is skipped on insert:

- A warning is logged
- contents_hash is stored as NULL
- Existing file verification is bypassed

Fixes #1386

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 5d644c9 commit 83ade69

File tree

2 files changed

+28
-7
lines changed

2 files changed

+28
-7
lines changed

datajoint/external.py

Lines changed: 22 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -276,13 +276,27 @@ def upload_filepath(self, local_filepath):
276276
uuid = uuid_from_buffer(
277277
init_string=relative_filepath
278278
) # hash relative path, not contents
279-
contents_hash = uuid_from_file(local_filepath)
279+
280+
# Check if checksum should be skipped based on file size limit
281+
file_size = Path(local_filepath).stat().st_size
282+
size_limit = config.get("filepath_checksum_size_limit_insert") or config.get(
283+
"filepath_checksum_size_limit"
284+
)
285+
skip_checksum = size_limit is not None and file_size > size_limit
286+
287+
if skip_checksum:
288+
contents_hash = None
289+
logger.warning(
290+
f"Skipping checksum for '{relative_filepath}' ({file_size} bytes > {size_limit} byte limit)"
291+
)
292+
else:
293+
contents_hash = uuid_from_file(local_filepath)
280294

281295
# check if the remote file already exists and verify that it matches
282296
check_hash = (self & {"hash": uuid}).fetch("contents_hash")
283297
if check_hash.size:
284298
# the tracking entry exists, check that it's the same file as before
285-
if contents_hash != check_hash[0]:
299+
if not skip_checksum and contents_hash != check_hash[0]:
286300
raise DataJointError(
287301
f"A different version of '{relative_filepath}' has already been placed."
288302
)
@@ -291,15 +305,15 @@ def upload_filepath(self, local_filepath):
291305
self._upload_file(
292306
local_filepath,
293307
self._make_external_filepath(relative_filepath),
294-
metadata={"contents_hash": str(contents_hash)},
308+
metadata={"contents_hash": str(contents_hash) if contents_hash else ""},
295309
)
296310
self.connection.query(
297311
"INSERT INTO {tab} (hash, size, filepath, contents_hash) VALUES (%s, {size}, '{filepath}', %s)".format(
298312
tab=self.full_table_name,
299-
size=Path(local_filepath).stat().st_size,
313+
size=file_size,
300314
filepath=relative_filepath,
301315
),
302-
args=(uuid.bytes, contents_hash.bytes),
316+
args=(uuid.bytes, contents_hash.bytes if contents_hash else None),
303317
)
304318
return uuid
305319

@@ -312,7 +326,9 @@ def download_filepath(self, filepath_hash):
312326
"""
313327

314328
def _need_checksum(local_filepath, expected_size):
315-
limit = config.get("filepath_checksum_size_limit")
329+
limit = config.get("filepath_checksum_size_limit_fetch") or config.get(
330+
"filepath_checksum_size_limit"
331+
)
316332
actual_size = Path(local_filepath).stat().st_size
317333
if expected_size != actual_size:
318334
# this should never happen without outside interference

datajoint/settings.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,8 +49,13 @@
4949
"database.use_tls": None,
5050
"enable_python_native_blobs": True, # python-native/dj0 encoding support
5151
"add_hidden_timestamp": False,
52-
# file size limit for when to disable checksums
52+
# file size limits for when to disable checksums (in bytes)
53+
# filepath_checksum_size_limit: fallback for both insert and fetch if specific ones not set
5354
"filepath_checksum_size_limit": None,
55+
# filepath_checksum_size_limit_insert: skip checksum computation on insert for large files
56+
"filepath_checksum_size_limit_insert": None,
57+
# filepath_checksum_size_limit_fetch: skip checksum verification on fetch for large files
58+
"filepath_checksum_size_limit_fetch": None,
5459
}
5560
)
5661

0 commit comments

Comments
 (0)