Skip to content

Commit 36d2350

Browse files
authored
[v2] Add case-sensitivity handling to multi-object download operations (#9925)
1 parent b7ccb23 commit 36d2350

File tree

19 files changed

+869
-9
lines changed

19 files changed

+869
-9
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"type": "enhancement",
3+
"category": "``s3``",
4+
"description": "Adds new parameter ``--case-conflict`` that configures how case conflicts are handled on case-insensitive filesystems"
5+
}

awscli/customizations/s3/filegenerator.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@ def __init__(
109109
operation_name=None,
110110
response_data=None,
111111
etag=None,
112+
case_conflict_submitted=None,
113+
case_conflict_key=None,
112114
):
113115
self.src = src
114116
self.dest = dest
@@ -120,6 +122,8 @@ def __init__(
120122
self.operation_name = operation_name
121123
self.response_data = response_data
122124
self.etag = etag
125+
self.case_conflict_submitted = case_conflict_submitted
126+
self.case_conflict_key = case_conflict_key
123127

124128

125129
class FileGenerator:

awscli/customizations/s3/fileinfo.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ def __init__(
5555
is_stream=False,
5656
associated_response_data=None,
5757
etag=None,
58+
case_conflict_submitted=None,
59+
case_conflict_key=None,
5860
):
5961
self.src = src
6062
self.src_type = src_type
@@ -73,6 +75,8 @@ def __init__(
7375
self.is_stream = is_stream
7476
self.associated_response_data = associated_response_data
7577
self.etag = etag
78+
self.case_conflict_submitted = case_conflict_submitted
79+
self.case_conflict_key = case_conflict_key
7680

7781
def is_glacier_compatible(self):
7882
"""Determines if a file info object is glacier compatible

awscli/customizations/s3/fileinfobuilder.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@ def _inject_info(self, file_base):
4848
file_info_attr['is_stream'] = self._is_stream
4949
file_info_attr['associated_response_data'] = file_base.response_data
5050
file_info_attr['etag'] = file_base.etag
51+
file_info_attr['case_conflict_submitted'] = getattr(
52+
file_base, 'case_conflict_submitted', None
53+
)
54+
file_info_attr['case_conflict_key'] = getattr(
55+
file_base, 'case_conflict_key', None
56+
)
5157

5258
# This is a bit quirky. The below conditional hinges on the --delete
5359
# flag being set, which only occurs during a sync command. The source

awscli/customizations/s3/s3handler.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
SuccessResult,
3333
)
3434
from awscli.customizations.s3.subscribers import (
35+
CaseConflictCleanupSubscriber,
3536
CopyPropsSubscriberFactory,
3637
DeleteCopySourceObjectSubscriber,
3738
DeleteSourceFileSubscriber,
@@ -425,6 +426,13 @@ def _add_additional_subscribers(self, subscribers, fileinfo):
425426
subscribers.append(
426427
DeleteSourceObjectSubscriber(fileinfo.source_client)
427428
)
429+
if fileinfo.case_conflict_submitted is not None:
430+
subscribers.append(
431+
CaseConflictCleanupSubscriber(
432+
fileinfo.case_conflict_submitted,
433+
fileinfo.case_conflict_key,
434+
)
435+
)
428436

429437
def _submit_transfer_request(self, fileinfo, extra_args, subscribers):
430438
bucket, key = find_bucket_key(fileinfo.src)

awscli/customizations/s3/subcommands.py

Lines changed: 132 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,12 @@
3636
from awscli.customizations.s3.filters import create_filter
3737
from awscli.customizations.s3.s3handler import S3TransferHandlerFactory
3838
from awscli.customizations.s3.syncstrategy.base import (
39+
AlwaysSync,
3940
MissingFileSync,
4041
NeverSync,
4142
SizeAndLastModifiedSync,
4243
)
44+
from awscli.customizations.s3.syncstrategy.caseconflict import CaseConflictSync
4345
from awscli.customizations.s3.utils import (
4446
AppendFilter,
4547
RequestParamsMapper,
@@ -543,9 +545,7 @@
543545
'name': 'progress-multiline',
544546
'dest': 'progress_multiline',
545547
'action': 'store_true',
546-
'help_text': (
547-
'Show progress on multiple lines.'
548-
),
548+
'help_text': ('Show progress on multiple lines.'),
549549
}
550550

551551

@@ -664,6 +664,35 @@
664664
),
665665
}
666666

667+
668+
CASE_CONFLICT = {
669+
'name': 'case-conflict',
670+
'choices': [
671+
'ignore',
672+
'skip',
673+
'warn',
674+
'error',
675+
],
676+
'default': 'warn',
677+
'help_text': (
678+
"Configures behavior when attempting to download multiple objects "
679+
"whose keys differ only by case, which can cause undefined behavior "
680+
"on case-insensitive filesystems. "
681+
"This parameter only applies for commands that perform multiple S3 "
682+
"to local downloads. "
683+
f"See <a href='{CaseConflictSync.DOC_URI}'>Handling case "
684+
"conflicts</a> for details. Valid values are: "
685+
"<ul>"
686+
"<li>``error`` - Raise an error and abort downloads.</li>"
687+
"<li>``warn`` - The default value. Emit a warning and download "
688+
"the object.</li>"
689+
"<li>``skip`` - Skip downloading the object.</li>"
690+
"<li>``ignore`` - Ignore the conflict and download the object.</li>"
691+
"</ul>"
692+
),
693+
}
694+
695+
667696
TRANSFER_ARGS = [
668697
DRYRUN,
669698
QUIET,
@@ -1081,7 +1110,14 @@ class CpCommand(S3TransferCommand):
10811110
}
10821111
]
10831112
+ TRANSFER_ARGS
1084-
+ [METADATA, COPY_PROPS, METADATA_DIRECTIVE, EXPECTED_SIZE, RECURSIVE]
1113+
+ [
1114+
METADATA,
1115+
COPY_PROPS,
1116+
METADATA_DIRECTIVE,
1117+
EXPECTED_SIZE,
1118+
RECURSIVE,
1119+
CASE_CONFLICT,
1120+
]
10851121
)
10861122

10871123

@@ -1105,6 +1141,7 @@ class MvCommand(S3TransferCommand):
11051141
METADATA_DIRECTIVE,
11061142
RECURSIVE,
11071143
VALIDATE_SAME_S3_PATHS,
1144+
CASE_CONFLICT,
11081145
]
11091146
)
11101147

@@ -1150,7 +1187,7 @@ class SyncCommand(S3TransferCommand):
11501187
}
11511188
]
11521189
+ TRANSFER_ARGS
1153-
+ [METADATA, COPY_PROPS, METADATA_DIRECTIVE]
1190+
+ [METADATA, COPY_PROPS, METADATA_DIRECTIVE, CASE_CONFLICT]
11541191
)
11551192

11561193

@@ -1307,7 +1344,16 @@ def choose_sync_strategies(self):
13071344
sync_strategies['file_at_src_and_dest_sync_strategy'] = (
13081345
SizeAndLastModifiedSync()
13091346
)
1310-
sync_strategies['file_not_at_dest_sync_strategy'] = MissingFileSync()
1347+
if self._should_handle_case_conflicts():
1348+
sync_strategies['file_not_at_dest_sync_strategy'] = (
1349+
CaseConflictSync(
1350+
on_case_conflict=self.parameters['case_conflict']
1351+
)
1352+
)
1353+
else:
1354+
sync_strategies['file_not_at_dest_sync_strategy'] = (
1355+
MissingFileSync()
1356+
)
13111357
sync_strategies['file_not_at_src_sync_strategy'] = NeverSync()
13121358

13131359
# Determine what strategies to override if any.
@@ -1440,6 +1486,12 @@ def run(self):
14401486
'file_info_builder': [file_info_builder],
14411487
's3_handler': [s3_transfer_handler],
14421488
}
1489+
if self._should_handle_case_conflicts():
1490+
self._handle_case_conflicts(
1491+
command_dict,
1492+
rev_files,
1493+
rev_generator,
1494+
)
14431495
elif self.cmd == 'rm':
14441496
command_dict = {
14451497
'setup': [files],
@@ -1456,6 +1508,12 @@ def run(self):
14561508
'file_info_builder': [file_info_builder],
14571509
's3_handler': [s3_transfer_handler],
14581510
}
1511+
if self._should_handle_case_conflicts():
1512+
self._handle_case_conflicts(
1513+
command_dict,
1514+
rev_files,
1515+
rev_generator,
1516+
)
14591517

14601518
files = command_dict['setup']
14611519
while self.instructions:
@@ -1517,6 +1575,74 @@ def _map_sse_c_params(self, request_parameters, paths_type):
15171575
},
15181576
)
15191577

1578+
def _should_handle_case_conflicts(self):
1579+
return (
1580+
self.cmd in {'sync', 'cp', 'mv'}
1581+
and self.parameters.get('paths_type') == 's3local'
1582+
and self.parameters['case_conflict'] != 'ignore'
1583+
and self.parameters.get('dir_op')
1584+
)
1585+
1586+
def _handle_case_conflicts(self, command_dict, rev_files, rev_generator):
1587+
# Objects are not returned in lexicographical order when
1588+
# operated on S3 Express directory buckets. This is required
1589+
# for sync operations to behave correctly, which is what
1590+
# recursive copies and moves fall back to so potential case
1591+
# conflicts can be detected and handled.
1592+
if not is_s3express_bucket(
1593+
split_s3_bucket_key(self.parameters['src'])[0]
1594+
):
1595+
self._modify_instructions_for_case_conflicts(
1596+
command_dict, rev_files, rev_generator
1597+
)
1598+
return
1599+
# `skip` and `error` are not valid choices in this case because
1600+
# it's not possible to detect case conflicts.
1601+
if self.parameters['case_conflict'] not in {'ignore', 'warn'}:
1602+
raise ParamValidationError(
1603+
f"`{self.parameters['case_conflict']}` is not a valid value "
1604+
"for `--case-conflict` when operating on S3 Express "
1605+
"directory buckets. Valid values: `warn`, `ignore`."
1606+
)
1607+
msg = (
1608+
"warning: Recursive copies/moves from an S3 Express "
1609+
"directory bucket to a case-insensitive local filesystem "
1610+
"may result in undefined behavior if there are "
1611+
"S3 object key names that differ only by case. To disable "
1612+
"this warning, set the `--case-conflict` parameter to `ignore`. "
1613+
f"For more information, see {CaseConflictSync.DOC_URI}."
1614+
)
1615+
uni_print(msg, sys.stderr)
1616+
1617+
def _modify_instructions_for_case_conflicts(
1618+
self, command_dict, rev_files, rev_generator
1619+
):
1620+
# Command will perform recursive S3 to local downloads.
1621+
# Checking for potential case conflicts requires knowledge
1622+
# of local files. Instead of writing a separate validation
1623+
# mechanism for recursive downloads, we modify the instructions
1624+
# to mimic a sync command.
1625+
sync_strategies = {
1626+
# Local filename exists with exact case match. Always sync
1627+
# because it's a copy operation.
1628+
'file_at_src_and_dest_sync_strategy': AlwaysSync(),
1629+
# Local filename either doesn't exist or differs only by case.
1630+
# Let `CaseConflictSync` determine which it is and handle it
1631+
# according to configured `--case-conflict` parameter.
1632+
'file_not_at_dest_sync_strategy': CaseConflictSync(
1633+
on_case_conflict=self.parameters['case_conflict']
1634+
),
1635+
# Copy is one-way so never sync if not at source.
1636+
'file_not_at_src_sync_strategy': NeverSync(),
1637+
}
1638+
command_dict['setup'].append(rev_files)
1639+
command_dict['file_generator'].append(rev_generator)
1640+
command_dict['filters'].append(create_filter(self.parameters))
1641+
command_dict['comparator'] = [Comparator(**sync_strategies)]
1642+
self.instructions.insert(
1643+
self.instructions.index('file_info_builder'), 'comparator'
1644+
)
1645+
15201646

15211647
# TODO: This class is fairly quirky in the sense that it is both a builder
15221648
# and a data object. In the future we should make the following refactorings

awscli/customizations/s3/subscribers.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,20 @@ def on_queued(self, future, **kwargs):
9999
)
100100

101101

102+
class CaseConflictCleanupSubscriber(BaseSubscriber):
103+
"""
104+
A subscriber which removes object compare key from case conflict set
105+
when download finishes.
106+
"""
107+
108+
def __init__(self, submitted, case_conflict_key):
109+
self._submitted = submitted
110+
self._key = case_conflict_key
111+
112+
def on_done(self, future, **kwargs):
113+
self._submitted.discard(self._key)
114+
115+
102116
class DeleteSourceSubscriber(OnDoneFilteredSubscriber):
103117
"""A subscriber which deletes the source of the transfer."""
104118

awscli/customizations/s3/syncstrategy/base.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,3 +264,12 @@ def determine_should_sync(self, src_file, dest_file):
264264
src_file.dest,
265265
)
266266
return True
267+
268+
269+
class AlwaysSync(BaseSync):
270+
def __init__(self, sync_type='file_at_src_and_dest'):
271+
super(AlwaysSync, self).__init__(sync_type)
272+
273+
def determine_should_sync(self, src_file, dest_file):
274+
LOG.debug(f"syncing: {src_file.src} -> {src_file.dest}")
275+
return True

0 commit comments

Comments
 (0)