From cb60175c9514c87d26bf0323819b07d0c5d31c4a Mon Sep 17 00:00:00 2001 From: Ahmed Moustafa <35640105+aemous@users.noreply.github.com> Date: Thu, 3 Oct 2024 16:36:22 -0400 Subject: [PATCH] Flexible checksums for S3 high-level commands (#8955) --- .changes/next-release/feature-s3-95496.json | 5 ++ awscli/customizations/s3/subcommands.py | 37 +++++++- awscli/customizations/s3/utils.py | 13 +++ tests/functional/s3/test_cp_command.py | 81 +++++++++++++++++ tests/functional/s3/test_mv_command.py | 23 +++++ tests/functional/s3/test_sync_command.py | 87 +++++++++++++++++++ .../customizations/s3/test_subcommands.py | 40 +++++++++ tests/unit/customizations/s3/test_utils.py | 50 +++++++++++ 8 files changed, 335 insertions(+), 1 deletion(-) create mode 100644 .changes/next-release/feature-s3-95496.json diff --git a/.changes/next-release/feature-s3-95496.json b/.changes/next-release/feature-s3-95496.json new file mode 100644 index 000000000000..8d1d36b64f23 --- /dev/null +++ b/.changes/next-release/feature-s3-95496.json @@ -0,0 +1,5 @@ +{ + "type": "feature", + "category": "s3", + "description": "Adds ``--checksum-mode`` and ``--checksum-algorithm`` parameters to high-level ``s3`` commands." +} diff --git a/awscli/customizations/s3/subcommands.py b/awscli/customizations/s3/subcommands.py index b045b2380ccc..e5a3f8a4d4fe 100644 --- a/awscli/customizations/s3/subcommands.py +++ b/awscli/customizations/s3/subcommands.py @@ -451,6 +451,17 @@ ) } +CHECKSUM_MODE = { + 'name': 'checksum-mode', 'choices': ['ENABLED'], + 'help_text': 'To retrieve the checksum, this mode must be enabled. If the object has a ' + 'checksum, it will be verified.' +} + +CHECKSUM_ALGORITHM = { + 'name': 'checksum-algorithm', 'choices': ['CRC32', 'SHA256', 'SHA1', 'CRC32C'], + 'help_text': 'Indicates the algorithm used to create the checksum for the object.' +} + TRANSFER_ARGS = [DRYRUN, QUIET, INCLUDE, EXCLUDE, ACL, FOLLOW_SYMLINKS, NO_FOLLOW_SYMLINKS, NO_GUESS_MIME_TYPE, SSE, SSE_C, SSE_C_KEY, SSE_KMS_KEY_ID, SSE_C_COPY_SOURCE, @@ -459,7 +470,7 @@ CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, EXPIRES, SOURCE_REGION, ONLY_SHOW_ERRORS, NO_PROGRESS, PAGE_SIZE, IGNORE_GLACIER_WARNINGS, FORCE_GLACIER_TRANSFER, - REQUEST_PAYER] + REQUEST_PAYER, CHECKSUM_MODE, CHECKSUM_ALGORITHM] def get_client(session, region, endpoint_url, verify, config=None): @@ -1242,6 +1253,17 @@ def _validate_path_args(self): if self._should_emit_validate_s3_paths_warning(): self._emit_validate_s3_paths_warning() + if params.get('checksum_algorithm'): + self._raise_if_paths_type_incorrect_for_param( + CHECKSUM_ALGORITHM['name'], + params['paths_type'], + ['locals3', 's3s3']) + if params.get('checksum_mode'): + self._raise_if_paths_type_incorrect_for_param( + CHECKSUM_MODE['name'], + params['paths_type'], + ['s3local']) + # If the user provided local path does not exist, hard fail because # we know that we will not be able to upload the file. if 'locals3' == params['paths_type'] and not params['is_stream']: @@ -1325,6 +1347,19 @@ def _raise_if_mv_same_paths(self, src, dest): f"{self.parameters['src']} - {self.parameters['dest']}" ) + def _raise_if_paths_type_incorrect_for_param(self, param, paths_type, allowed_paths): + if paths_type not in allowed_paths: + expected_usage_map = { + 'locals3': ' ', + 's3s3': ' ', + 's3local': ' ', + 's3': '' + } + raise ValueError( + f"Expected {param} parameter to be used with one of following path formats: " + f"{', '.join([expected_usage_map[path] for path in allowed_paths])}. Instead, received {expected_usage_map[paths_type]}." + ) + def _normalize_s3_trailing_slash(self, paths): for i, path in enumerate(paths): if path.startswith('s3://'): diff --git a/awscli/customizations/s3/utils.py b/awscli/customizations/s3/utils.py index 8dda7331c4c0..fab04fc53788 100644 --- a/awscli/customizations/s3/utils.py +++ b/awscli/customizations/s3/utils.py @@ -474,12 +474,14 @@ def map_put_object_params(cls, request_params, cli_params): cls._set_sse_request_params(request_params, cli_params) cls._set_sse_c_request_params(request_params, cli_params) cls._set_request_payer_param(request_params, cli_params) + cls._set_checksum_algorithm_param(request_params, cli_params) @classmethod def map_get_object_params(cls, request_params, cli_params): """Map CLI params to GetObject request params""" cls._set_sse_c_request_params(request_params, cli_params) cls._set_request_payer_param(request_params, cli_params) + cls._set_checksum_mode_param(request_params, cli_params) @classmethod def map_copy_object_params(cls, request_params, cli_params): @@ -492,6 +494,7 @@ def map_copy_object_params(cls, request_params, cli_params): cls._set_sse_c_and_copy_source_request_params( request_params, cli_params) cls._set_request_payer_param(request_params, cli_params) + cls._set_checksum_algorithm_param(request_params, cli_params) @classmethod def map_head_object_params(cls, request_params, cli_params): @@ -534,6 +537,16 @@ def _set_request_payer_param(cls, request_params, cli_params): if cli_params.get('request_payer'): request_params['RequestPayer'] = cli_params['request_payer'] + @classmethod + def _set_checksum_mode_param(cls, request_params, cli_params): + if cli_params.get('checksum_mode'): + request_params['ChecksumMode'] = cli_params['checksum_mode'] + + @classmethod + def _set_checksum_algorithm_param(cls, request_params, cli_params): + if cli_params.get('checksum_algorithm'): + request_params['ChecksumAlgorithm'] = cli_params['checksum_algorithm'] + @classmethod def _set_general_object_params(cls, request_params, cli_params): # Parameters set in this method should be applicable to the following diff --git a/tests/functional/s3/test_cp_command.py b/tests/functional/s3/test_cp_command.py index 441689da3774..adaccb7878aa 100644 --- a/tests/functional/s3/test_cp_command.py +++ b/tests/functional/s3/test_cp_command.py @@ -693,6 +693,87 @@ def test_cp_with_error_and_warning_permissions(self): self.assertIn('upload failed', stderr) self.assertIn('warning: File has an invalid timestamp.', stderr) + def test_upload_with_checksum_algorithm_crc32(self): + full_path = self.files.create_file('foo.txt', 'contents') + cmdline = f'{self.prefix} {full_path} s3://bucket/key.txt --checksum-algorithm CRC32' + self.run_cmd(cmdline, expected_rc=0) + self.assertEqual(self.operations_called[0][0].name, 'PutObject') + self.assertEqual(self.operations_called[0][1]['ChecksumAlgorithm'], 'CRC32') + + @requires_crt + def test_upload_with_checksum_algorithm_crc32c(self): + full_path = self.files.create_file('foo.txt', 'contents') + cmdline = f'{self.prefix} {full_path} s3://bucket/key.txt --checksum-algorithm CRC32C' + self.run_cmd(cmdline, expected_rc=0) + self.assertEqual(self.operations_called[0][0].name, 'PutObject') + self.assertEqual(self.operations_called[0][1]['ChecksumAlgorithm'], 'CRC32C') + + def test_multipart_upload_with_checksum_algorithm_crc32(self): + full_path = self.files.create_file('foo.txt', 'a' * 10 * (1024 ** 2)) + self.parsed_responses = [ + {'UploadId': 'foo'}, + {'ETag': 'foo-e1', 'ChecksumCRC32': 'foo-1'}, + {'ETag': 'foo-e2', 'ChecksumCRC32': 'foo-2'}, + {} + ] + cmdline = ('%s %s s3://bucket/key2.txt' + ' --checksum-algorithm CRC32' % (self.prefix, full_path)) + self.run_cmd(cmdline, expected_rc=0) + self.assertEqual(len(self.operations_called), 4, self.operations_called) + self.assertEqual(self.operations_called[0][0].name, 'CreateMultipartUpload') + self.assertEqual(self.operations_called[0][1]['ChecksumAlgorithm'], 'CRC32') + self.assertEqual(self.operations_called[1][0].name, 'UploadPart') + self.assertEqual(self.operations_called[1][1]['ChecksumAlgorithm'], 'CRC32') + self.assertEqual(self.operations_called[3][0].name, 'CompleteMultipartUpload') + self.assertIn({'ETag': 'foo-e1', 'ChecksumCRC32': 'foo-1', 'PartNumber': 1}, + self.operations_called[3][1]['MultipartUpload']['Parts']) + self.assertIn({'ETag': 'foo-e2', 'ChecksumCRC32': 'foo-2', 'PartNumber': 2}, + self.operations_called[3][1]['MultipartUpload']['Parts']) + + def test_copy_with_checksum_algorithm_crc32(self): + self.parsed_responses = [ + self.head_object_response(), + # Mocked CopyObject response with a CRC32 checksum specified + { + 'ETag': 'foo-1', + 'ChecksumCRC32': 'Tq0H4g==' + } + ] + cmdline = f'{self.prefix} s3://bucket1/key.txt s3://bucket2/key.txt --checksum-algorithm CRC32' + self.run_cmd(cmdline, expected_rc=0) + self.assertEqual(self.operations_called[1][0].name, 'CopyObject') + self.assertEqual(self.operations_called[1][1]['ChecksumAlgorithm'], 'CRC32') + + def test_download_with_checksum_mode_crc32(self): + self.parsed_responses = [ + self.head_object_response(), + # Mocked GetObject response with a checksum algorithm specified + { + 'ETag': 'foo-1', + 'ChecksumCRC32': 'Tq0H4g==', + 'Body': BytesIO(b'foo') + } + ] + cmdline = f'{self.prefix} s3://bucket/foo {self.files.rootdir} --checksum-mode ENABLED' + self.run_cmd(cmdline, expected_rc=0) + self.assertEqual(self.operations_called[1][0].name, 'GetObject') + self.assertEqual(self.operations_called[1][1]['ChecksumMode'], 'ENABLED') + + def test_download_with_checksum_mode_crc32c(self): + self.parsed_responses = [ + self.head_object_response(), + # Mocked GetObject response with a checksum algorithm specified + { + 'ETag': 'foo-1', + 'ChecksumCRC32C': 'checksum', + 'Body': BytesIO(b'foo') + } + ] + cmdline = f'{self.prefix} s3://bucket/foo {self.files.rootdir} --checksum-mode ENABLED' + self.run_cmd(cmdline, expected_rc=0) + self.assertEqual(self.operations_called[1][0].name, 'GetObject') + self.assertEqual(self.operations_called[1][1]['ChecksumMode'], 'ENABLED') + class TestStreamingCPCommand(BaseAWSCommandParamsTest): def test_streaming_upload(self): diff --git a/tests/functional/s3/test_mv_command.py b/tests/functional/s3/test_mv_command.py index da8637dff8b3..bb673c6f123f 100644 --- a/tests/functional/s3/test_mv_command.py +++ b/tests/functional/s3/test_mv_command.py @@ -132,6 +132,29 @@ def test_copy_move_with_request_payer(self): ] ) + def test_upload_with_checksum_algorithm_crc32(self): + full_path = self.files.create_file('foo.txt', 'contents') + cmdline = f'{self.prefix} {full_path} s3://bucket/key.txt --checksum-algorithm CRC32' + self.run_cmd(cmdline, expected_rc=0) + self.assertEqual(self.operations_called[0][0].name, 'PutObject') + self.assertEqual(self.operations_called[0][1]['ChecksumAlgorithm'], 'CRC32') + + def test_download_with_checksum_mode_crc32(self): + self.parsed_responses = [ + self.head_object_response(), + # Mocked GetObject response with a checksum algorithm specified + { + 'ETag': 'foo-1', + 'ChecksumCRC32': 'checksum', + 'Body': BytesIO(b'foo') + }, + self.delete_object_response() + ] + cmdline = f'{self.prefix} s3://bucket/foo {self.files.rootdir} --checksum-mode ENABLED' + self.run_cmd(cmdline, expected_rc=0) + self.assertEqual(self.operations_called[1][0].name, 'GetObject') + self.assertEqual(self.operations_called[1][1]['ChecksumMode'], 'ENABLED') + class TestMvCommandWithValidateSameS3Paths(BaseS3TransferCommandTest): diff --git a/tests/functional/s3/test_sync_command.py b/tests/functional/s3/test_sync_command.py index 2e94f30986d2..b3978edcf426 100644 --- a/tests/functional/s3/test_sync_command.py +++ b/tests/functional/s3/test_sync_command.py @@ -288,6 +288,93 @@ def test_with_accesspoint_arn(self): ] ) + def test_upload_with_checksum_algorithm_sha1(self): + self.files.create_file('foo.txt', 'contents') + cmdline = f'{self.prefix} {self.files.rootdir} s3://bucket/ --checksum-algorithm SHA1' + self.run_cmd(cmdline, expected_rc=0) + self.assertEqual(self.operations_called[1][0].name, 'PutObject') + self.assertEqual(self.operations_called[1][1]['ChecksumAlgorithm'], 'SHA1') + + def test_copy_with_checksum_algorithm_update_sha1(self): + cmdline = f'{self.prefix} s3://src-bucket/ s3://dest-bucket/ --checksum-algorithm SHA1' + self.parsed_responses = [ + # Response for ListObjects on source bucket + { + 'Contents': [ + { + 'Key': 'mykey', + 'LastModified': '00:00:00Z', + 'Size': 100, + 'ChecksumAlgorithm': 'SHA1' + } + ], + 'CommonPrefixes': [] + }, + # Response for ListObjects on destination bucket + self.list_objects_response([]), + # Response for CopyObject + { + 'ChecksumSHA1': 'sha1-checksum' + } + ] + self.run_cmd(cmdline, expected_rc=0) + self.assert_operations_called( + [ + self.list_objects_request('src-bucket'), + self.list_objects_request('dest-bucket'), + ( + 'CopyObject', { + 'CopySource': { + 'Bucket': 'src-bucket', + 'Key': 'mykey' + }, + 'Bucket': 'dest-bucket', + 'Key': 'mykey', + 'ChecksumAlgorithm': 'SHA1' + } + ) + ] + ) + + def test_upload_with_checksum_algorithm_sha256(self): + self.files.create_file('foo.txt', 'contents') + cmdline = f'{self.prefix} {self.files.rootdir} s3://bucket/ --checksum-algorithm SHA256' + self.run_cmd(cmdline, expected_rc=0) + self.assertEqual(self.operations_called[1][0].name, 'PutObject') + self.assertEqual(self.operations_called[1][1]['ChecksumAlgorithm'], 'SHA256') + + def test_download_with_checksum_mode_sha1(self): + self.parsed_responses = [ + self.list_objects_response(['bucket']), + # Mocked GetObject response with a checksum algorithm specified + { + 'ETag': 'foo-1', + 'ChecksumSHA1': 'checksum', + 'Body': BytesIO(b'foo') + } + ] + cmdline = f'{self.prefix} s3://bucket/foo {self.files.rootdir} --checksum-mode ENABLED' + self.run_cmd(cmdline, expected_rc=0) + self.assertEqual(self.operations_called[0][0].name, 'ListObjectsV2') + self.assertEqual(self.operations_called[1][0].name, 'GetObject') + self.assertIn(('ChecksumMode', 'ENABLED'), self.operations_called[1][1].items()) + + def test_download_with_checksum_mode_sha256(self): + self.parsed_responses = [ + self.list_objects_response(['bucket']), + # Mocked GetObject response with a checksum algorithm specified + { + 'ETag': 'foo-1', + 'ChecksumSHA256': 'checksum', + 'Body': BytesIO(b'foo') + } + ] + cmdline = f'{self.prefix} s3://bucket/foo {self.files.rootdir} --checksum-mode ENABLED' + self.run_cmd(cmdline, expected_rc=0) + self.assertEqual(self.operations_called[0][0].name, 'ListObjectsV2') + self.assertEqual(self.operations_called[1][0].name, 'GetObject') + self.assertIn(('ChecksumMode', 'ENABLED'), self.operations_called[1][1].items()) + class TestSyncCommandWithS3Express(BaseS3TransferCommandTest): diff --git a/tests/unit/customizations/s3/test_subcommands.py b/tests/unit/customizations/s3/test_subcommands.py index f68f3cc642bd..53016bbdb04c 100644 --- a/tests/unit/customizations/s3/test_subcommands.py +++ b/tests/unit/customizations/s3/test_subcommands.py @@ -649,6 +649,46 @@ def test_validate_no_streaming_paths(self): cmd_params.add_paths(paths) self.assertFalse(cmd_params.parameters['is_stream']) + def test_validate_checksum_algorithm_download_error(self): + paths = ['s3://bucket/key', self.file_creator.rootdir] + parameters = {'checksum_algorithm': 'CRC32'} + cmd_params = CommandParameters('cp', parameters, '') + with self.assertRaises(ValueError) as cm: + cmd_params.add_paths(paths) + self.assertIn('Expected checksum-algorithm parameter to be used with one of following path formats', cm.msg) + + def test_validate_checksum_algorithm_sync_download_error(self): + paths = ['s3://bucket/key', self.file_creator.rootdir] + parameters = {'checksum_algorithm': 'CRC32C'} + cmd_params = CommandParameters('sync', parameters, '') + with self.assertRaises(ValueError) as cm: + cmd_params.add_paths(paths) + self.assertIn('Expected checksum-algorithm parameter to be used with one of following path formats', cm.msg) + + def test_validate_checksum_mode_upload_error(self): + paths = [self.file_creator.rootdir, 's3://bucket/key'] + parameters = {'checksum_mode': 'ENABLED'} + cmd_params = CommandParameters('cp', parameters, '') + with self.assertRaises(ValueError) as cm: + cmd_params.add_paths(paths) + self.assertIn('Expected checksum-mode parameter to be used with one of following path formats', cm.msg) + + def test_validate_checksum_mode_sync_upload_error(self): + paths = [self.file_creator.rootdir, 's3://bucket/key'] + parameters = {'checksum_mode': 'ENABLED'} + cmd_params = CommandParameters('sync', parameters, '') + with self.assertRaises(ValueError) as cm: + cmd_params.add_paths(paths) + self.assertIn('Expected checksum-mode parameter to be used with one of following path formats', cm.msg) + + def test_validate_checksum_mode_move_error(self): + paths = ['s3://bucket/key', 's3://bucket2/key'] + parameters = {'checksum_mode': 'ENABLED'} + cmd_params = CommandParameters('mv', parameters, '') + with self.assertRaises(ValueError) as cm: + cmd_params.add_paths(paths) + self.assertIn('Expected checksum-mode parameter to be used with one of following path formats', cm.msg) + def test_validate_streaming_paths_error(self): parameters = {'src': '-', 'dest': 's3://bucket'} cmd_params = CommandParameters('sync', parameters, '') diff --git a/tests/unit/customizations/s3/test_utils.py b/tests/unit/customizations/s3/test_utils.py index 0256677f08b2..910145383e9b 100644 --- a/tests/unit/customizations/s3/test_utils.py +++ b/tests/unit/customizations/s3/test_utils.py @@ -665,6 +665,56 @@ def test_upload_part_copy(self): 'SSECustomerKey': 'my-sse-c-key'}) +class TestRequestParamsMapperChecksumAlgorithm: + @pytest.fixture + def cli_params(self): + return {'checksum_algorithm': 'CRC32'} + + @pytest.fixture + def cli_params_no_algorithm(self): + return {} + + def test_put_object(self, cli_params): + request_params = {} + RequestParamsMapper.map_put_object_params(request_params, cli_params) + assert request_params == {'ChecksumAlgorithm': 'CRC32'} + + def test_put_object_no_checksum(self, cli_params_no_algorithm): + request_params = {} + RequestParamsMapper.map_put_object_params(request_params, cli_params_no_algorithm) + assert 'ChecksumAlgorithm' not in request_params + + def test_copy_object(self, cli_params): + request_params = {} + RequestParamsMapper.map_copy_object_params(request_params, cli_params) + assert request_params == {'ChecksumAlgorithm': 'CRC32'} + + def test_copy_object_no_checksum(self, cli_params_no_algorithm): + request_params = {} + RequestParamsMapper.map_put_object_params(request_params, cli_params_no_algorithm) + assert 'ChecksumAlgorithm' not in request_params + + +class TestRequestParamsMapperChecksumMode: + @pytest.fixture + def cli_params(self): + return {'checksum_mode': 'ENABLED'} + + @pytest.fixture + def cli_params_no_checksum(self): + return {} + + def test_get_object(self, cli_params): + request_params = {} + RequestParamsMapper.map_get_object_params(request_params, cli_params) + assert request_params == {'ChecksumMode': 'ENABLED'} + + def test_get_object_no_checksums(self, cli_params_no_checksum): + request_params = {} + RequestParamsMapper.map_get_object_params(request_params, cli_params_no_checksum) + assert 'ChecksumMode' not in request_params + + class TestRequestParamsMapperRequestPayer(unittest.TestCase): def setUp(self): self.cli_params = {'request_payer': 'requester'}