From 2e82eeb5dbffa55fac15436a88f8abaa7ad9a36b Mon Sep 17 00:00:00 2001 From: Ashish Gupta Date: Thu, 1 Jul 2021 10:21:54 -0700 Subject: [PATCH 01/18] fix default time for compilation jobs --- src/sagemaker/estimator.py | 2 +- src/sagemaker/model.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index aae66bc8ba..35960afab9 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -718,7 +718,7 @@ def compile_model( 'onnx', 'xgboost' framework_version (str): The version of the framework compile_max_run (int): Timeout in seconds for compilation (default: - 3 * 60). After this amount of time Amazon SageMaker Neo + 15 * 60). After this amount of time Amazon SageMaker Neo terminates the compilation job regardless of its current status. tags (list[dict]): List of tags for labeling a compilation job. For more, see diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index 8ebd79147e..755b1d5fbd 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -559,7 +559,7 @@ def compile( role, tags=None, job_name=None, - compile_max_run=5 * 60, + compile_max_run=15 * 60, framework=None, framework_version=None, target_platform_os=None, @@ -588,7 +588,7 @@ def compile( https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. job_name (str): The name of the compilation job compile_max_run (int): Timeout in seconds for compilation (default: - 3 * 60). After this amount of time Amazon SageMaker Neo + 15 * 60). After this amount of time Amazon SageMaker Neo terminates the compilation job regardless of its current status. framework (str): The framework that is used to train the original model. 
Allowed values: 'mxnet', 'tensorflow', 'keras', 'pytorch', From b19b4a3c38dd303133dfb9413500de4eba2faa17 Mon Sep 17 00:00:00 2001 From: Ashish Gupta Date: Wed, 6 Nov 2024 14:11:12 -0800 Subject: [PATCH 02/18] changes for blackbird - model sharding --- src/sagemaker/model.py | 6 ++++ .../serve/builder/jumpstart_builder.py | 7 ++-- src/sagemaker/serve/builder/model_builder.py | 23 ++++++++++++- src/sagemaker/serve/utils/optimize_utils.py | 8 ++++- .../serve/builder/test_model_builder.py | 34 +++++++++++++++++++ .../serve/utils/test_optimize_utils.py | 25 ++++++++++++-- 6 files changed, 97 insertions(+), 6 deletions(-) diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index 340d35b250..d3ff3cc934 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -372,6 +372,7 @@ def __init__( self.endpoint_name = None self.inference_component_name = None self._is_compiled_model = False + self._is_sharded_model = False self._compilation_job_name = None self._is_edge_packaged_model = False self.inference_recommender_job_results = None @@ -1599,6 +1600,11 @@ def deploy( if self._base_name is not None: self._base_name = "-".join((self._base_name, compiled_model_suffix)) + if self._is_sharded_model and endpoint_type != EndpointType.INFERENCE_COMPONENT_BASED: + logging.warning("Forcing INFERENCE_COMPONENT_BASED endpoint for sharded model. 
ADVISORY - " + "Use INFERENCE_COMPONENT_BASED endpoints over MODEL_BASED endpoints.") + endpoint_type = EndpointType.INFERENCE_COMPONENT_BASED + # Support multiple models on same endpoint if endpoint_type == EndpointType.INFERENCE_COMPONENT_BASED: if endpoint_name: diff --git a/src/sagemaker/serve/builder/jumpstart_builder.py b/src/sagemaker/serve/builder/jumpstart_builder.py index cfb43b813a..c058337470 100644 --- a/src/sagemaker/serve/builder/jumpstart_builder.py +++ b/src/sagemaker/serve/builder/jumpstart_builder.py @@ -681,6 +681,7 @@ def _optimize_for_jumpstart( quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None, speculative_decoding_config: Optional[Dict] = None, + sharding_config: Optional[Dict] = None, env_vars: Optional[Dict] = None, vpc_config: Optional[Dict] = None, kms_key: Optional[str] = None, @@ -702,6 +703,8 @@ def _optimize_for_jumpstart( compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``. speculative_decoding_config (Optional[Dict]): Speculative decoding configuration. Defaults to ``None`` + sharding_config (Optional[Dict]): Model sharding configuration. + Defaults to ``None`` env_vars (Optional[Dict]): Additional environment variables to run the optimization container. Defaults to ``None``. vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``. 
@@ -727,7 +730,7 @@ def _optimize_for_jumpstart( pysdk_model_env_vars = self._get_neuron_model_env_vars(instance_type) optimization_config, override_env = _extract_optimization_config_and_env( - quantization_config, compilation_config + quantization_config, compilation_config, sharding_config ) if not optimization_config and is_compilation: override_env = override_env or pysdk_model_env_vars @@ -792,7 +795,7 @@ def _optimize_for_jumpstart( optimization_env_vars = _update_environment_variables(optimization_env_vars, override_env) if optimization_env_vars: self.pysdk_model.env.update(optimization_env_vars) - if quantization_config or is_compilation: + if quantization_config or sharding_config or is_compilation: return create_optimization_job_args return None diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py index d1f1ab6ba2..1ae65536ee 100644 --- a/src/sagemaker/serve/builder/model_builder.py +++ b/src/sagemaker/serve/builder/model_builder.py @@ -1119,6 +1119,7 @@ def optimize( quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None, speculative_decoding_config: Optional[Dict] = None, + sharding_config: Optional[Dict] = None, env_vars: Optional[Dict] = None, vpc_config: Optional[Dict] = None, kms_key: Optional[str] = None, @@ -1142,6 +1143,8 @@ def optimize( compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``. speculative_decoding_config (Optional[Dict]): Speculative decoding configuration. Defaults to ``None`` + sharding_config (Optional[Dict]): Model sharding configuration. + Defaults to ``None`` env_vars (Optional[Dict]): Additional environment variables to run the optimization container. Defaults to ``None``. vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``. 
@@ -1170,6 +1173,7 @@ def optimize( quantization_config=quantization_config, compilation_config=compilation_config, speculative_decoding_config=speculative_decoding_config, + sharding_config=sharding_config, env_vars=env_vars, vpc_config=vpc_config, kms_key=kms_key, @@ -1189,6 +1193,7 @@ def _model_builder_optimize_wrapper( quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None, speculative_decoding_config: Optional[Dict] = None, + sharding_config: Optional[Dict] = None, env_vars: Optional[Dict] = None, vpc_config: Optional[Dict] = None, kms_key: Optional[str] = None, @@ -1212,6 +1217,8 @@ def _model_builder_optimize_wrapper( compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``. speculative_decoding_config (Optional[Dict]): Speculative decoding configuration. Defaults to ``None`` + sharding_config (Optional[Dict]): Model sharding configuration. + Defaults to ``None`` env_vars (Optional[Dict]): Additional environment variables to run the optimization container. Defaults to ``None``. vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``. 
@@ -1238,6 +1245,12 @@ def _model_builder_optimize_wrapper( if quantization_config and compilation_config: raise ValueError("Quantization config and compilation config are mutually exclusive.") + if sharding_config and (quantization_config or compilation_config or speculative_decoding_config): + raise ValueError("Sharding config is mutually exclusive and cannot be combined with any other optimization.") + + if sharding_config and ((env_vars and "OPTION_TENSOR_PARALLEL_DEGREE" not in env_vars) or (sharding_config.get("OverrideEnvironment") and "OPTION_TENSOR_PARALLEL_DEGREE" not in sharding_config["OverrideEnvironment"])): + raise ValueError("OPTION_TENSOR_PARALLEL_DEGREE is required environment variable with Sharding config.") + self.sagemaker_session = sagemaker_session or self.sagemaker_session or Session() self.instance_type = instance_type or self.instance_type self.role_arn = role_arn or self.role_arn @@ -1254,6 +1267,7 @@ def _model_builder_optimize_wrapper( quantization_config=quantization_config, compilation_config=compilation_config, speculative_decoding_config=speculative_decoding_config, + sharding_config=sharding_config, env_vars=env_vars, vpc_config=vpc_config, kms_key=kms_key, @@ -1272,6 +1286,7 @@ def _model_builder_optimize_wrapper( quantization_config=quantization_config, compilation_config=compilation_config, speculative_decoding_config=speculative_decoding_config, + sharding_config=sharding_config, env_vars=env_vars, vpc_config=vpc_config, kms_key=kms_key, @@ -1287,6 +1302,9 @@ def _model_builder_optimize_wrapper( if not speculative_decoding_config: self.pysdk_model.remove_tag_with_key(Tag.SPECULATIVE_DRAFT_MODEL_PROVIDER) + if sharding_config: + self.pysdk_model._is_sharded_model = True + return self.pysdk_model def _optimize_for_hf( @@ -1297,6 +1315,7 @@ def _optimize_for_hf( quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None, speculative_decoding_config: Optional[Dict] = None, + sharding_config: 
Optional[Dict] = None, env_vars: Optional[Dict] = None, vpc_config: Optional[Dict] = None, kms_key: Optional[str] = None, @@ -1312,6 +1331,8 @@ def _optimize_for_hf( compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``. speculative_decoding_config (Optional[Dict]): Speculative decoding configuration. Defaults to ``None`` + sharding_config (Optional[Dict]): Model sharding configuration. + Defaults to ``None`` env_vars (Optional[Dict]): Additional environment variables to run the optimization container. Defaults to ``None``. vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``. @@ -1327,7 +1348,7 @@ def _optimize_for_hf( self.pysdk_model, speculative_decoding_config, False ) - if quantization_config or compilation_config: + if quantization_config or compilation_config or sharding_config: create_optimization_job_args = { "OptimizationJobName": job_name, "DeploymentInstanceType": self.instance_type, diff --git a/src/sagemaker/serve/utils/optimize_utils.py b/src/sagemaker/serve/utils/optimize_utils.py index 5781c0bade..cacf647bcd 100644 --- a/src/sagemaker/serve/utils/optimize_utils.py +++ b/src/sagemaker/serve/utils/optimize_utils.py @@ -259,13 +259,15 @@ def _is_s3_uri(s3_uri: Optional[str]) -> bool: def _extract_optimization_config_and_env( - quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None + quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None, + sharding_config: Optional[Dict] = None ) -> Optional[Tuple[Optional[Dict], Optional[Dict]]]: """Extracts optimization config and environment variables. Args: quantization_config (Optional[Dict]): The quantization config. compilation_config (Optional[Dict]): The compilation config. + sharding_config (Optional[Dict]): The sharding config. 
Returns: Optional[Tuple[Optional[Dict], Optional[Dict]]]: @@ -279,6 +281,10 @@ def _extract_optimization_config_and_env( return {"ModelCompilationConfig": compilation_config}, compilation_config.get( "OverrideEnvironment" ) + if sharding_config: + return {"ModelShardingConfig": sharding_config}, sharding_config.get( + "OverrideEnvironment" + ) return None, None diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index b50aa17c34..ddbab8e54d 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -2667,6 +2667,40 @@ def test_optimize_exclusive_args(self, mock_get_serve_setting): ), ) + @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) + def test_optimize_exclusive_sharding(self, mock_get_serve_setting): + mock_sagemaker_session = Mock() + model_builder = ModelBuilder( + model="meta-textgeneration-llama-3-70b", + sagemaker_session=mock_sagemaker_session, + ) + + self.assertRaisesRegex( + ValueError, + "Sharding config is mutually exclusive and cannot be combined with any other optimization.", + lambda: model_builder.optimize( + quantization_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + compilation_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + ), + ) + + @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) + def test_optimize_exclusive_sharding_args(self, mock_get_serve_setting): + mock_sagemaker_session = Mock() + model_builder = ModelBuilder( + model="meta-textgeneration-llama-3-70b", + sagemaker_session=mock_sagemaker_session, + ) + + self.assertRaisesRegex( + ValueError, + "OPTION_TENSOR_PARALLEL_DEGREE is required environment variable with Sharding config.", + lambda: model_builder.optimize( + sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + ), + ) + 
@patch.object(ModelBuilder, "_prepare_for_mode") @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) def test_optimize_for_hf_with_custom_s3_path( diff --git a/tests/unit/sagemaker/serve/utils/test_optimize_utils.py b/tests/unit/sagemaker/serve/utils/test_optimize_utils.py index a8dc6d74f4..32fd93ddaa 100644 --- a/tests/unit/sagemaker/serve/utils/test_optimize_utils.py +++ b/tests/unit/sagemaker/serve/utils/test_optimize_utils.py @@ -261,7 +261,7 @@ def test_is_s3_uri(s3_uri, expected): @pytest.mark.parametrize( - "quantization_config, compilation_config, expected_config, expected_env", + "quantization_config, compilation_config, sharding_config, expected_config, expected_env", [ ( None, @@ -270,6 +270,7 @@ def test_is_s3_uri(s3_uri, expected): "OPTION_TENSOR_PARALLEL_DEGREE": "2", } }, + None, { "ModelCompilationConfig": { "OverrideEnvironment": { @@ -288,6 +289,7 @@ def test_is_s3_uri(s3_uri, expected): } }, None, + None, { "ModelQuantizationConfig": { "OverrideEnvironment": { @@ -299,7 +301,26 @@ def test_is_s3_uri(s3_uri, expected): "OPTION_TENSOR_PARALLEL_DEGREE": "2", }, ), - (None, None, None, None), + ( + None, + None, + { + "OverrideEnvironment": { + "OPTION_TENSOR_PARALLEL_DEGREE": "2", + } + }, + { + "ModelShardingConfig": { + "OverrideEnvironment": { + "OPTION_TENSOR_PARALLEL_DEGREE": "2", + } + }, + }, + { + "OPTION_TENSOR_PARALLEL_DEGREE": "2", + }, + ), + (None, None, None, None, None), ], ) def test_extract_optimization_config_and_env( From 5833143787cd35215d934dc7ec0d5725e3de1dff Mon Sep 17 00:00:00 2001 From: Ashish Gupta Date: Tue, 12 Nov 2024 13:39:51 -0800 Subject: [PATCH 03/18] remove UTs for now --- .../serve/builder/test_model_builder.py | 34 ------------------- .../serve/utils/test_optimize_utils.py | 25 ++------------ 2 files changed, 2 insertions(+), 57 deletions(-) diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index ddbab8e54d..b50aa17c34 
100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -2667,40 +2667,6 @@ def test_optimize_exclusive_args(self, mock_get_serve_setting): ), ) - @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) - def test_optimize_exclusive_sharding(self, mock_get_serve_setting): - mock_sagemaker_session = Mock() - model_builder = ModelBuilder( - model="meta-textgeneration-llama-3-70b", - sagemaker_session=mock_sagemaker_session, - ) - - self.assertRaisesRegex( - ValueError, - "Sharding config is mutually exclusive and cannot be combined with any other optimization.", - lambda: model_builder.optimize( - quantization_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, - compilation_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, - sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, - ), - ) - - @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) - def test_optimize_exclusive_sharding_args(self, mock_get_serve_setting): - mock_sagemaker_session = Mock() - model_builder = ModelBuilder( - model="meta-textgeneration-llama-3-70b", - sagemaker_session=mock_sagemaker_session, - ) - - self.assertRaisesRegex( - ValueError, - "OPTION_TENSOR_PARALLEL_DEGREE is required environment variable with Sharding config.", - lambda: model_builder.optimize( - sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, - ), - ) - @patch.object(ModelBuilder, "_prepare_for_mode") @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) def test_optimize_for_hf_with_custom_s3_path( diff --git a/tests/unit/sagemaker/serve/utils/test_optimize_utils.py b/tests/unit/sagemaker/serve/utils/test_optimize_utils.py index 32fd93ddaa..a8dc6d74f4 100644 --- a/tests/unit/sagemaker/serve/utils/test_optimize_utils.py +++ b/tests/unit/sagemaker/serve/utils/test_optimize_utils.py @@ -261,7 +261,7 @@ def test_is_s3_uri(s3_uri, expected): @pytest.mark.parametrize( 
- "quantization_config, compilation_config, sharding_config, expected_config, expected_env", + "quantization_config, compilation_config, expected_config, expected_env", [ ( None, @@ -270,7 +270,6 @@ def test_is_s3_uri(s3_uri, expected): "OPTION_TENSOR_PARALLEL_DEGREE": "2", } }, - None, { "ModelCompilationConfig": { "OverrideEnvironment": { @@ -289,7 +288,6 @@ def test_is_s3_uri(s3_uri, expected): } }, None, - None, { "ModelQuantizationConfig": { "OverrideEnvironment": { @@ -301,26 +299,7 @@ def test_is_s3_uri(s3_uri, expected): "OPTION_TENSOR_PARALLEL_DEGREE": "2", }, ), - ( - None, - None, - { - "OverrideEnvironment": { - "OPTION_TENSOR_PARALLEL_DEGREE": "2", - } - }, - { - "ModelShardingConfig": { - "OverrideEnvironment": { - "OPTION_TENSOR_PARALLEL_DEGREE": "2", - } - }, - }, - { - "OPTION_TENSOR_PARALLEL_DEGREE": "2", - }, - ), - (None, None, None, None, None), + (None, None, None, None), ], ) def test_extract_optimization_config_and_env( From e40fad72011bda10cb334658e31a083f9e095a51 Mon Sep 17 00:00:00 2001 From: Ashish Gupta Date: Wed, 13 Nov 2024 11:33:56 -0800 Subject: [PATCH 04/18] add unit tests --- .../serve/builder/test_js_builder.py | 51 +++++++++++++++++++ .../serve/builder/test_model_builder.py | 33 ++++++++++++ .../serve/utils/test_optimize_utils.py | 29 +++++++++-- 3 files changed, 109 insertions(+), 4 deletions(-) diff --git a/tests/unit/sagemaker/serve/builder/test_js_builder.py b/tests/unit/sagemaker/serve/builder/test_js_builder.py index 248955c273..16c8c5c390 100644 --- a/tests/unit/sagemaker/serve/builder/test_js_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_js_builder.py @@ -1198,6 +1198,57 @@ def test_optimize_quantize_for_jumpstart( self.assertIsNotNone(out_put) + @patch("sagemaker.serve.builder.jumpstart_builder._capture_telemetry", side_effect=None) + @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) + def test_optimize_sharding_for_jumpstart( + self, + mock_serve_settings, + mock_telemetry, + ): + 
mock_sagemaker_session = Mock() + + mock_pysdk_model = Mock() + mock_pysdk_model.env = {"SAGEMAKER_ENV": "1"} + mock_pysdk_model.model_data = mock_model_data + mock_pysdk_model.image_uri = mock_tgi_image_uri + mock_pysdk_model.list_deployment_configs.return_value = DEPLOYMENT_CONFIGS + mock_pysdk_model.deployment_config = DEPLOYMENT_CONFIGS[0] + + sample_input = { + "inputs": "The diamondback terrapin or simply terrapin is a species " + "of turtle native to the brackish coastal tidal marshes of the", + "parameters": {"max_new_tokens": 1024}, + } + sample_output = [ + { + "generated_text": "The diamondback terrapin or simply terrapin is a " + "species of turtle native to the brackish coastal " + "tidal marshes of the east coast." + } + ] + + model_builder = ModelBuilder( + model="meta-textgeneration-llama-3-70b", + schema_builder=SchemaBuilder(sample_input, sample_output), + sagemaker_session=mock_sagemaker_session, + ) + + model_builder.pysdk_model = mock_pysdk_model + + out_put = model_builder._optimize_for_jumpstart( + accept_eula=True, + sharding_config={ + "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}, + }, + env_vars={ + "OPTION_TENSOR_PARALLEL_DEGREE": "1", + "OPTION_MAX_ROLLING_BATCH_SIZE": "2", + }, + output_path="s3://bucket/code/", + ) + + self.assertIsNotNone(out_put) + @patch("sagemaker.serve.builder.jumpstart_builder._capture_telemetry", side_effect=None) @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) @patch( diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index b50aa17c34..6e0f1bfe5e 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -2667,6 +2667,39 @@ def test_optimize_exclusive_args(self, mock_get_serve_setting): ), ) + @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) + def test_optimize_exclusive_sharding(self, mock_get_serve_setting): + 
mock_sagemaker_session = Mock() + model_builder = ModelBuilder( + model="meta-textgeneration-llama-3-70b", + sagemaker_session=mock_sagemaker_session, + ) + + self.assertRaisesRegex( + ValueError, + "Sharding config is mutually exclusive and cannot be combined with any other optimization.", + lambda: model_builder.optimize( + compilation_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + ), + ) + + @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) + def test_optimize_exclusive_sharding_args(self, mock_get_serve_setting): + mock_sagemaker_session = Mock() + model_builder = ModelBuilder( + model="meta-textgeneration-llama-3-70b", + sagemaker_session=mock_sagemaker_session, + ) + + self.assertRaisesRegex( + ValueError, + "OPTION_TENSOR_PARALLEL_DEGREE is required environment variable with Sharding config.", + lambda: model_builder.optimize( + sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + ), + ) + @patch.object(ModelBuilder, "_prepare_for_mode") @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) def test_optimize_for_hf_with_custom_s3_path( diff --git a/tests/unit/sagemaker/serve/utils/test_optimize_utils.py b/tests/unit/sagemaker/serve/utils/test_optimize_utils.py index a8dc6d74f4..38e4d0c6fe 100644 --- a/tests/unit/sagemaker/serve/utils/test_optimize_utils.py +++ b/tests/unit/sagemaker/serve/utils/test_optimize_utils.py @@ -261,7 +261,7 @@ def test_is_s3_uri(s3_uri, expected): @pytest.mark.parametrize( - "quantization_config, compilation_config, expected_config, expected_env", + "quantization_config, compilation_config, sharding_config, expected_config, expected_env", [ ( None, @@ -270,6 +270,7 @@ def test_is_s3_uri(s3_uri, expected): "OPTION_TENSOR_PARALLEL_DEGREE": "2", } }, + None, { "ModelCompilationConfig": { "OverrideEnvironment": { @@ -288,6 +289,7 @@ def test_is_s3_uri(s3_uri, expected): } }, None, + None, { 
"ModelQuantizationConfig": { "OverrideEnvironment": { @@ -299,13 +301,32 @@ def test_is_s3_uri(s3_uri, expected): "OPTION_TENSOR_PARALLEL_DEGREE": "2", }, ), - (None, None, None, None), + ( + None, + None, + { + "OverrideEnvironment": { + "OPTION_TENSOR_PARALLEL_DEGREE": "2", + } + }, + { + "ModelShardingConfig": { + "OverrideEnvironment": { + "OPTION_TENSOR_PARALLEL_DEGREE": "2", + } + }, + }, + { + "OPTION_TENSOR_PARALLEL_DEGREE": "2", + }, + ), + (None, None, None, None, None), ], ) def test_extract_optimization_config_and_env( - quantization_config, compilation_config, expected_config, expected_env + quantization_config, compilation_config, sharding_config, expected_config, expected_env ): - assert _extract_optimization_config_and_env(quantization_config, compilation_config) == ( + assert _extract_optimization_config_and_env(quantization_config, compilation_config, sharding_config) == ( expected_config, expected_env, ) From 63cfdf445c288018fdd0574c7d033a23994f3b87 Mon Sep 17 00:00:00 2001 From: Ashish Gupta Date: Wed, 6 Nov 2024 14:11:12 -0800 Subject: [PATCH 05/18] changes for blackbird - model sharding --- src/sagemaker/model.py | 6 ++++ .../serve/builder/jumpstart_builder.py | 7 ++-- src/sagemaker/serve/builder/model_builder.py | 23 ++++++++++++- src/sagemaker/serve/utils/optimize_utils.py | 8 ++++- .../serve/builder/test_model_builder.py | 34 +++++++++++++++++++ .../serve/utils/test_optimize_utils.py | 25 ++++++++++++-- 6 files changed, 97 insertions(+), 6 deletions(-) diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index 340d35b250..d3ff3cc934 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -372,6 +372,7 @@ def __init__( self.endpoint_name = None self.inference_component_name = None self._is_compiled_model = False + self._is_sharded_model = False self._compilation_job_name = None self._is_edge_packaged_model = False self.inference_recommender_job_results = None @@ -1599,6 +1600,11 @@ def deploy( if self._base_name is not 
None: self._base_name = "-".join((self._base_name, compiled_model_suffix)) + if self._is_sharded_model and endpoint_type != EndpointType.INFERENCE_COMPONENT_BASED: + logging.warning("Forcing INFERENCE_COMPONENT_BASED endpoint for sharded model. ADVISORY - " + "Use INFERENCE_COMPONENT_BASED endpoints over MODEL_BASED endpoints.") + endpoint_type = EndpointType.INFERENCE_COMPONENT_BASED + # Support multiple models on same endpoint if endpoint_type == EndpointType.INFERENCE_COMPONENT_BASED: if endpoint_name: diff --git a/src/sagemaker/serve/builder/jumpstart_builder.py b/src/sagemaker/serve/builder/jumpstart_builder.py index cfb43b813a..c058337470 100644 --- a/src/sagemaker/serve/builder/jumpstart_builder.py +++ b/src/sagemaker/serve/builder/jumpstart_builder.py @@ -681,6 +681,7 @@ def _optimize_for_jumpstart( quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None, speculative_decoding_config: Optional[Dict] = None, + sharding_config: Optional[Dict] = None, env_vars: Optional[Dict] = None, vpc_config: Optional[Dict] = None, kms_key: Optional[str] = None, @@ -702,6 +703,8 @@ def _optimize_for_jumpstart( compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``. speculative_decoding_config (Optional[Dict]): Speculative decoding configuration. Defaults to ``None`` + sharding_config (Optional[Dict]): Model sharding configuration. + Defaults to ``None`` env_vars (Optional[Dict]): Additional environment variables to run the optimization container. Defaults to ``None``. vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``. 
@@ -727,7 +730,7 @@ def _optimize_for_jumpstart( pysdk_model_env_vars = self._get_neuron_model_env_vars(instance_type) optimization_config, override_env = _extract_optimization_config_and_env( - quantization_config, compilation_config + quantization_config, compilation_config, sharding_config ) if not optimization_config and is_compilation: override_env = override_env or pysdk_model_env_vars @@ -792,7 +795,7 @@ def _optimize_for_jumpstart( optimization_env_vars = _update_environment_variables(optimization_env_vars, override_env) if optimization_env_vars: self.pysdk_model.env.update(optimization_env_vars) - if quantization_config or is_compilation: + if quantization_config or sharding_config or is_compilation: return create_optimization_job_args return None diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py index d1f1ab6ba2..1ae65536ee 100644 --- a/src/sagemaker/serve/builder/model_builder.py +++ b/src/sagemaker/serve/builder/model_builder.py @@ -1119,6 +1119,7 @@ def optimize( quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None, speculative_decoding_config: Optional[Dict] = None, + sharding_config: Optional[Dict] = None, env_vars: Optional[Dict] = None, vpc_config: Optional[Dict] = None, kms_key: Optional[str] = None, @@ -1142,6 +1143,8 @@ def optimize( compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``. speculative_decoding_config (Optional[Dict]): Speculative decoding configuration. Defaults to ``None`` + sharding_config (Optional[Dict]): Model sharding configuration. + Defaults to ``None`` env_vars (Optional[Dict]): Additional environment variables to run the optimization container. Defaults to ``None``. vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``. 
@@ -1170,6 +1173,7 @@ def optimize( quantization_config=quantization_config, compilation_config=compilation_config, speculative_decoding_config=speculative_decoding_config, + sharding_config=sharding_config, env_vars=env_vars, vpc_config=vpc_config, kms_key=kms_key, @@ -1189,6 +1193,7 @@ def _model_builder_optimize_wrapper( quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None, speculative_decoding_config: Optional[Dict] = None, + sharding_config: Optional[Dict] = None, env_vars: Optional[Dict] = None, vpc_config: Optional[Dict] = None, kms_key: Optional[str] = None, @@ -1212,6 +1217,8 @@ def _model_builder_optimize_wrapper( compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``. speculative_decoding_config (Optional[Dict]): Speculative decoding configuration. Defaults to ``None`` + sharding_config (Optional[Dict]): Model sharding configuration. + Defaults to ``None`` env_vars (Optional[Dict]): Additional environment variables to run the optimization container. Defaults to ``None``. vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``. 
@@ -1238,6 +1245,12 @@ def _model_builder_optimize_wrapper( if quantization_config and compilation_config: raise ValueError("Quantization config and compilation config are mutually exclusive.") + if sharding_config and (quantization_config or compilation_config or speculative_decoding_config): + raise ValueError("Sharding config is mutually exclusive and cannot be combined with any other optimization.") + + if sharding_config and ((env_vars and "OPTION_TENSOR_PARALLEL_DEGREE" not in env_vars) or (sharding_config.get("OverrideEnvironment") and "OPTION_TENSOR_PARALLEL_DEGREE" not in sharding_config["OverrideEnvironment"])): + raise ValueError("OPTION_TENSOR_PARALLEL_DEGREE is required environment variable with Sharding config.") + self.sagemaker_session = sagemaker_session or self.sagemaker_session or Session() self.instance_type = instance_type or self.instance_type self.role_arn = role_arn or self.role_arn @@ -1254,6 +1267,7 @@ def _model_builder_optimize_wrapper( quantization_config=quantization_config, compilation_config=compilation_config, speculative_decoding_config=speculative_decoding_config, + sharding_config=sharding_config, env_vars=env_vars, vpc_config=vpc_config, kms_key=kms_key, @@ -1272,6 +1286,7 @@ def _model_builder_optimize_wrapper( quantization_config=quantization_config, compilation_config=compilation_config, speculative_decoding_config=speculative_decoding_config, + sharding_config=sharding_config, env_vars=env_vars, vpc_config=vpc_config, kms_key=kms_key, @@ -1287,6 +1302,9 @@ def _model_builder_optimize_wrapper( if not speculative_decoding_config: self.pysdk_model.remove_tag_with_key(Tag.SPECULATIVE_DRAFT_MODEL_PROVIDER) + if sharding_config: + self.pysdk_model._is_sharded_model = True + return self.pysdk_model def _optimize_for_hf( @@ -1297,6 +1315,7 @@ def _optimize_for_hf( quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None, speculative_decoding_config: Optional[Dict] = None, + sharding_config: 
Optional[Dict] = None, env_vars: Optional[Dict] = None, vpc_config: Optional[Dict] = None, kms_key: Optional[str] = None, @@ -1312,6 +1331,8 @@ def _optimize_for_hf( compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``. speculative_decoding_config (Optional[Dict]): Speculative decoding configuration. Defaults to ``None`` + sharding_config (Optional[Dict]): Model sharding configuration. + Defaults to ``None`` env_vars (Optional[Dict]): Additional environment variables to run the optimization container. Defaults to ``None``. vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``. @@ -1327,7 +1348,7 @@ def _optimize_for_hf( self.pysdk_model, speculative_decoding_config, False ) - if quantization_config or compilation_config: + if quantization_config or compilation_config or sharding_config: create_optimization_job_args = { "OptimizationJobName": job_name, "DeploymentInstanceType": self.instance_type, diff --git a/src/sagemaker/serve/utils/optimize_utils.py b/src/sagemaker/serve/utils/optimize_utils.py index 5781c0bade..cacf647bcd 100644 --- a/src/sagemaker/serve/utils/optimize_utils.py +++ b/src/sagemaker/serve/utils/optimize_utils.py @@ -259,13 +259,15 @@ def _is_s3_uri(s3_uri: Optional[str]) -> bool: def _extract_optimization_config_and_env( - quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None + quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None, + sharding_config: Optional[Dict] = None ) -> Optional[Tuple[Optional[Dict], Optional[Dict]]]: """Extracts optimization config and environment variables. Args: quantization_config (Optional[Dict]): The quantization config. compilation_config (Optional[Dict]): The compilation config. + sharding_config (Optional[Dict]): The sharding config. 
Returns: Optional[Tuple[Optional[Dict], Optional[Dict]]]: @@ -279,6 +281,10 @@ def _extract_optimization_config_and_env( return {"ModelCompilationConfig": compilation_config}, compilation_config.get( "OverrideEnvironment" ) + if sharding_config: + return {"ModelShardingConfig": sharding_config}, sharding_config.get( + "OverrideEnvironment" + ) return None, None diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index b50aa17c34..ddbab8e54d 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -2667,6 +2667,40 @@ def test_optimize_exclusive_args(self, mock_get_serve_setting): ), ) + @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) + def test_optimize_exclusive_sharding(self, mock_get_serve_setting): + mock_sagemaker_session = Mock() + model_builder = ModelBuilder( + model="meta-textgeneration-llama-3-70b", + sagemaker_session=mock_sagemaker_session, + ) + + self.assertRaisesRegex( + ValueError, + "Sharding config is mutually exclusive and cannot be combined with any other optimization.", + lambda: model_builder.optimize( + quantization_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + compilation_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + ), + ) + + @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) + def test_optimize_exclusive_sharding_args(self, mock_get_serve_setting): + mock_sagemaker_session = Mock() + model_builder = ModelBuilder( + model="meta-textgeneration-llama-3-70b", + sagemaker_session=mock_sagemaker_session, + ) + + self.assertRaisesRegex( + ValueError, + "OPTION_TENSOR_PARALLEL_DEGREE is required environment variable with Sharding config.", + lambda: model_builder.optimize( + sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + ), + ) + 
@patch.object(ModelBuilder, "_prepare_for_mode") @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) def test_optimize_for_hf_with_custom_s3_path( diff --git a/tests/unit/sagemaker/serve/utils/test_optimize_utils.py b/tests/unit/sagemaker/serve/utils/test_optimize_utils.py index a8dc6d74f4..32fd93ddaa 100644 --- a/tests/unit/sagemaker/serve/utils/test_optimize_utils.py +++ b/tests/unit/sagemaker/serve/utils/test_optimize_utils.py @@ -261,7 +261,7 @@ def test_is_s3_uri(s3_uri, expected): @pytest.mark.parametrize( - "quantization_config, compilation_config, expected_config, expected_env", + "quantization_config, compilation_config, sharding_config, expected_config, expected_env", [ ( None, @@ -270,6 +270,7 @@ def test_is_s3_uri(s3_uri, expected): "OPTION_TENSOR_PARALLEL_DEGREE": "2", } }, + None, { "ModelCompilationConfig": { "OverrideEnvironment": { @@ -288,6 +289,7 @@ def test_is_s3_uri(s3_uri, expected): } }, None, + None, { "ModelQuantizationConfig": { "OverrideEnvironment": { @@ -299,7 +301,26 @@ def test_is_s3_uri(s3_uri, expected): "OPTION_TENSOR_PARALLEL_DEGREE": "2", }, ), - (None, None, None, None), + ( + None, + None, + { + "OverrideEnvironment": { + "OPTION_TENSOR_PARALLEL_DEGREE": "2", + } + }, + { + "ModelShardingConfig": { + "OverrideEnvironment": { + "OPTION_TENSOR_PARALLEL_DEGREE": "2", + } + }, + }, + { + "OPTION_TENSOR_PARALLEL_DEGREE": "2", + }, + ), + (None, None, None, None, None), ], ) def test_extract_optimization_config_and_env( From 65f4cc3a4869ffc272740bc6e3e62e205cd59d3b Mon Sep 17 00:00:00 2001 From: Ashish Gupta Date: Wed, 13 Nov 2024 11:45:29 -0800 Subject: [PATCH 06/18] add more tests --- tests/unit/sagemaker/model/test_model.py | 48 ++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/unit/sagemaker/model/test_model.py b/tests/unit/sagemaker/model/test_model.py index e43ad0ed0a..92bb6e05b9 100644 --- a/tests/unit/sagemaker/model/test_model.py +++ b/tests/unit/sagemaker/model/test_model.py @@ -958,6 
+958,54 @@ def test_all_framework_models_inference_component_based_endpoint_deploy_path( sagemaker_session.endpoint_in_service_or_not.reset_mock() sagemaker_session.create_model.reset_mock() +@patch("sagemaker.utils.repack_model") +@patch("sagemaker.fw_utils.tar_and_upload_dir") +def test_sharded_model_force_inference_component_based_endpoint_deploy_path( + repack_model, tar_and_uload_dir, sagemaker_session +): + framework_model_classes_to_kwargs = { + HuggingFaceModel: { + "pytorch_version": "1.7.1", + "py_version": "py36", + "transformers_version": "4.6.1" + }, + } + + sagemaker_session.settings = SessionSettings(include_jumpstart_tags=False) + + source_dir = "s3://blah/blah/blah" + for framework_model_class, kwargs in framework_model_classes_to_kwargs.items(): + test_sharded_model = framework_model_class( + entry_point=ENTRY_POINT_INFERENCE, + role=ROLE, + sagemaker_session=sagemaker_session, + model_data=source_dir, + **kwargs, + ) + test_sharded_model._is_sharded_model = True + test_sharded_model.deploy( + instance_type="ml.m2.xlarge", + initial_instance_count=INSTANCE_COUNT, + endpoint_type=EndpointType.MODEL_BASED, + resources=ResourceRequirements( + requests={ + "num_accelerators": 1, + "memory": 8192, + "copies": 1, + }, + limits={}, + ), + ) + + # Verified inference component based endpoint and inference component creation + # path + sagemaker_session.endpoint_in_service_or_not.assert_called_once() + sagemaker_session.create_model.assert_called_once() + sagemaker_session.create_inference_component.assert_called_once() + + sagemaker_session.create_inference_component.reset_mock() + sagemaker_session.endpoint_in_service_or_not.reset_mock() + sagemaker_session.create_model.reset_mock() @patch("sagemaker.utils.repack_model") def test_repack_code_location_with_key_prefix(repack_model, sagemaker_session): From 741d0a6e428c4bebe20a962441560d49eb6827d3 Mon Sep 17 00:00:00 2001 From: Ashish Gupta Date: Wed, 13 Nov 2024 12:54:28 -0800 Subject: [PATCH 07/18] fix 
sharded model flag --- src/sagemaker/serve/builder/model_builder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py index 1ae65536ee..71469eb44c 100644 --- a/src/sagemaker/serve/builder/model_builder.py +++ b/src/sagemaker/serve/builder/model_builder.py @@ -1293,6 +1293,9 @@ def _model_builder_optimize_wrapper( max_runtime_in_sec=max_runtime_in_sec, ) + if sharding_config: + self.pysdk_model._is_sharded_model = True + if input_args: self.sagemaker_session.sagemaker_client.create_optimization_job(**input_args) job_status = self.sagemaker_session.wait_for_optimization_job(job_name) @@ -1302,9 +1305,6 @@ def _model_builder_optimize_wrapper( if not speculative_decoding_config: self.pysdk_model.remove_tag_with_key(Tag.SPECULATIVE_DRAFT_MODEL_PROVIDER) - if sharding_config: - self.pysdk_model._is_sharded_model = True - return self.pysdk_model def _optimize_for_hf( From a3cb44488efd3c15b535fee5ba8b85debfd30ded Mon Sep 17 00:00:00 2001 From: Haotian An <33510317+Captainia@users.noreply.github.com> Date: Wed, 13 Nov 2024 16:14:52 -0500 Subject: [PATCH 08/18] Revert "change: add TGI 2.4.0 image uri (#4922)" (#4926) --- CONTRIBUTING.md | 2 +- .../image_uri_config/huggingface-llm.json | 47 ------------------- .../image_uris/test_huggingface_llm.py | 1 - 3 files changed, 1 insertion(+), 49 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b2bcf44cd1..24226af4ee 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -76,7 +76,7 @@ Before sending us a pull request, please ensure that: 1. Install tox using `pip install tox` 1. Install coverage using `pip install .[test]` 1. cd into the sagemaker-python-sdk folder: `cd sagemaker-python-sdk` or `cd /environment/sagemaker-python-sdk` -1. Run the following tox command and verify that all code checks and unit tests pass: `tox -- tests/unit` +1. 
Run the following tox command and verify that all code checks and unit tests pass: `tox tests/unit` 1. You can also run a single test with the following command: `tox -e py310 -- -s -vv ::` 1. You can run coverage via runcvoerage env : `tox -e runcoverage -- tests/unit` or `tox -e py310 -- tests/unit --cov=sagemaker --cov-append --cov-report xml` * Note that the coverage test will fail if you only run a single test, so make sure to surround the command with `export IGNORE_COVERAGE=-` and `unset IGNORE_COVERAGE` diff --git a/src/sagemaker/image_uri_config/huggingface-llm.json b/src/sagemaker/image_uri_config/huggingface-llm.json index 42f160eff1..24cbd5ca96 100644 --- a/src/sagemaker/image_uri_config/huggingface-llm.json +++ b/src/sagemaker/image_uri_config/huggingface-llm.json @@ -766,53 +766,6 @@ "container_version": { "gpu": "cu124-ubuntu22.04" } - }, - "2.4.0": { - "py_versions": [ - "py311" - ], - "registries": { - "af-south-1": "626614931356", - "il-central-1": "780543022126", - "ap-east-1": "871362719292", - "ap-northeast-1": "763104351884", - "ap-northeast-2": "763104351884", - "ap-northeast-3": "364406365360", - "ap-south-1": "763104351884", - "ap-south-2": "772153158452", - "ap-southeast-1": "763104351884", - "ap-southeast-2": "763104351884", - "ap-southeast-3": "907027046896", - "ap-southeast-4": "457447274322", - "ca-central-1": "763104351884", - "cn-north-1": "727897471807", - "cn-northwest-1": "727897471807", - "eu-central-1": "763104351884", - "eu-central-2": "380420809688", - "eu-north-1": "763104351884", - "eu-west-1": "763104351884", - "eu-west-2": "763104351884", - "eu-west-3": "763104351884", - "eu-south-1": "692866216735", - "eu-south-2": "503227376785", - "me-south-1": "217643126080", - "me-central-1": "914824155844", - "sa-east-1": "763104351884", - "us-east-1": "763104351884", - "us-east-2": "763104351884", - "us-gov-east-1": "446045086412", - "us-gov-west-1": "442386744353", - "us-iso-east-1": "886529160074", - "us-isob-east-1": 
"094389454867", - "us-west-1": "763104351884", - "us-west-2": "763104351884", - "ca-west-1": "204538143572" - }, - "tag_prefix": "2.4.0-tgi2.4.0", - "repository": "huggingface-pytorch-tgi-inference", - "container_version": { - "gpu": "cu124-ubuntu22.04" - } } } } diff --git a/tests/unit/sagemaker/image_uris/test_huggingface_llm.py b/tests/unit/sagemaker/image_uris/test_huggingface_llm.py index d993979cfd..28525a390c 100644 --- a/tests/unit/sagemaker/image_uris/test_huggingface_llm.py +++ b/tests/unit/sagemaker/image_uris/test_huggingface_llm.py @@ -46,7 +46,6 @@ "2.0.2": "2.3.0-tgi2.0.2-gpu-py310-cu121-ubuntu22.04", "2.2.0": "2.3.0-tgi2.2.0-gpu-py310-cu121-ubuntu22.04-v2.0", "2.3.1": "2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04", - "2.4.0": "2.4.0-tgi2.4.0-gpu-py311-cu124-ubuntu22.04", }, "inf2": { "0.0.16": "1.13.1-optimum0.0.16-neuronx-py310-ubuntu22.04", From 37e26f20ab22edd3af47554378e534fe2fe4f75c Mon Sep 17 00:00:00 2001 From: Ashish Gupta Date: Wed, 6 Nov 2024 14:11:12 -0800 Subject: [PATCH 09/18] changes for blackbird - model sharding --- tests/unit/sagemaker/serve/builder/test_model_builder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index 6e0f1bfe5e..ddbab8e54d 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -2679,6 +2679,7 @@ def test_optimize_exclusive_sharding(self, mock_get_serve_setting): ValueError, "Sharding config is mutually exclusive and cannot be combined with any other optimization.", lambda: model_builder.optimize( + quantization_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, compilation_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, ), From bb4a718fae3b45eaeffdf219d3269808f8e2d16e Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Gary=20Wang=20=F0=9F=98=A4?= Date: Wed, 13 Nov 2024 01:58:29 +0000 Subject: [PATCH 10/18] add optimization validations --- src/sagemaker/serve/builder/model_builder.py | 10 ++ .../serve/validations/optimization.py | 165 ++++++++++++++++++ 2 files changed, 175 insertions(+) create mode 100644 src/sagemaker/serve/validations/optimization.py diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py index 71469eb44c..adff7f3d32 100644 --- a/src/sagemaker/serve/builder/model_builder.py +++ b/src/sagemaker/serve/builder/model_builder.py @@ -104,6 +104,7 @@ get_huggingface_model_metadata, download_huggingface_model_metadata, ) +from sagemaker.serve.validations.optimization import validate_optimization_configuration logger = logging.getLogger(__name__) @@ -1160,6 +1161,15 @@ def optimize( Model: A deployable ``Model`` object. """ + # TODO: ideally these dictionaries need to be sagemaker_core shapes + validate_optimization_configuration( + instance_type=instance_type, + quantization_config=quantization_config, + compilation_config=compilation_config, + sharding_config=sharding_config, + speculative_decoding_config=speculative_decoding_config, + ) + # need to get telemetry_opt_out info before telemetry decorator is called self.serve_settings = self._get_serve_setting() diff --git a/src/sagemaker/serve/validations/optimization.py b/src/sagemaker/serve/validations/optimization.py new file mode 100644 index 0000000000..d7ecbfaae5 --- /dev/null +++ b/src/sagemaker/serve/validations/optimization.py @@ -0,0 +1,165 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. 
This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +"""Holds the validation logic used for the .optimize() function""" +from typing import Any, Dict, Set +from enum import Enum +from pydantic import BaseModel +import textwrap +import logging + +logger = logging.getLogger(__name__) + + +class OptimizationContainer(Enum): + TRT = "trt" + VLLM = "vllm" + NEURON = "neuron" + + +class OptimizationCombination(BaseModel): + optimization_container: OptimizationContainer = None + compilation: bool + speculative_decoding: bool + sharding: bool + quantization_technique: Set[str | None] + + def validate_against(self, optimization_combination, rule_set: OptimizationContainer): + if not optimization_combination.compilation == self.compilation: + raise ValueError("model compilation is not supported") + if not optimization_combination.quantization_technique.issubset(self.quantization_technique): + raise ValueError("model quantization is not supported") + if not optimization_combination.speculative_decoding == self.speculative_decoding: + raise ValueError("speculative decoding is not supported") + if not optimization_combination.sharding == self.sharding: + raise ValueError("model sharding is not supported") + + if rule_set == OptimizationContainer == OptimizationContainer.TRT: + if optimization_combination.compilation and optimization_combination.speculative_decoding: + raise ValueError("model compilation and speculative decoding provided together ") + else: + if optimization_combination.compilation and optimization_combination.quantization_technique: + raise ValueError("model compilation and model quantization provided together is not supported") + + +TRT_CONFIGURATION = { + "supported_instance_families": {"p4d", "p4de", "p5", "g5", "g6"}, + "optimization_combination": OptimizationCombination( + 
optimization_container=OptimizationContainer.TRT, + compilation=True, + quantization_technique={"awq", "fp8", "smooth_quant"}, + speculative_decoding=False, + sharding=False, + ) +} +VLLM_CONFIGURATION = { + "supported_instance_families": {"p4d", "p4de", "p5", "g5", "g6"}, + "optimization_combination": OptimizationCombination( + optimization_container=OptimizationContainer.VLLM, + compilation=False, + quantization_technique={"awq", "fp8"}, + speculative_decoding=True, + sharding=True + ) +} +NEURON_CONFIGURATION = { + "supported_instance_families": {"inf2", "trn1", "trn1n"}, + "optimization_combination": OptimizationCombination( + optimization_container=OptimizationContainer.NEURON, + compilation=True, + quantization_technique=set(), + speculative_decoding=False, + sharding=False + ) +} + +VALIDATION_ERROR_MSG = ( + "The model cannot be optimized with the provided configurations on " + "{optimization_container} supported {instance_type} because {validation_error}." +) + + +def validate_optimization_configuration( + instance_type: str, + quantization_config: Dict[str, Any], + compilation_config: Dict[str, Any], + sharding_config: Dict[str, Any], + speculative_decoding_config: Dict[str, Any] +): + split_instance_type = instance_type.split(".") + instance_family = None + if len(split_instance_type) == 3: # invalid instance type will be caught below + instance_family = split_instance_type[1] + + if ( + not instance_family in TRT_CONFIGURATION["supported_instance_families"] and + not instance_family in VLLM_CONFIGURATION["supported_instance_families"] and + not instance_family in NEURON_CONFIGURATION["supported_instance_families"] + ): + invalid_instance_type_msg = f""" + The model cannot be optimized on {instance_type}. 
Please optimize on the following instance type families: + - For {OptimizationContainer.TRT} optimized container: {TRT_CONFIGURATION["supported_instance_families"]} + - For {OptimizationContainer.VLLM} optimized container: {VLLM_CONFIGURATION["supported_instance_families"]} + - For {OptimizationContainer.NEURON} optimized container: {NEURON_CONFIGURATION["supported_instance_families"]} + """ + raise ValueError(textwrap.dedent(invalid_instance_type_msg)) + + optimization_combination = OptimizationCombination( + compilation=not compilation_config, + speculative_decoding=not speculative_decoding_config, + sharding=not sharding_config, + quantization_technique={quantization_config.get("OPTION_QUANTIZE") if quantization_config else None} + ) + + if instance_type in NEURON_CONFIGURATION["supported_instance_families"]: + try: + ( + NEURON_CONFIGURATION["optimization_combination"] + .validate_against(optimization_combination, rule_set=OptimizationContainer.VLLM) + ) + except ValueError as neuron_compare_error: + raise ValueError( + VALIDATION_ERROR_MSG.format( + optimization_container=OptimizationContainer.NEURON.value, + instance_type=instance_type, + validation_error=neuron_compare_error + ) + ) + else: + try: + ( + TRT_CONFIGURATION["optimization_combination"] + .validate_against(optimization_combination, rule_set=OptimizationContainer.TRT) + ) + except ValueError as trt_compare_error: + try: + ( + VLLM_CONFIGURATION["optimization_combination"] + .validate_against(optimization_combination, rule_set=OptimizationContainer.VLLM) + ) + except ValueError as vllm_compare_error: + trt_error_msg = VALIDATION_ERROR_MSG.format( + optimization_container=OptimizationContainer.TRT.value, + instance_type=instance_type, + validation_error=trt_compare_error + ) + vllm_error_msg = VALIDATION_ERROR_MSG.format( + optimization_container=OptimizationContainer.VLLM.value, + instance_type=instance_type, + validation_error=vllm_compare_error + ) + joint_error_msg = f""" + The model cannot be 
optimized for the following reasons: + - {trt_error_msg} + - {vllm_error_msg} + """ + raise ValueError(textwrap.dedent(joint_error_msg)) From 3d043848a3f382227c71f05574a82b4f30134bde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gary=20Wang=20=F0=9F=98=A4?= Date: Fri, 15 Nov 2024 17:19:40 +0000 Subject: [PATCH 11/18] fix formatting and msging --- src/sagemaker/model.py | 6 +- src/sagemaker/serve/builder/model_builder.py | 48 ++++-- src/sagemaker/serve/utils/optimize_utils.py | 9 +- .../serve/validations/optimization.py | 142 ++++++++++-------- tests/unit/sagemaker/model/test_model.py | 4 +- .../serve/utils/test_optimize_utils.py | 4 +- 6 files changed, 129 insertions(+), 84 deletions(-) diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index d3ff3cc934..577261cda7 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -1601,8 +1601,10 @@ def deploy( self._base_name = "-".join((self._base_name, compiled_model_suffix)) if self._is_sharded_model and endpoint_type != EndpointType.INFERENCE_COMPONENT_BASED: - logging.warning("Forcing INFERENCE_COMPONENT_BASED endpoint for sharded model. ADVISORY - " - "Use INFERENCE_COMPONENT_BASED endpoints over MODEL_BASED endpoints.") + logging.warning( + "Forcing INFERENCE_COMPONENT_BASED endpoint for sharded model. ADVISORY - " + "Use INFERENCE_COMPONENT_BASED endpoints over MODEL_BASED endpoints." 
+ ) endpoint_type = EndpointType.INFERENCE_COMPONENT_BASED # Support multiple models on same endpoint diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py index adff7f3d32..2deac5d922 100644 --- a/src/sagemaker/serve/builder/model_builder.py +++ b/src/sagemaker/serve/builder/model_builder.py @@ -104,7 +104,7 @@ get_huggingface_model_metadata, download_huggingface_model_metadata, ) -from sagemaker.serve.validations.optimization import validate_optimization_configuration +from sagemaker.serve.validations.optimization import _validate_optimization_configuration logger = logging.getLogger(__name__) @@ -1161,15 +1161,6 @@ def optimize( Model: A deployable ``Model`` object. """ - # TODO: ideally these dictionaries need to be sagemaker_core shapes - validate_optimization_configuration( - instance_type=instance_type, - quantization_config=quantization_config, - compilation_config=compilation_config, - sharding_config=sharding_config, - speculative_decoding_config=speculative_decoding_config, - ) - # need to get telemetry_opt_out info before telemetry decorator is called self.serve_settings = self._get_serve_setting() @@ -1243,6 +1234,17 @@ def _model_builder_optimize_wrapper( Returns: Model: A deployable ``Model`` object. 
""" + + # TODO: ideally these dictionaries need to be sagemaker_core shapes + # TODO: for organization, abstract all validation behind this fn + _validate_optimization_configuration( + instance_type=instance_type, + quantization_config=quantization_config, + compilation_config=compilation_config, + sharding_config=sharding_config, + speculative_decoding_config=speculative_decoding_config, + ) + self.is_compiled = compilation_config is not None self.is_quantized = quantization_config is not None self.speculative_decoding_draft_model_source = _extract_speculative_draft_model_provider( @@ -1255,11 +1257,29 @@ def _model_builder_optimize_wrapper( if quantization_config and compilation_config: raise ValueError("Quantization config and compilation config are mutually exclusive.") - if sharding_config and (quantization_config or compilation_config or speculative_decoding_config): - raise ValueError("Sharding config is mutually exclusive and cannot be combined with any other optimization.") + if sharding_config and ( + quantization_config or compilation_config or speculative_decoding_config + ): + raise ValueError( + ( + "Sharding config is mutually exclusive " + "and cannot be combined with any other optimization." + ) + ) - if sharding_config and ((env_vars and "OPTION_TENSOR_PARALLEL_DEGREE" not in env_vars) or (sharding_config.get("OverrideEnvironment") and "OPTION_TENSOR_PARALLEL_DEGREE" not in sharding_config["OverrideEnvironment"])): - raise ValueError("OPTION_TENSOR_PARALLEL_DEGREE is required environment variable with Sharding config.") + if sharding_config and ( + (env_vars and "OPTION_TENSOR_PARALLEL_DEGREE" not in env_vars) + or ( + sharding_config.get("OverrideEnvironment") + and "OPTION_TENSOR_PARALLEL_DEGREE" not in sharding_config["OverrideEnvironment"] + ) + ): + raise ValueError( + ( + "OPTION_TENSOR_PARALLEL_DEGREE is required " + "environment variable with Sharding config." 
+ ) + ) self.sagemaker_session = sagemaker_session or self.sagemaker_session or Session() self.instance_type = instance_type or self.instance_type diff --git a/src/sagemaker/serve/utils/optimize_utils.py b/src/sagemaker/serve/utils/optimize_utils.py index cacf647bcd..e7a06a7716 100644 --- a/src/sagemaker/serve/utils/optimize_utils.py +++ b/src/sagemaker/serve/utils/optimize_utils.py @@ -259,8 +259,9 @@ def _is_s3_uri(s3_uri: Optional[str]) -> bool: def _extract_optimization_config_and_env( - quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None, - sharding_config: Optional[Dict] = None + quantization_config: Optional[Dict] = None, + compilation_config: Optional[Dict] = None, + sharding_config: Optional[Dict] = None, ) -> Optional[Tuple[Optional[Dict], Optional[Dict]]]: """Extracts optimization config and environment variables. @@ -282,9 +283,7 @@ def _extract_optimization_config_and_env( "OverrideEnvironment" ) if sharding_config: - return {"ModelShardingConfig": sharding_config}, sharding_config.get( - "OverrideEnvironment" - ) + return {"ModelShardingConfig": sharding_config}, sharding_config.get("OverrideEnvironment") return None, None diff --git a/src/sagemaker/serve/validations/optimization.py b/src/sagemaker/serve/validations/optimization.py index d7ecbfaae5..be64115d53 100644 --- a/src/sagemaker/serve/validations/optimization.py +++ b/src/sagemaker/serve/validations/optimization.py @@ -10,155 +10,175 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -"""Holds the validation logic used for the .optimize() function""" +"""Holds the validation logic used for the .optimize() function. 
INTERNAL only""" +from __future__ import absolute_import + +import textwrap +import logging from typing import Any, Dict, Set from enum import Enum from pydantic import BaseModel -import textwrap -import logging logger = logging.getLogger(__name__) -class OptimizationContainer(Enum): +class _OptimizationContainer(Enum): + """Optimization containers""" + TRT = "trt" VLLM = "vllm" NEURON = "neuron" -class OptimizationCombination(BaseModel): - optimization_container: OptimizationContainer = None +class _OptimizationCombination(BaseModel): + """Optimization ruleset data structure for comparing input to ruleset""" + + optimization_container: _OptimizationContainer = None compilation: bool speculative_decoding: bool sharding: bool quantization_technique: Set[str | None] - def validate_against(self, optimization_combination, rule_set: OptimizationContainer): + def validate_against(self, optimization_combination, rule_set: _OptimizationContainer): + """Validator for optimization containers""" + if not optimization_combination.compilation == self.compilation: - raise ValueError("model compilation is not supported") - if not optimization_combination.quantization_technique.issubset(self.quantization_technique): - raise ValueError("model quantization is not supported") + raise ValueError("Compilation") + if not optimization_combination.quantization_technique.issubset( + self.quantization_technique + ): + raise ValueError( + f"Quantization:{optimization_combination.quantization_technique.pop()}" + ) if not optimization_combination.speculative_decoding == self.speculative_decoding: - raise ValueError("speculative decoding is not supported") + raise ValueError("Speculative Decoding") if not optimization_combination.sharding == self.sharding: - raise ValueError("model sharding is not supported") + raise ValueError("Sharding") - if rule_set == OptimizationContainer == OptimizationContainer.TRT: - if optimization_combination.compilation and 
optimization_combination.speculative_decoding: - raise ValueError("model compilation and speculative decoding provided together ") + if rule_set == _OptimizationContainer == _OptimizationContainer.TRT: + if ( + optimization_combination.compilation + and optimization_combination.speculative_decoding + ): + raise ValueError("Compilation and Speculative Decoding") else: - if optimization_combination.compilation and optimization_combination.quantization_technique: - raise ValueError("model compilation and model quantization provided together is not supported") + if ( + optimization_combination.compilation + and optimization_combination.quantization_technique + ): + raise ValueError( + f"Compilation and Quantization:{optimization_combination.quantization_technique.pop()}" + ) TRT_CONFIGURATION = { "supported_instance_families": {"p4d", "p4de", "p5", "g5", "g6"}, - "optimization_combination": OptimizationCombination( - optimization_container=OptimizationContainer.TRT, + "optimization_combination": _OptimizationCombination( + optimization_container=_OptimizationContainer.TRT, compilation=True, quantization_technique={"awq", "fp8", "smooth_quant"}, speculative_decoding=False, sharding=False, - ) + ), } VLLM_CONFIGURATION = { "supported_instance_families": {"p4d", "p4de", "p5", "g5", "g6"}, - "optimization_combination": OptimizationCombination( - optimization_container=OptimizationContainer.VLLM, + "optimization_combination": _OptimizationCombination( + optimization_container=_OptimizationContainer.VLLM, compilation=False, quantization_technique={"awq", "fp8"}, speculative_decoding=True, - sharding=True - ) + sharding=True, + ), } NEURON_CONFIGURATION = { "supported_instance_families": {"inf2", "trn1", "trn1n"}, - "optimization_combination": OptimizationCombination( - optimization_container=OptimizationContainer.NEURON, + "optimization_combination": _OptimizationCombination( + optimization_container=_OptimizationContainer.NEURON, compilation=True, 
quantization_technique=set(), speculative_decoding=False, - sharding=False - ) + sharding=False, + ), } VALIDATION_ERROR_MSG = ( - "The model cannot be optimized with the provided configurations on " - "{optimization_container} supported {instance_type} because {validation_error}." + "Optimizations that use {optimization_technique} " + "are not currently supported on {instance_type} instances" ) -def validate_optimization_configuration( +def _validate_optimization_configuration( instance_type: str, quantization_config: Dict[str, Any], compilation_config: Dict[str, Any], sharding_config: Dict[str, Any], - speculative_decoding_config: Dict[str, Any] + speculative_decoding_config: Dict[str, Any], ): + """Validate .optimize() input off of standard ruleset""" + split_instance_type = instance_type.split(".") instance_family = None if len(split_instance_type) == 3: # invalid instance type will be caught below instance_family = split_instance_type[1] if ( - not instance_family in TRT_CONFIGURATION["supported_instance_families"] and - not instance_family in VLLM_CONFIGURATION["supported_instance_families"] and - not instance_family in NEURON_CONFIGURATION["supported_instance_families"] + instance_family not in TRT_CONFIGURATION["supported_instance_families"] + and instance_family not in VLLM_CONFIGURATION["supported_instance_families"] + and instance_family not in NEURON_CONFIGURATION["supported_instance_families"] ): - invalid_instance_type_msg = f""" - The model cannot be optimized on {instance_type}. 
Please optimize on the following instance type families: - - For {OptimizationContainer.TRT} optimized container: {TRT_CONFIGURATION["supported_instance_families"]} - - For {OptimizationContainer.VLLM} optimized container: {VLLM_CONFIGURATION["supported_instance_families"]} - - For {OptimizationContainer.NEURON} optimized container: {NEURON_CONFIGURATION["supported_instance_families"]} - """ - raise ValueError(textwrap.dedent(invalid_instance_type_msg)) - - optimization_combination = OptimizationCombination( + invalid_instance_type_msg = ( + f"Optimizations that use {instance_type} are not currently supported" + ) + raise ValueError(invalid_instance_type_msg) + + optimization_combination = _OptimizationCombination( compilation=not compilation_config, speculative_decoding=not speculative_decoding_config, sharding=not sharding_config, - quantization_technique={quantization_config.get("OPTION_QUANTIZE") if quantization_config else None} + quantization_technique={ + quantization_config.get("OPTION_QUANTIZE") if quantization_config else None + }, ) if instance_type in NEURON_CONFIGURATION["supported_instance_families"]: try: ( - NEURON_CONFIGURATION["optimization_combination"] - .validate_against(optimization_combination, rule_set=OptimizationContainer.VLLM) + NEURON_CONFIGURATION["optimization_combination"].validate_against( + optimization_combination, rule_set=_OptimizationContainer.VLLM + ) ) except ValueError as neuron_compare_error: raise ValueError( VALIDATION_ERROR_MSG.format( - optimization_container=OptimizationContainer.NEURON.value, - instance_type=instance_type, - validation_error=neuron_compare_error + optimization_container=str(neuron_compare_error), + instance_type="Neuron", ) ) else: try: ( - TRT_CONFIGURATION["optimization_combination"] - .validate_against(optimization_combination, rule_set=OptimizationContainer.TRT) + TRT_CONFIGURATION["optimization_combination"].validate_against( + optimization_combination, rule_set=_OptimizationContainer.TRT + ) ) 
except ValueError as trt_compare_error: try: ( - VLLM_CONFIGURATION["optimization_combination"] - .validate_against(optimization_combination, rule_set=OptimizationContainer.VLLM) + VLLM_CONFIGURATION["optimization_combination"].validate_against( + optimization_combination, rule_set=_OptimizationContainer.VLLM + ) ) except ValueError as vllm_compare_error: trt_error_msg = VALIDATION_ERROR_MSG.format( - optimization_container=OptimizationContainer.TRT.value, - instance_type=instance_type, - validation_error=trt_compare_error + optimization_container=str(trt_compare_error), instance_type="GPU" ) vllm_error_msg = VALIDATION_ERROR_MSG.format( - optimization_container=OptimizationContainer.VLLM.value, - instance_type=instance_type, - validation_error=vllm_compare_error + optimization_container=str(vllm_compare_error), + instance_type="GPU", ) joint_error_msg = f""" - The model cannot be optimized for the following reasons: + Optimization cannot be performed for the following reasons: - {trt_error_msg} - {vllm_error_msg} """ diff --git a/tests/unit/sagemaker/model/test_model.py b/tests/unit/sagemaker/model/test_model.py index 92bb6e05b9..316df7420d 100644 --- a/tests/unit/sagemaker/model/test_model.py +++ b/tests/unit/sagemaker/model/test_model.py @@ -958,6 +958,7 @@ def test_all_framework_models_inference_component_based_endpoint_deploy_path( sagemaker_session.endpoint_in_service_or_not.reset_mock() sagemaker_session.create_model.reset_mock() + @patch("sagemaker.utils.repack_model") @patch("sagemaker.fw_utils.tar_and_upload_dir") def test_sharded_model_force_inference_component_based_endpoint_deploy_path( @@ -967,7 +968,7 @@ def test_sharded_model_force_inference_component_based_endpoint_deploy_path( HuggingFaceModel: { "pytorch_version": "1.7.1", "py_version": "py36", - "transformers_version": "4.6.1" + "transformers_version": "4.6.1", }, } @@ -1007,6 +1008,7 @@ def test_sharded_model_force_inference_component_based_endpoint_deploy_path( 
sagemaker_session.endpoint_in_service_or_not.reset_mock() sagemaker_session.create_model.reset_mock() + @patch("sagemaker.utils.repack_model") def test_repack_code_location_with_key_prefix(repack_model, sagemaker_session): diff --git a/tests/unit/sagemaker/serve/utils/test_optimize_utils.py b/tests/unit/sagemaker/serve/utils/test_optimize_utils.py index 38e4d0c6fe..a056ef561c 100644 --- a/tests/unit/sagemaker/serve/utils/test_optimize_utils.py +++ b/tests/unit/sagemaker/serve/utils/test_optimize_utils.py @@ -326,7 +326,9 @@ def test_is_s3_uri(s3_uri, expected): def test_extract_optimization_config_and_env( quantization_config, compilation_config, sharding_config, expected_config, expected_env ): - assert _extract_optimization_config_and_env(quantization_config, compilation_config, sharding_config) == ( + assert _extract_optimization_config_and_env( + quantization_config, compilation_config, sharding_config + ) == ( expected_config, expected_env, ) From 22fdc37cde376628d9b82fd5d946aa9e5fab16a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gary=20Wang=20=F0=9F=98=A4?= Date: Fri, 15 Nov 2024 23:32:35 +0000 Subject: [PATCH 12/18] fixing validation bugs --- .../serve/validations/optimization.py | 119 ++++++++++++------ .../serve/builder/test_model_builder.py | 85 ++++++++++++- 2 files changed, 161 insertions(+), 43 deletions(-) diff --git a/src/sagemaker/serve/validations/optimization.py b/src/sagemaker/serve/validations/optimization.py index be64115d53..e9df93f2c1 100644 --- a/src/sagemaker/serve/validations/optimization.py +++ b/src/sagemaker/serve/validations/optimization.py @@ -25,85 +25,107 @@ class _OptimizationContainer(Enum): """Optimization containers""" - TRT = "trt" - VLLM = "vllm" - NEURON = "neuron" + TRT = "TRT" + VLLM = "vLLM" + NEURON = "Neuron" class _OptimizationCombination(BaseModel): """Optimization ruleset data structure for comparing input to ruleset""" optimization_container: _OptimizationContainer = None - compilation: bool - speculative_decoding: 
bool - sharding: bool + compilation: Set[bool | None] + speculative_decoding: Set[bool | None] + sharding: Set[bool | None] quantization_technique: Set[str | None] def validate_against(self, optimization_combination, rule_set: _OptimizationContainer): """Validator for optimization containers""" - if not optimization_combination.compilation == self.compilation: + # check the case where no optimization combination is provided + if ( + optimization_combination.compilation == {None} + and optimization_combination.quantization_technique == {None} + and optimization_combination.speculative_decoding == {None} + and optimization_combination.sharding == {None} + ): + raise ValueError("Optimizations are not currently supported without optimization configurations.") + + # check the validity of each individual field + if not optimization_combination.compilation.issubset(self.compilation): raise ValueError("Compilation") if not optimization_combination.quantization_technique.issubset( self.quantization_technique ): + copy_quantization_technique = optimization_combination.quantization_technique.copy() raise ValueError( - f"Quantization:{optimization_combination.quantization_technique.pop()}" + f"Quantization:{copy_quantization_technique.pop()}" ) - if not optimization_combination.speculative_decoding == self.speculative_decoding: + if not optimization_combination.speculative_decoding.issubset(self.speculative_decoding): raise ValueError("Speculative Decoding") - if not optimization_combination.sharding == self.sharding: + if not optimization_combination.sharding.issubset(self.sharding): raise ValueError("Sharding") - if rule_set == _OptimizationContainer == _OptimizationContainer.TRT: + # optimization technique combinations that need to be validated + if rule_set == _OptimizationContainer.TRT: if ( optimization_combination.compilation and optimization_combination.speculative_decoding ): - raise ValueError("Compilation and Speculative Decoding") + copy_compilation = 
optimization_combination.compilation.copy() + copy_speculative_decoding = optimization_combination.speculative_decoding.copy() + if ( + copy_compilation.pop() + and copy_speculative_decoding.pop() + ): # Check that the 2 techniques are not None + raise ValueError("Compilation and Speculative Decoding") else: + copy_compilation = optimization_combination.compilation.copy() + copy_quantization_technique = optimization_combination.quantization_technique.copy() if ( - optimization_combination.compilation - and optimization_combination.quantization_technique - ): + copy_compilation.pop() + and copy_quantization_technique.pop() + ): # Check that the 2 techniques are not None raise ValueError( f"Compilation and Quantization:{optimization_combination.quantization_technique.pop()}" ) +TRUTHY_SET = {None, True} +FALSY_SET = {None, False} TRT_CONFIGURATION = { "supported_instance_families": {"p4d", "p4de", "p5", "g5", "g6"}, "optimization_combination": _OptimizationCombination( optimization_container=_OptimizationContainer.TRT, - compilation=True, - quantization_technique={"awq", "fp8", "smooth_quant"}, - speculative_decoding=False, - sharding=False, + compilation=TRUTHY_SET, + quantization_technique={None, "awq", "fp8", "smooth_quant"}, + speculative_decoding=FALSY_SET, + sharding=FALSY_SET, ), } VLLM_CONFIGURATION = { "supported_instance_families": {"p4d", "p4de", "p5", "g5", "g6"}, "optimization_combination": _OptimizationCombination( optimization_container=_OptimizationContainer.VLLM, - compilation=False, - quantization_technique={"awq", "fp8"}, - speculative_decoding=True, - sharding=True, + compilation=FALSY_SET, + quantization_technique={None, "awq", "fp8"}, + speculative_decoding=TRUTHY_SET, + sharding=TRUTHY_SET, ), } NEURON_CONFIGURATION = { "supported_instance_families": {"inf2", "trn1", "trn1n"}, "optimization_combination": _OptimizationCombination( optimization_container=_OptimizationContainer.NEURON, - compilation=True, - quantization_technique=set(), - 
speculative_decoding=False, - sharding=False, + compilation=TRUTHY_SET, + quantization_technique={None}, + speculative_decoding=FALSY_SET, + sharding=FALSY_SET, ), } VALIDATION_ERROR_MSG = ( - "Optimizations that use {optimization_technique} " + "Optimizations for {optimization_container} that use {optimization_technique} " "are not currently supported on {instance_type} instances" ) @@ -117,10 +139,11 @@ def _validate_optimization_configuration( ): """Validate .optimize() input off of standard ruleset""" - split_instance_type = instance_type.split(".") instance_family = None - if len(split_instance_type) == 3: # invalid instance type will be caught below - instance_family = split_instance_type[1] + if instance_type: + split_instance_type = instance_type.split(".") + if len(split_instance_type) == 3: + instance_family = split_instance_type[1] if ( instance_family not in TRT_CONFIGURATION["supported_instance_families"] @@ -128,17 +151,29 @@ def _validate_optimization_configuration( and instance_family not in NEURON_CONFIGURATION["supported_instance_families"] ): invalid_instance_type_msg = ( - f"Optimizations that use {instance_type} are not currently supported" + f"Optimizations that uses {instance_type} instance type are not currently supported" ) raise ValueError(invalid_instance_type_msg) + quantization_technique = None + if ( + quantization_config + and quantization_config.get("OverrideEnvironment") + and quantization_config.get("OverrideEnvironment").get("OPTION_QUANTIZE") + ): + quantization_technique = quantization_config.get("OverrideEnvironment").get("OPTION_QUANTIZE") + optimization_combination = _OptimizationCombination( - compilation=not compilation_config, - speculative_decoding=not speculative_decoding_config, - sharding=not sharding_config, - quantization_technique={ - quantization_config.get("OPTION_QUANTIZE") if quantization_config else None + compilation={ + None if compilation_config is None else bool(compilation_config) + }, + 
speculative_decoding={ + None if speculative_decoding_config is None else bool(speculative_decoding_config) + }, + sharding={ + None if sharding_config is None else bool(sharding_config) }, + quantization_technique={quantization_technique}, ) if instance_type in NEURON_CONFIGURATION["supported_instance_families"]: @@ -151,7 +186,8 @@ def _validate_optimization_configuration( except ValueError as neuron_compare_error: raise ValueError( VALIDATION_ERROR_MSG.format( - optimization_container=str(neuron_compare_error), + optimization_container=_OptimizationContainer.NEURON.value, + optimization_technique=str(neuron_compare_error), instance_type="Neuron", ) ) @@ -171,10 +207,13 @@ def _validate_optimization_configuration( ) except ValueError as vllm_compare_error: trt_error_msg = VALIDATION_ERROR_MSG.format( - optimization_container=str(trt_compare_error), instance_type="GPU" + optimization_container=_OptimizationContainer.TRT.value, + optimization_technique=str(trt_compare_error), + instance_type="GPU" ) vllm_error_msg = VALIDATION_ERROR_MSG.format( - optimization_container=str(vllm_compare_error), + optimization_container=_OptimizationContainer.VLLM.value, + optimization_technique=str(vllm_compare_error), instance_type="GPU", ) joint_error_msg = f""" diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index ddbab8e54d..94b6f39d15 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -11,6 +11,8 @@ # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. 
from __future__ import absolute_import + +import textwrap from unittest.mock import MagicMock, patch, Mock, mock_open import unittest @@ -25,6 +27,7 @@ from sagemaker.serve.utils.exceptions import TaskNotFoundException from sagemaker.serve.utils.predictors import TensorflowServingLocalPredictor from sagemaker.serve.utils.types import ModelServer +from sagemaker.serve.validations.optimization import _validate_optimization_configuration from tests.unit.sagemaker.serve.constants import MOCK_IMAGE_CONFIG, MOCK_VPC_CONFIG schema_builder = MagicMock() @@ -2383,7 +2386,7 @@ def test_optimize( builder.pysdk_model = pysdk_model job_name = "my-optimization-job" - instance_type = "ml.inf1.xlarge" + instance_type = "ml.inf2.xlarge" output_path = "s3://my-bucket/output" quantization_config = { "Image": "quantization-image-uri", @@ -2425,7 +2428,7 @@ def test_optimize( mock_send_telemetry.assert_called_once() mock_sagemaker_session.sagemaker_client.create_optimization_job.assert_called_once_with( OptimizationJobName="my-optimization-job", - DeploymentInstanceType="ml.inf1.xlarge", + DeploymentInstanceType="ml.inf2.xlarge", RoleArn="arn:aws:iam::123456789012:role/SageMakerRole", OptimizationEnvironment={"Var1": "value", "Var2": "value"}, ModelSource={"S3": {"S3Uri": "s3://uri"}}, @@ -2646,6 +2649,7 @@ def test_optimize_local_mode(self, mock_get_serve_setting): ValueError, "Model optimization is only supported in Sagemaker Endpoint Mode.", lambda: model_builder.optimize( + instance_type="ml.g5.24xlarge", quantization_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}} ), ) @@ -2662,6 +2666,7 @@ def test_optimize_exclusive_args(self, mock_get_serve_setting): ValueError, "Quantization config and compilation config are mutually exclusive.", lambda: model_builder.optimize( + instance_type="ml.g5.24xlarge", quantization_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, compilation_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, ), @@ -2675,10 +2680,17 @@ 
def test_optimize_exclusive_sharding(self, mock_get_serve_setting): sagemaker_session=mock_sagemaker_session, ) + expected_error_message = """ + Optimization cannot be performed for the following reasons: + - Optimizations for TRT that use Sharding are not currently supported on GPU instances + - Optimizations for vLLM that use Compilation are not currently supported on GPU instances + """ + self.assertRaisesRegex( ValueError, - "Sharding config is mutually exclusive and cannot be combined with any other optimization.", + textwrap.dedent(expected_error_message), lambda: model_builder.optimize( + instance_type="ml.g5.24xlarge", quantization_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, compilation_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, @@ -2697,6 +2709,7 @@ def test_optimize_exclusive_sharding_args(self, mock_get_serve_setting): ValueError, "OPTION_TENSOR_PARALLEL_DEGREE is required environment variable with Sharding config.", lambda: model_builder.optimize( + instance_type="ml.g5.24xlarge", sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, ), ) @@ -2820,3 +2833,69 @@ def test_optimize_for_hf_without_custom_s3_path( "OutputConfig": {"S3OutputLocation": "s3://bucket/code/"}, }, ) + + +class TestModelBuilderOptimizeValidations(unittest.TestCase): + + def test_corner_cases_throw_errors(self): + self.assertRaisesRegex( + ValueError, + "Optimizations that uses None instance type are not currently supported", + lambda: _validate_optimization_configuration( + sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + instance_type=None, + quantization_config=None, + speculative_decoding_config=None, + compilation_config=None, + ), + ) + self.assertRaisesRegex( + ValueError, + "Optimizations are not currently supported without optimization configurations.", + lambda: _validate_optimization_configuration( + instance_type="ml.g5.24xlarge", + 
quantization_config=None, + speculative_decoding_config=None, + compilation_config=None, + sharding_config=None, + ), + ) + + def test_trt_and_vllm_configurations_throw_errors_for_rule_set(self): + expected_quantization_error_message = """ + Optimization cannot be performed for the following reasons: + - Optimizations for TRT that use Quantization:test are not currently supported on GPU instances + - Optimizations for vLLM that use Quantization:test are not currently supported on GPU instances + """ + self.assertRaisesRegex( + ValueError, + textwrap.dedent(expected_quantization_error_message), + lambda: _validate_optimization_configuration( + instance_type="ml.g5.24xlarge", + quantization_config={ + "OverrideEnvironment": {"OPTION_QUANTIZE": "test"}, + }, + sharding_config=None, + speculative_decoding_config=None, + compilation_config=None, + ), + ) + + @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) + def test_neuron_configurations_throw_errors_for_rule_set(self, mock_get_serve_setting): + pass + + def test_trt_configurations_rule_set(self): + _validate_optimization_configuration( + instance_type="ml.g5.24xlarge", + quantization_config={ + "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}, + }, + sharding_config=None, + speculative_decoding_config=None, + compilation_config=None, + ) + + @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) + def test_vllm_configurations_rule_set(self, mock_get_serve_setting): + pass From 57123c9ed163a369956174340dc76902a1e2f4cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gary=20Wang=20=F0=9F=98=A4?= Date: Sat, 16 Nov 2024 00:28:00 +0000 Subject: [PATCH 13/18] add UTs --- .../serve/validations/optimization.py | 33 +++--- .../serve/builder/test_model_builder.py | 105 +++++++++++++++--- 2 files changed, 107 insertions(+), 31 deletions(-) diff --git a/src/sagemaker/serve/validations/optimization.py b/src/sagemaker/serve/validations/optimization.py index e9df93f2c1..21a4acc0d3 100644 --- 
a/src/sagemaker/serve/validations/optimization.py +++ b/src/sagemaker/serve/validations/optimization.py @@ -41,6 +41,9 @@ class _OptimizationCombination(BaseModel): def validate_against(self, optimization_combination, rule_set: _OptimizationContainer): """Validator for optimization containers""" + print(optimization_combination) + print(rule_set) + print(optimization_combination.speculative_decoding.issubset(self.speculative_decoding)) # check the case where no optimization combination is provided if ( @@ -49,7 +52,7 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont and optimization_combination.speculative_decoding == {None} and optimization_combination.sharding == {None} ): - raise ValueError("Optimizations are not currently supported without optimization configurations.") + raise ValueError("no optimization configurations") # check the validity of each individual field if not optimization_combination.compilation.issubset(self.compilation): @@ -58,9 +61,7 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont self.quantization_technique ): copy_quantization_technique = optimization_combination.quantization_technique.copy() - raise ValueError( - f"Quantization:{copy_quantization_technique.pop()}" - ) + raise ValueError(f"Quantization:{copy_quantization_technique.pop()}") if not optimization_combination.speculative_decoding.issubset(self.speculative_decoding): raise ValueError("Speculative Decoding") if not optimization_combination.sharding.issubset(self.sharding): @@ -75,16 +76,14 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont copy_compilation = optimization_combination.compilation.copy() copy_speculative_decoding = optimization_combination.speculative_decoding.copy() if ( - copy_compilation.pop() - and copy_speculative_decoding.pop() + copy_compilation.pop() and copy_speculative_decoding.pop() ): # Check that the 2 techniques are not None raise 
ValueError("Compilation and Speculative Decoding") else: copy_compilation = optimization_combination.compilation.copy() copy_quantization_technique = optimization_combination.quantization_technique.copy() if ( - copy_compilation.pop() - and copy_quantization_technique.pop() + copy_compilation.pop() and copy_quantization_technique.pop() ): # Check that the 2 techniques are not None raise ValueError( f"Compilation and Quantization:{optimization_combination.quantization_technique.pop()}" @@ -161,26 +160,24 @@ def _validate_optimization_configuration( and quantization_config.get("OverrideEnvironment") and quantization_config.get("OverrideEnvironment").get("OPTION_QUANTIZE") ): - quantization_technique = quantization_config.get("OverrideEnvironment").get("OPTION_QUANTIZE") + quantization_technique = quantization_config.get("OverrideEnvironment").get( + "OPTION_QUANTIZE" + ) optimization_combination = _OptimizationCombination( - compilation={ - None if compilation_config is None else bool(compilation_config) - }, + compilation={None if compilation_config is None else bool(compilation_config)}, speculative_decoding={ None if speculative_decoding_config is None else bool(speculative_decoding_config) }, - sharding={ - None if sharding_config is None else bool(sharding_config) - }, + sharding={None if sharding_config is None else bool(sharding_config)}, quantization_technique={quantization_technique}, ) - if instance_type in NEURON_CONFIGURATION["supported_instance_families"]: + if instance_family in NEURON_CONFIGURATION["supported_instance_families"]: try: ( NEURON_CONFIGURATION["optimization_combination"].validate_against( - optimization_combination, rule_set=_OptimizationContainer.VLLM + optimization_combination, rule_set=_OptimizationContainer.NEURON ) ) except ValueError as neuron_compare_error: @@ -209,7 +206,7 @@ def _validate_optimization_configuration( trt_error_msg = VALIDATION_ERROR_MSG.format( optimization_container=_OptimizationContainer.TRT.value, 
optimization_technique=str(trt_compare_error), - instance_type="GPU" + instance_type="GPU", ) vllm_error_msg = VALIDATION_ERROR_MSG.format( optimization_container=_OptimizationContainer.VLLM.value, diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index 94b6f39d15..728b1d26fb 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -2386,11 +2386,11 @@ def test_optimize( builder.pysdk_model = pysdk_model job_name = "my-optimization-job" - instance_type = "ml.inf2.xlarge" + instance_type = "ml.g5.24xlarge" output_path = "s3://my-bucket/output" quantization_config = { "Image": "quantization-image-uri", - "OverrideEnvironment": {"ENV_VAR": "value"}, + "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}, } env_vars = {"Var1": "value", "Var2": "value"} kms_key = "arn:aws:kms:us-west-2:123456789012:key/my-key-id" @@ -2428,7 +2428,7 @@ def test_optimize( mock_send_telemetry.assert_called_once() mock_sagemaker_session.sagemaker_client.create_optimization_job.assert_called_once_with( OptimizationJobName="my-optimization-job", - DeploymentInstanceType="ml.inf2.xlarge", + DeploymentInstanceType="ml.g5.24xlarge", RoleArn="arn:aws:iam::123456789012:role/SageMakerRole", OptimizationEnvironment={"Var1": "value", "Var2": "value"}, ModelSource={"S3": {"S3Uri": "s3://uri"}}, @@ -2436,7 +2436,7 @@ def test_optimize( { "ModelQuantizationConfig": { "Image": "quantization-image-uri", - "OverrideEnvironment": {"ENV_VAR": "value"}, + "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}, } } ], @@ -2650,7 +2650,7 @@ def test_optimize_local_mode(self, mock_get_serve_setting): "Model optimization is only supported in Sagemaker Endpoint Mode.", lambda: model_builder.optimize( instance_type="ml.g5.24xlarge", - quantization_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}} + quantization_config={"OverrideEnvironment": 
{"OPTION_QUANTIZE": "awq"}}, ), ) @@ -2842,16 +2842,22 @@ def test_corner_cases_throw_errors(self): ValueError, "Optimizations that uses None instance type are not currently supported", lambda: _validate_optimization_configuration( - sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + sharding_config={"key": "value"}, instance_type=None, quantization_config=None, speculative_decoding_config=None, compilation_config=None, ), ) + + expected_missing_optimization_configs_error_message = """ + Optimization cannot be performed for the following reasons: + - Optimizations for TRT that use no optimization configurations are not currently supported on GPU instances + - Optimizations for vLLM that use no optimization configurations are not currently supported on GPU instances + """ self.assertRaisesRegex( ValueError, - "Optimizations are not currently supported without optimization configurations.", + textwrap.dedent(expected_missing_optimization_configs_error_message), lambda: _validate_optimization_configuration( instance_type="ml.g5.24xlarge", quantization_config=None, @@ -2881,11 +2887,39 @@ def test_trt_and_vllm_configurations_throw_errors_for_rule_set(self): ), ) - @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) - def test_neuron_configurations_throw_errors_for_rule_set(self, mock_get_serve_setting): - pass + def test_neuron_configurations_throw_errors_for_rule_set(self): + self.assertRaisesRegex( + ValueError, + ( + "Optimizations for Neuron that use Speculative Decoding " + "are not currently supported on Neuron instances" + ), + lambda: _validate_optimization_configuration( + instance_type="ml.inf2.xlarge", + quantization_config=None, + speculative_decoding_config={"key": "value"}, + compilation_config=None, + sharding_config=None, + ), + ) + + self.assertRaisesRegex( + ValueError, + ( + "Optimizations for Neuron that use Sharding " + "are not currently supported on Neuron instances" + ), + lambda: 
_validate_optimization_configuration( + instance_type="ml.inf2.xlarge", + quantization_config=None, + speculative_decoding_config=None, + compilation_config=None, + sharding_config={"key": "value"}, + ), + ) def test_trt_configurations_rule_set(self): + # Can be quantized _validate_optimization_configuration( instance_type="ml.g5.24xlarge", quantization_config={ @@ -2896,6 +2930,51 @@ def test_trt_configurations_rule_set(self): compilation_config=None, ) - @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) - def test_vllm_configurations_rule_set(self, mock_get_serve_setting): - pass + # Can be compiled + _validate_optimization_configuration( + instance_type="ml.g5.24xlarge", + quantization_config=None, + sharding_config=None, + speculative_decoding_config=None, + compilation_config={"key": "value"}, + ) + + def test_vllm_configurations_rule_set(self): + # Can be quantized + _validate_optimization_configuration( + instance_type="ml.g5.24xlarge", + quantization_config={ + "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}, + }, + sharding_config=None, + speculative_decoding_config=None, + compilation_config=None, + ) + + # Can use speculative decoding + _validate_optimization_configuration( + instance_type="ml.g5.24xlarge", + quantization_config=None, + sharding_config=None, + speculative_decoding_config={"key": "value"}, + compilation_config=None, + ) + + # Can be sharded + _validate_optimization_configuration( + instance_type="ml.g5.24xlarge", + quantization_config=None, + sharding_config={"key": "value"}, + speculative_decoding_config=None, + compilation_config=None, + ) + + def test_neuron_configurations_rule_set(self): + # Can be compiled + _validate_optimization_configuration( + instance_type="ml.inf2.xlarge", + quantization_config=None, + sharding_config=None, + speculative_decoding_config=None, + compilation_config={"key": "value"}, + ) From d1074ebe2dfdb88ee7b83ed56a92c84660827815 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Gary=20Wang=20=F0=9F=98=A4?= Date: Sat, 16 Nov 2024 00:37:56 +0000 Subject: [PATCH 14/18] simplify logic --- src/sagemaker/serve/validations/optimization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/serve/validations/optimization.py b/src/sagemaker/serve/validations/optimization.py index 21a4acc0d3..9d4ddcc941 100644 --- a/src/sagemaker/serve/validations/optimization.py +++ b/src/sagemaker/serve/validations/optimization.py @@ -165,11 +165,11 @@ def _validate_optimization_configuration( ) optimization_combination = _OptimizationCombination( - compilation={None if compilation_config is None else bool(compilation_config)}, + compilation={None if compilation_config is None else True}, speculative_decoding={ - None if speculative_decoding_config is None else bool(speculative_decoding_config) + None if speculative_decoding_config is None else True }, - sharding={None if sharding_config is None else bool(sharding_config)}, + sharding={None if sharding_config is None else True}, quantization_technique={quantization_technique}, ) From 74a0e36630f12f8cc10c377f1422ff35f6a07125 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gary=20Wang=20=F0=9F=98=A4?= Date: Sat, 16 Nov 2024 02:15:27 +0000 Subject: [PATCH 15/18] update messaging --- .../serve/validations/optimization.py | 93 +++++++++---------- .../serve/builder/test_model_builder.py | 28 ++---- 2 files changed, 54 insertions(+), 67 deletions(-) diff --git a/src/sagemaker/serve/validations/optimization.py b/src/sagemaker/serve/validations/optimization.py index 9d4ddcc941..cc038463f2 100644 --- a/src/sagemaker/serve/validations/optimization.py +++ b/src/sagemaker/serve/validations/optimization.py @@ -41,18 +41,6 @@ class _OptimizationCombination(BaseModel): def validate_against(self, optimization_combination, rule_set: _OptimizationContainer): """Validator for optimization containers""" - print(optimization_combination) - print(rule_set) - 
print(optimization_combination.speculative_decoding.issubset(self.speculative_decoding)) - - # check the case where no optimization combination is provided - if ( - optimization_combination.compilation == {None} - and optimization_combination.quantization_technique == {None} - and optimization_combination.speculative_decoding == {None} - and optimization_combination.sharding == {None} - ): - raise ValueError("no optimization configurations") # check the validity of each individual field if not optimization_combination.compilation.issubset(self.compilation): @@ -68,17 +56,22 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont raise ValueError("Sharding") # optimization technique combinations that need to be validated + if optimization_combination.compilation and optimization_combination.speculative_decoding: + copy_compilation = optimization_combination.compilation.copy() + copy_speculative_decoding = optimization_combination.speculative_decoding.copy() + if ( + copy_compilation.pop() and copy_speculative_decoding.pop() + ): # Check that the 2 techniques are not None + raise ValueError("Compilation and Speculative Decoding together") + if rule_set == _OptimizationContainer.TRT: if ( optimization_combination.compilation - and optimization_combination.speculative_decoding + and not optimization_combination.quantization_technique + or not optimization_combination.compilation + and optimization_combination.quantization_technique ): - copy_compilation = optimization_combination.compilation.copy() - copy_speculative_decoding = optimization_combination.speculative_decoding.copy() - if ( - copy_compilation.pop() and copy_speculative_decoding.pop() - ): # Check that the 2 techniques are not None - raise ValueError("Compilation and Speculative Decoding") + raise ValueError("Compilation must be provided with Quantization") else: copy_compilation = optimization_combination.compilation.copy() copy_quantization_technique = 
optimization_combination.quantization_technique.copy() @@ -106,7 +99,7 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont "supported_instance_families": {"p4d", "p4de", "p5", "g5", "g6"}, "optimization_combination": _OptimizationCombination( optimization_container=_OptimizationContainer.VLLM, - compilation=FALSY_SET, + compilation=TRUTHY_SET, quantization_technique={None, "awq", "fp8"}, speculative_decoding=TRUTHY_SET, sharding=TRUTHY_SET, @@ -123,11 +116,6 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont ), } -VALIDATION_ERROR_MSG = ( - "Optimizations for {optimization_container} that use {optimization_technique} " - "are not currently supported on {instance_type} instances" -) - def _validate_optimization_configuration( instance_type: str, @@ -150,7 +138,8 @@ def _validate_optimization_configuration( and instance_family not in NEURON_CONFIGURATION["supported_instance_families"] ): invalid_instance_type_msg = ( - f"Optimizations that uses {instance_type} instance type are not currently supported" + f"Optimizations that uses {instance_type} instance type are " + "not currently supported both on GPU and Neuron instances" ) raise ValueError(invalid_instance_type_msg) @@ -166,13 +155,26 @@ def _validate_optimization_configuration( optimization_combination = _OptimizationCombination( compilation={None if compilation_config is None else True}, - speculative_decoding={ - None if speculative_decoding_config is None else True - }, + speculative_decoding={None if speculative_decoding_config is None else True}, sharding={None if sharding_config is None else True}, quantization_technique={quantization_technique}, ) + # Check the case where no optimization combination is provided + if ( + optimization_combination.compilation == {None} + and optimization_combination.quantization_technique == {None} + and optimization_combination.speculative_decoding == {None} + and optimization_combination.sharding == {None} + 
): + raise ValueError( + ( + "Optimizations that provide no optimization configs " + "are currently not support on both GPU and Neuron instances." + ) + ) + + # Validate based off of instance type if instance_family in NEURON_CONFIGURATION["supported_instance_families"]: try: ( @@ -182,11 +184,7 @@ def _validate_optimization_configuration( ) except ValueError as neuron_compare_error: raise ValueError( - VALIDATION_ERROR_MSG.format( - optimization_container=_OptimizationContainer.NEURON.value, - optimization_technique=str(neuron_compare_error), - instance_type="Neuron", - ) + f"Optimizations that use {neuron_compare_error} are not supported on Neuron instances." ) else: try: @@ -203,19 +201,16 @@ def _validate_optimization_configuration( ) ) except ValueError as vllm_compare_error: - trt_error_msg = VALIDATION_ERROR_MSG.format( - optimization_container=_OptimizationContainer.TRT.value, - optimization_technique=str(trt_compare_error), - instance_type="GPU", - ) - vllm_error_msg = VALIDATION_ERROR_MSG.format( - optimization_container=_OptimizationContainer.VLLM.value, - optimization_technique=str(vllm_compare_error), - instance_type="GPU", - ) - joint_error_msg = f""" - Optimization cannot be performed for the following reasons: - - {trt_error_msg} - - {vllm_error_msg} - """ + if trt_compare_error == "Compilation must be provided with Quantization": + joint_error_msg = f""" + Optimization cannot be performed for the following reasons: + - Optimizations that use {trt_compare_error} and vice-versa for GPU instances. + - Optimizations that use {vllm_compare_error} are not supported for GPU instances. + """ + else: + joint_error_msg = f""" + Optimization cannot be performed for the following reasons: + - Optimizations that use {trt_compare_error} are not supported for GPU instances. + - Optimizations that use {vllm_compare_error} are not supported for GPU instances. 
+ """ raise ValueError(textwrap.dedent(joint_error_msg)) diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index 728b1d26fb..1c28f6b6db 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -2682,8 +2682,8 @@ def test_optimize_exclusive_sharding(self, mock_get_serve_setting): expected_error_message = """ Optimization cannot be performed for the following reasons: - - Optimizations for TRT that use Sharding are not currently supported on GPU instances - - Optimizations for vLLM that use Compilation are not currently supported on GPU instances + - Optimizations that use Sharding are not supported for GPU instances. + - Optimizations that use Compilation and Quantization:awq are not supported for GPU instances. """ self.assertRaisesRegex( @@ -2850,14 +2850,12 @@ def test_corner_cases_throw_errors(self): ), ) - expected_missing_optimization_configs_error_message = """ - Optimization cannot be performed for the following reasons: - - Optimizations for TRT that use no optimization configurations are not currently supported on GPU instances - - Optimizations for vLLM that use no optimization configurations are not currently supported on GPU instances - """ self.assertRaisesRegex( ValueError, - textwrap.dedent(expected_missing_optimization_configs_error_message), + ( + "Optimizations that provide no optimization configs " + "are currently not support on both GPU and Neuron instances." 
+ ), lambda: _validate_optimization_configuration( instance_type="ml.g5.24xlarge", quantization_config=None, @@ -2870,8 +2868,8 @@ def test_corner_cases_throw_errors(self): def test_trt_and_vllm_configurations_throw_errors_for_rule_set(self): expected_quantization_error_message = """ Optimization cannot be performed for the following reasons: - - Optimizations for TRT that use Quantization:test are not currently supported on GPU instances - - Optimizations for vLLM that use Quantization:test are not currently supported on GPU instances + - Optimizations that use Quantization:test are not supported for GPU instances. + - Optimizations that use Quantization:test are not supported for GPU instances. """ self.assertRaisesRegex( ValueError, @@ -2890,10 +2888,7 @@ def test_trt_and_vllm_configurations_throw_errors_for_rule_set(self): def test_neuron_configurations_throw_errors_for_rule_set(self): self.assertRaisesRegex( ValueError, - ( - "Optimizations for Neuron that use Speculative Decoding " - "are not currently supported on Neuron instances" - ), + "Optimizations that use Speculative Decoding are not supported on Neuron instances.", lambda: _validate_optimization_configuration( instance_type="ml.inf2.xlarge", quantization_config=None, @@ -2905,10 +2900,7 @@ def test_neuron_configurations_throw_errors_for_rule_set(self): self.assertRaisesRegex( ValueError, - ( - "Optimizations for Neuron that use Sharding " - "are not currently supported on Neuron instances" - ), + "Optimizations that use Sharding are not supported on Neuron instances.", lambda: _validate_optimization_configuration( instance_type="ml.inf2.xlarge", quantization_config=None, From 955479afd44b5e7265d5e6780d4e8130b33c3bd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gary=20Wang=20=F0=9F=98=A4?= Date: Sat, 16 Nov 2024 02:20:14 +0000 Subject: [PATCH 16/18] formatting --- src/sagemaker/serve/validations/optimization.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/src/sagemaker/serve/validations/optimization.py b/src/sagemaker/serve/validations/optimization.py index cc038463f2..011e5d1ccf 100644 --- a/src/sagemaker/serve/validations/optimization.py +++ b/src/sagemaker/serve/validations/optimization.py @@ -184,7 +184,10 @@ def _validate_optimization_configuration( ) except ValueError as neuron_compare_error: raise ValueError( - f"Optimizations that use {neuron_compare_error} are not supported on Neuron instances." + ( + f"Optimizations that use {neuron_compare_error} " + "are not supported on Neuron instances." + ) ) else: try: From 76a4102ce72b5d4a2841dfc17cd614011b214e4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gary=20Wang=20=F0=9F=98=A4?= Date: Sat, 16 Nov 2024 05:49:35 +0000 Subject: [PATCH 17/18] fix UTs --- .../serve/validations/optimization.py | 30 +++++----- .../serve/builder/test_model_builder.py | 57 +++++++++++-------- 2 files changed, 45 insertions(+), 42 deletions(-) diff --git a/src/sagemaker/serve/validations/optimization.py b/src/sagemaker/serve/validations/optimization.py index 011e5d1ccf..5237f017b2 100644 --- a/src/sagemaker/serve/validations/optimization.py +++ b/src/sagemaker/serve/validations/optimization.py @@ -57,26 +57,21 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont # optimization technique combinations that need to be validated if optimization_combination.compilation and optimization_combination.speculative_decoding: - copy_compilation = optimization_combination.compilation.copy() - copy_speculative_decoding = optimization_combination.speculative_decoding.copy() - if ( - copy_compilation.pop() and copy_speculative_decoding.pop() - ): # Check that the 2 techniques are not None + is_compiled = optimization_combination.compilation.copy().pop() + is_speculative_decoding = optimization_combination.speculative_decoding.copy().pop() + if is_compiled and is_speculative_decoding: raise ValueError("Compilation and Speculative Decoding together") if rule_set == 
_OptimizationContainer.TRT: - if ( - optimization_combination.compilation - and not optimization_combination.quantization_technique - or not optimization_combination.compilation - and optimization_combination.quantization_technique - ): + is_compiled = optimization_combination.compilation.copy().pop() + is_quantized = optimization_combination.quantization_technique.copy().pop() + if is_compiled and not is_quantized or is_quantized and not is_compiled: raise ValueError("Compilation must be provided with Quantization") else: - copy_compilation = optimization_combination.compilation.copy() - copy_quantization_technique = optimization_combination.quantization_technique.copy() + is_compiled = optimization_combination.compilation.copy().pop() + is_quantization_technique = optimization_combination.quantization_technique.copy().pop() if ( - copy_compilation.pop() and copy_quantization_technique.pop() + is_compiled and is_quantization_technique ): # Check that the 2 techniques are not None raise ValueError( f"Compilation and Quantization:{optimization_combination.quantization_technique.pop()}" @@ -99,8 +94,8 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont "supported_instance_families": {"p4d", "p4de", "p5", "g5", "g6"}, "optimization_combination": _OptimizationCombination( optimization_container=_OptimizationContainer.VLLM, - compilation=TRUTHY_SET, - quantization_technique={None, "awq", "fp8"}, + compilation=FALSY_SET, + quantization_technique={None}, speculative_decoding=TRUTHY_SET, sharding=TRUTHY_SET, ), @@ -203,8 +198,8 @@ def _validate_optimization_configuration( optimization_combination, rule_set=_OptimizationContainer.VLLM ) ) except ValueError as vllm_compare_error: - if trt_compare_error == "Compilation must be provided with Quantization": + if str(trt_compare_error) == "Compilation must be provided with Quantization": joint_error_msg = f""" Optimization cannot be performed for the following reasons: - 
Optimizations that use {trt_compare_error} and vice-versa for GPU instances. diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index 1c28f6b6db..2ced49c68d 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -2912,37 +2912,44 @@ def test_neuron_configurations_throw_errors_for_rule_set(self): def test_trt_configurations_rule_set(self): # Can be quantized - _validate_optimization_configuration( - instance_type="ml.g5.24xlarge", - quantization_config={ - "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}, - }, - sharding_config=None, - speculative_decoding_config=None, - compilation_config=None, + expected_compilation_quantization_error_message = """ + Optimization cannot be performed for the following reasons: + - Optimizations that use Compilation must be provided with Quantization and vice-versa for GPU instances. + - Optimizations that use Quantization:awq are not supported for GPU instances. + """ + self.assertRaisesRegex( + ValueError, + textwrap.dedent(expected_compilation_quantization_error_message), + lambda: _validate_optimization_configuration( + instance_type="ml.g5.24xlarge", + quantization_config={ + "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}, + }, + sharding_config=None, + speculative_decoding_config=None, + compilation_config=None, + ), ) # Can be compiled - _validate_optimization_configuration( - instance_type="ml.g5.24xlarge", - quantization_config=None, - sharding_config=None, - speculative_decoding_config=None, - compilation_config={"key": "value"}, + expected_compilation_quantization_error_message = """ + Optimization cannot be performed for the following reasons: + - Optimizations that use Compilation must be provided with Quantization and vice-versa for GPU instances. + - Optimizations that use Compilation are not supported for GPU instances. 
+ """ + self.assertRaisesRegex( + ValueError, + textwrap.dedent(expected_compilation_quantization_error_message), + lambda: _validate_optimization_configuration( + instance_type="ml.g5.24xlarge", + quantization_config=None, + sharding_config=None, + speculative_decoding_config=None, + compilation_config={"key": "value"}, + ), ) def test_vllm_configurations_rule_set(self): - # Can be quantized - _validate_optimization_configuration( - instance_type="ml.g5.24xlarge", - quantization_config={ - "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}, - }, - sharding_config=None, - speculative_decoding_config=None, - compilation_config=None, - ) - # Can use speculative decoding _validate_optimization_configuration( instance_type="ml.g5.24xlarge", From b7b8d3c1fd5a44cbf23676783ce3031d31cf352e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gary=20Wang=20=F0=9F=98=A4?= Date: Sat, 16 Nov 2024 05:54:43 +0000 Subject: [PATCH 18/18] add more UTs --- .../serve/builder/test_model_builder.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index 2ced49c68d..a3e7a7beba 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -2910,8 +2910,7 @@ def test_neuron_configurations_throw_errors_for_rule_set(self): ), ) - def test_trt_configurations_rule_set(self): - # Can be quantized + def test_trt_configurations_throw_errors_for_rule_se(self): expected_compilation_quantization_error_message = """ Optimization cannot be performed for the following reasons: - Optimizations that use Compilation must be provided with Quantization and vice-versa for GPU instances. 
@@ -2931,7 +2930,6 @@ def test_trt_configurations_rule_set(self): ), ) - # Can be compiled expected_compilation_quantization_error_message = """ Optimization cannot be performed for the following reasons: - Optimizations that use Compilation must be provided with Quantization and vice-versa for GPU instances. @@ -2949,6 +2947,18 @@ def test_trt_configurations_rule_set(self): ), ) + def test_trt_configurations_rule_set(self): + # Can be compiled with quantization + _validate_optimization_configuration( + instance_type="ml.g5.24xlarge", + quantization_config={ + "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}, + }, + sharding_config=None, + speculative_decoding_config=None, + compilation_config={"key": "value"}, + ) + def test_vllm_configurations_rule_set(self): # Can use speculative decoding _validate_optimization_configuration(