Skip to content

Commit

Permalink
add UTs
Browse files Browse the repository at this point in the history
  • Loading branch information
gwang111 committed Nov 16, 2024
1 parent 22fdc37 commit 57123c9
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 31 deletions.
33 changes: 15 additions & 18 deletions src/sagemaker/serve/validations/optimization.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ class _OptimizationCombination(BaseModel):

def validate_against(self, optimization_combination, rule_set: _OptimizationContainer):
"""Validator for optimization containers"""
print(optimization_combination)
print(rule_set)
print(optimization_combination.speculative_decoding.issubset(self.speculative_decoding))

# check the case where no optimization combination is provided
if (
Expand All @@ -49,7 +52,7 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont
and optimization_combination.speculative_decoding == {None}
and optimization_combination.sharding == {None}
):
raise ValueError("Optimizations are not currently supported without optimization configurations.")
raise ValueError("no optimization configurations")

# check the validity of each individual field
if not optimization_combination.compilation.issubset(self.compilation):
Expand All @@ -58,9 +61,7 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont
self.quantization_technique
):
copy_quantization_technique = optimization_combination.quantization_technique.copy()
raise ValueError(
f"Quantization:{copy_quantization_technique.pop()}"
)
raise ValueError(f"Quantization:{copy_quantization_technique.pop()}")
if not optimization_combination.speculative_decoding.issubset(self.speculative_decoding):
raise ValueError("Speculative Decoding")
if not optimization_combination.sharding.issubset(self.sharding):
Expand All @@ -75,16 +76,14 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont
copy_compilation = optimization_combination.compilation.copy()
copy_speculative_decoding = optimization_combination.speculative_decoding.copy()
if (
copy_compilation.pop()
and copy_speculative_decoding.pop()
copy_compilation.pop() and copy_speculative_decoding.pop()
): # Check that the 2 techniques are not None
raise ValueError("Compilation and Speculative Decoding")
else:
copy_compilation = optimization_combination.compilation.copy()
copy_quantization_technique = optimization_combination.quantization_technique.copy()
if (
copy_compilation.pop()
and copy_quantization_technique.pop()
copy_compilation.pop() and copy_quantization_technique.pop()
): # Check that the 2 techniques are not None
raise ValueError(
f"Compilation and Quantization:{optimization_combination.quantization_technique.pop()}"
Expand Down Expand Up @@ -161,26 +160,24 @@ def _validate_optimization_configuration(
and quantization_config.get("OverrideEnvironment")
and quantization_config.get("OverrideEnvironment").get("OPTION_QUANTIZE")
):
quantization_technique = quantization_config.get("OverrideEnvironment").get("OPTION_QUANTIZE")
quantization_technique = quantization_config.get("OverrideEnvironment").get(
"OPTION_QUANTIZE"
)

optimization_combination = _OptimizationCombination(
compilation={
None if compilation_config is None else bool(compilation_config)
},
compilation={None if compilation_config is None else bool(compilation_config)},
speculative_decoding={
None if speculative_decoding_config is None else bool(speculative_decoding_config)
},
sharding={
None if sharding_config is None else bool(sharding_config)
},
sharding={None if sharding_config is None else bool(sharding_config)},
quantization_technique={quantization_technique},
)

if instance_type in NEURON_CONFIGURATION["supported_instance_families"]:
if instance_family in NEURON_CONFIGURATION["supported_instance_families"]:
try:
(
NEURON_CONFIGURATION["optimization_combination"].validate_against(
optimization_combination, rule_set=_OptimizationContainer.VLLM
optimization_combination, rule_set=_OptimizationContainer.NEURON
)
)
except ValueError as neuron_compare_error:
Expand Down Expand Up @@ -209,7 +206,7 @@ def _validate_optimization_configuration(
trt_error_msg = VALIDATION_ERROR_MSG.format(
optimization_container=_OptimizationContainer.TRT.value,
optimization_technique=str(trt_compare_error),
instance_type="GPU"
instance_type="GPU",
)
vllm_error_msg = VALIDATION_ERROR_MSG.format(
optimization_container=_OptimizationContainer.VLLM.value,
Expand Down
105 changes: 92 additions & 13 deletions tests/unit/sagemaker/serve/builder/test_model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2386,11 +2386,11 @@ def test_optimize(
builder.pysdk_model = pysdk_model

job_name = "my-optimization-job"
instance_type = "ml.inf2.xlarge"
instance_type = "ml.g5.24xlarge"
output_path = "s3://my-bucket/output"
quantization_config = {
"Image": "quantization-image-uri",
"OverrideEnvironment": {"ENV_VAR": "value"},
"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"},
}
env_vars = {"Var1": "value", "Var2": "value"}
kms_key = "arn:aws:kms:us-west-2:123456789012:key/my-key-id"
Expand Down Expand Up @@ -2428,15 +2428,15 @@ def test_optimize(
mock_send_telemetry.assert_called_once()
mock_sagemaker_session.sagemaker_client.create_optimization_job.assert_called_once_with(
OptimizationJobName="my-optimization-job",
DeploymentInstanceType="ml.inf2.xlarge",
DeploymentInstanceType="ml.g5.24xlarge",
RoleArn="arn:aws:iam::123456789012:role/SageMakerRole",
OptimizationEnvironment={"Var1": "value", "Var2": "value"},
ModelSource={"S3": {"S3Uri": "s3://uri"}},
OptimizationConfigs=[
{
"ModelQuantizationConfig": {
"Image": "quantization-image-uri",
"OverrideEnvironment": {"ENV_VAR": "value"},
"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"},
}
}
],
Expand Down Expand Up @@ -2650,7 +2650,7 @@ def test_optimize_local_mode(self, mock_get_serve_setting):
"Model optimization is only supported in Sagemaker Endpoint Mode.",
lambda: model_builder.optimize(
instance_type="ml.g5.24xlarge",
quantization_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}
quantization_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}},
),
)

Expand Down Expand Up @@ -2842,16 +2842,22 @@ def test_corner_cases_throw_errors(self):
ValueError,
"Optimizations that uses None instance type are not currently supported",
lambda: _validate_optimization_configuration(
sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}},
sharding_config={"key": "value"},
instance_type=None,
quantization_config=None,
speculative_decoding_config=None,
compilation_config=None,
),
)

expected_missing_optimization_configs_error_message = """
Optimization cannot be performed for the following reasons:
- Optimizations for TRT that use no optimization configurations are not currently supported on GPU instances
- Optimizations for vLLM that use no optimization configurations are not currently supported on GPU instances
"""
self.assertRaisesRegex(
ValueError,
"Optimizations are not currently supported without optimization configurations.",
textwrap.dedent(expected_missing_optimization_configs_error_message),
lambda: _validate_optimization_configuration(
instance_type="ml.g5.24xlarge",
quantization_config=None,
Expand Down Expand Up @@ -2881,11 +2887,39 @@ def test_trt_and_vllm_configurations_throw_errors_for_rule_set(self):
),
)

@patch.object(ModelBuilder, "_get_serve_setting", autospec=True)
def test_neuron_configurations_throw_errors_for_rule_set(self, mock_get_serve_setting):
pass
def test_neuron_configurations_throw_errors_for_rule_set(self):
    """Neuron rule set rejects techniques it does not support.

    Both speculative decoding and sharding must raise a ValueError when
    requested on a Neuron instance family (ml.inf2).
    """
    # Speculative decoding is not in the Neuron optimization combination.
    with self.assertRaisesRegex(
        ValueError,
        (
            "Optimizations for Neuron that use Speculative Decoding "
            "are not currently supported on Neuron instances"
        ),
    ):
        _validate_optimization_configuration(
            instance_type="ml.inf2.xlarge",
            quantization_config=None,
            speculative_decoding_config={"key": "value"},
            compilation_config=None,
            sharding_config=None,
        )

    # Sharding is likewise rejected by the Neuron rule set.
    with self.assertRaisesRegex(
        ValueError,
        (
            "Optimizations for Neuron that use Sharding "
            "are not currently supported on Neuron instances"
        ),
    ):
        _validate_optimization_configuration(
            instance_type="ml.inf2.xlarge",
            quantization_config=None,
            speculative_decoding_config=None,
            compilation_config=None,
            sharding_config={"key": "value"},
        )

def test_trt_configurations_rule_set(self):
# Can be quantized
_validate_optimization_configuration(
instance_type="ml.g5.24xlarge",
quantization_config={
Expand All @@ -2896,6 +2930,51 @@ def test_trt_configurations_rule_set(self):
compilation_config=None,
)

@patch.object(ModelBuilder, "_get_serve_setting", autospec=True)
def test_vllm_configurations_rule_set(self, mock_get_serve_setting):
pass
# Can be compiled
_validate_optimization_configuration(
instance_type="ml.g5.24xlarge",
quantization_config=None,
sharding_config=None,
speculative_decoding_config=None,
compilation_config={"key": "value"},
)

def test_vllm_configurations_rule_set(self):
    """vLLM rule set accepts each supported technique individually on GPU.

    Quantization, speculative decoding, and sharding must all validate
    without raising when applied one at a time on ml.g5.
    """
    # All techniques disabled; each case below turns exactly one on.
    base_kwargs = {
        "instance_type": "ml.g5.24xlarge",
        "quantization_config": None,
        "sharding_config": None,
        "speculative_decoding_config": None,
        "compilation_config": None,
    }

    # Can be quantized
    _validate_optimization_configuration(
        **{
            **base_kwargs,
            "quantization_config": {"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}},
        }
    )

    # Can use speculative decoding
    _validate_optimization_configuration(
        **{**base_kwargs, "speculative_decoding_config": {"key": "value"}}
    )

    # Can be sharded
    _validate_optimization_configuration(
        **{**base_kwargs, "sharding_config": {"key": "value"}}
    )

def test_neuron_configurations_rule_set(self):
    """Neuron rule set accepts compilation without raising."""
    # Can be compiled
    compilation = {"key": "value"}
    _validate_optimization_configuration(
        instance_type="ml.inf2.xlarge",
        compilation_config=compilation,
        quantization_config=None,
        sharding_config=None,
        speculative_decoding_config=None,
    )

0 comments on commit 57123c9

Please sign in to comment.