Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Optimize() validations across TRT, VLLM, Neuron container optimizations #4927

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ Before sending us a pull request, please ensure that:
1. Install tox using `pip install tox`
1. Install coverage using `pip install .[test]`
1. cd into the sagemaker-python-sdk folder: `cd sagemaker-python-sdk` or `cd /environment/sagemaker-python-sdk`
1. Run the following tox command and verify that all code checks and unit tests pass: `tox -- tests/unit`
1. Run the following tox command and verify that all code checks and unit tests pass: `tox tests/unit`
1. You can also run a single test with the following command: `tox -e py310 -- -s -vv <path_to_file><file_name>::<test_function_name>`
1. You can run coverage via the runcoverage env : `tox -e runcoverage -- tests/unit` or `tox -e py310 -- tests/unit --cov=sagemaker --cov-append --cov-report xml`
* Note that the coverage test will fail if you only run a single test, so make sure to surround the command with `export IGNORE_COVERAGE=-` and `unset IGNORE_COVERAGE`
Expand Down
47 changes: 0 additions & 47 deletions src/sagemaker/image_uri_config/huggingface-llm.json
Original file line number Diff line number Diff line change
Expand Up @@ -766,53 +766,6 @@
"container_version": {
"gpu": "cu124-ubuntu22.04"
}
},
"2.4.0": {
"py_versions": [
"py311"
],
"registries": {
"af-south-1": "626614931356",
"il-central-1": "780543022126",
"ap-east-1": "871362719292",
"ap-northeast-1": "763104351884",
"ap-northeast-2": "763104351884",
"ap-northeast-3": "364406365360",
"ap-south-1": "763104351884",
"ap-south-2": "772153158452",
"ap-southeast-1": "763104351884",
"ap-southeast-2": "763104351884",
"ap-southeast-3": "907027046896",
"ap-southeast-4": "457447274322",
"ca-central-1": "763104351884",
"cn-north-1": "727897471807",
"cn-northwest-1": "727897471807",
"eu-central-1": "763104351884",
"eu-central-2": "380420809688",
"eu-north-1": "763104351884",
"eu-west-1": "763104351884",
"eu-west-2": "763104351884",
"eu-west-3": "763104351884",
"eu-south-1": "692866216735",
"eu-south-2": "503227376785",
"me-south-1": "217643126080",
"me-central-1": "914824155844",
"sa-east-1": "763104351884",
"us-east-1": "763104351884",
"us-east-2": "763104351884",
"us-gov-east-1": "446045086412",
"us-gov-west-1": "442386744353",
"us-iso-east-1": "886529160074",
"us-isob-east-1": "094389454867",
"us-west-1": "763104351884",
"us-west-2": "763104351884",
"ca-west-1": "204538143572"
},
"tag_prefix": "2.4.0-tgi2.4.0",
"repository": "huggingface-pytorch-tgi-inference",
"container_version": {
"gpu": "cu124-ubuntu22.04"
}
}
}
}
Expand Down
8 changes: 8 additions & 0 deletions src/sagemaker/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ def __init__(
self.endpoint_name = None
self.inference_component_name = None
self._is_compiled_model = False
self._is_sharded_model = False
self._compilation_job_name = None
self._is_edge_packaged_model = False
self.inference_recommender_job_results = None
Expand Down Expand Up @@ -1599,6 +1600,13 @@ def deploy(
if self._base_name is not None:
self._base_name = "-".join((self._base_name, compiled_model_suffix))

if self._is_sharded_model and endpoint_type != EndpointType.INFERENCE_COMPONENT_BASED:
logging.warning(
"Forcing INFERENCE_COMPONENT_BASED endpoint for sharded model. ADVISORY - "
"Use INFERENCE_COMPONENT_BASED endpoints over MODEL_BASED endpoints."
)
endpoint_type = EndpointType.INFERENCE_COMPONENT_BASED

# Support multiple models on same endpoint
if endpoint_type == EndpointType.INFERENCE_COMPONENT_BASED:
if endpoint_name:
Expand Down
7 changes: 5 additions & 2 deletions src/sagemaker/serve/builder/jumpstart_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,7 @@ def _optimize_for_jumpstart(
quantization_config: Optional[Dict] = None,
compilation_config: Optional[Dict] = None,
speculative_decoding_config: Optional[Dict] = None,
sharding_config: Optional[Dict] = None,
env_vars: Optional[Dict] = None,
vpc_config: Optional[Dict] = None,
kms_key: Optional[str] = None,
Expand All @@ -702,6 +703,8 @@ def _optimize_for_jumpstart(
compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``.
speculative_decoding_config (Optional[Dict]): Speculative decoding configuration.
Defaults to ``None``
sharding_config (Optional[Dict]): Model sharding configuration.
Defaults to ``None``
env_vars (Optional[Dict]): Additional environment variables to run the optimization
container. Defaults to ``None``.
vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``.
Expand All @@ -727,7 +730,7 @@ def _optimize_for_jumpstart(
pysdk_model_env_vars = self._get_neuron_model_env_vars(instance_type)

optimization_config, override_env = _extract_optimization_config_and_env(
quantization_config, compilation_config
quantization_config, compilation_config, sharding_config
)
if not optimization_config and is_compilation:
override_env = override_env or pysdk_model_env_vars
Expand Down Expand Up @@ -792,7 +795,7 @@ def _optimize_for_jumpstart(
optimization_env_vars = _update_environment_variables(optimization_env_vars, override_env)
if optimization_env_vars:
self.pysdk_model.env.update(optimization_env_vars)
if quantization_config or is_compilation:
if quantization_config or sharding_config or is_compilation:
return create_optimization_job_args
return None

Expand Down
53 changes: 52 additions & 1 deletion src/sagemaker/serve/builder/model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
get_huggingface_model_metadata,
download_huggingface_model_metadata,
)
from sagemaker.serve.validations.optimization import _validate_optimization_configuration

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -1119,6 +1120,7 @@ def optimize(
quantization_config: Optional[Dict] = None,
compilation_config: Optional[Dict] = None,
speculative_decoding_config: Optional[Dict] = None,
sharding_config: Optional[Dict] = None,
env_vars: Optional[Dict] = None,
vpc_config: Optional[Dict] = None,
kms_key: Optional[str] = None,
Expand All @@ -1142,6 +1144,8 @@ def optimize(
compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``.
speculative_decoding_config (Optional[Dict]): Speculative decoding configuration.
Defaults to ``None``
sharding_config (Optional[Dict]): Model sharding configuration.
Defaults to ``None``
env_vars (Optional[Dict]): Additional environment variables to run the optimization
container. Defaults to ``None``.
vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``.
Expand Down Expand Up @@ -1170,6 +1174,7 @@ def optimize(
quantization_config=quantization_config,
compilation_config=compilation_config,
speculative_decoding_config=speculative_decoding_config,
sharding_config=sharding_config,
env_vars=env_vars,
vpc_config=vpc_config,
kms_key=kms_key,
Expand All @@ -1189,6 +1194,7 @@ def _model_builder_optimize_wrapper(
quantization_config: Optional[Dict] = None,
compilation_config: Optional[Dict] = None,
speculative_decoding_config: Optional[Dict] = None,
sharding_config: Optional[Dict] = None,
env_vars: Optional[Dict] = None,
vpc_config: Optional[Dict] = None,
kms_key: Optional[str] = None,
Expand All @@ -1212,6 +1218,8 @@ def _model_builder_optimize_wrapper(
compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``.
speculative_decoding_config (Optional[Dict]): Speculative decoding configuration.
Defaults to ``None``
sharding_config (Optional[Dict]): Model sharding configuration.
Defaults to ``None``
env_vars (Optional[Dict]): Additional environment variables to run the optimization
container. Defaults to ``None``.
vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``.
Expand All @@ -1226,6 +1234,17 @@ def _model_builder_optimize_wrapper(
Returns:
Model: A deployable ``Model`` object.
"""

# TODO: ideally these dictionaries need to be sagemaker_core shapes
# TODO: for organization, abstract all validation behind this fn
_validate_optimization_configuration(
instance_type=instance_type,
quantization_config=quantization_config,
compilation_config=compilation_config,
sharding_config=sharding_config,
speculative_decoding_config=speculative_decoding_config,
)

self.is_compiled = compilation_config is not None
self.is_quantized = quantization_config is not None
self.speculative_decoding_draft_model_source = _extract_speculative_draft_model_provider(
Expand All @@ -1238,6 +1257,30 @@ def _model_builder_optimize_wrapper(
if quantization_config and compilation_config:
raise ValueError("Quantization config and compilation config are mutually exclusive.")

if sharding_config and (
quantization_config or compilation_config or speculative_decoding_config
):
raise ValueError(
(
"Sharding config is mutually exclusive "
"and cannot be combined with any other optimization."
)
)

if sharding_config and (
(env_vars and "OPTION_TENSOR_PARALLEL_DEGREE" not in env_vars)
or (
sharding_config.get("OverrideEnvironment")
and "OPTION_TENSOR_PARALLEL_DEGREE" not in sharding_config["OverrideEnvironment"]
)
):
raise ValueError(
(
"OPTION_TENSOR_PARALLEL_DEGREE is required "
"environment variable with Sharding config."
)
)

self.sagemaker_session = sagemaker_session or self.sagemaker_session or Session()
self.instance_type = instance_type or self.instance_type
self.role_arn = role_arn or self.role_arn
Expand All @@ -1254,6 +1297,7 @@ def _model_builder_optimize_wrapper(
quantization_config=quantization_config,
compilation_config=compilation_config,
speculative_decoding_config=speculative_decoding_config,
sharding_config=sharding_config,
env_vars=env_vars,
vpc_config=vpc_config,
kms_key=kms_key,
Expand All @@ -1272,12 +1316,16 @@ def _model_builder_optimize_wrapper(
quantization_config=quantization_config,
compilation_config=compilation_config,
speculative_decoding_config=speculative_decoding_config,
sharding_config=sharding_config,
env_vars=env_vars,
vpc_config=vpc_config,
kms_key=kms_key,
max_runtime_in_sec=max_runtime_in_sec,
)

if sharding_config:
self.pysdk_model._is_sharded_model = True

if input_args:
self.sagemaker_session.sagemaker_client.create_optimization_job(**input_args)
job_status = self.sagemaker_session.wait_for_optimization_job(job_name)
Expand All @@ -1297,6 +1345,7 @@ def _optimize_for_hf(
quantization_config: Optional[Dict] = None,
compilation_config: Optional[Dict] = None,
speculative_decoding_config: Optional[Dict] = None,
sharding_config: Optional[Dict] = None,
env_vars: Optional[Dict] = None,
vpc_config: Optional[Dict] = None,
kms_key: Optional[str] = None,
Expand All @@ -1312,6 +1361,8 @@ def _optimize_for_hf(
compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``.
speculative_decoding_config (Optional[Dict]): Speculative decoding configuration.
Defaults to ``None``
sharding_config (Optional[Dict]): Model sharding configuration.
Defaults to ``None``
env_vars (Optional[Dict]): Additional environment variables to run the optimization
container. Defaults to ``None``.
vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``.
Expand All @@ -1327,7 +1378,7 @@ def _optimize_for_hf(
self.pysdk_model, speculative_decoding_config, False
)

if quantization_config or compilation_config:
if quantization_config or compilation_config or sharding_config:
create_optimization_job_args = {
"OptimizationJobName": job_name,
"DeploymentInstanceType": self.instance_type,
Expand Down
7 changes: 6 additions & 1 deletion src/sagemaker/serve/utils/optimize_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,13 +259,16 @@ def _is_s3_uri(s3_uri: Optional[str]) -> bool:


def _extract_optimization_config_and_env(
quantization_config: Optional[Dict] = None, compilation_config: Optional[Dict] = None
quantization_config: Optional[Dict] = None,
compilation_config: Optional[Dict] = None,
sharding_config: Optional[Dict] = None,
) -> Optional[Tuple[Optional[Dict], Optional[Dict]]]:
"""Extracts optimization config and environment variables.

Args:
quantization_config (Optional[Dict]): The quantization config.
compilation_config (Optional[Dict]): The compilation config.
sharding_config (Optional[Dict]): The sharding config.

Returns:
Optional[Tuple[Optional[Dict], Optional[Dict]]]:
Expand All @@ -279,6 +282,8 @@ def _extract_optimization_config_and_env(
return {"ModelCompilationConfig": compilation_config}, compilation_config.get(
"OverrideEnvironment"
)
if sharding_config:
return {"ModelShardingConfig": sharding_config}, sharding_config.get("OverrideEnvironment")
return None, None


Expand Down
Loading
Loading