From 3fa2deac8cb4ec0c21013143ca636416a7b9cf39 Mon Sep 17 00:00:00 2001
From: Manu Seth <22492939+mseth10@users.noreply.github.com>
Date: Wed, 6 Apr 2022 15:35:12 -0700
Subject: [PATCH] pass model directory as input to torchserve (#118)

* update torchserve
* remove repackaging fn
* update assert check
* update torchserve in general dockerfile
* test with dlc dockerfiles
* uninstall auto confirmation
* uninstall auto confirmation
* run only gpu tests
* run all integ tests
* set default service handler in ts config file
* test
* test
* revert passing handler service to ts config
* Revert "revert passing handler service to ts config"

  This reverts commit d62f5ffdd5af7ce179a9e261992d2ba92b525e7d.

* add pytest logs
* build/push dockerfile
* pass handler fn
* skip unit test
* add logging to sm tests
* test
* test
* test
* fix flake8
* fix unit test
* test gpu sm generic
* skip sm integration tests with generic image
* test generic image
* enable all tests
---
 buildspec.yml                             |  8 +-
 .../torchserve.py                         | 48 +++-------
 test/container/1.10.2/Dockerfile.dlc.cpu  |  3 +
 test/container/1.10.2/Dockerfile.dlc.gpu  |  3 +
 test/container/1.10.2/Dockerfile.pytorch  |  2 +-
 test/unit/test_model_server.py            | 90 ++++---------------
 6 files changed, 38 insertions(+), 116 deletions(-)

diff --git a/buildspec.yml b/buildspec.yml
index 4923b841..5dab6315 100644
--- a/buildspec.yml
+++ b/buildspec.yml
@@ -42,9 +42,9 @@ phases:
       - DLC_EIA_TAG="$EIA_FRAMEWORK_VERSION-dlc-eia-$BUILD_ID"
 
       # run local CPU integration tests (build and push the image to ECR repo)
-      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
+      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local -vv -rA -s --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
       - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
-      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
+      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local -vv -rA -s --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
       - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
 
       # launch remote GPU instance
@@ -65,10 +65,10 @@ phases:
       # run GPU local integration tests
      - printf "$SETUP_CMDS" > $SETUP_FILE
       # no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
-      - generic_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
+      - generic_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local -vv -rA -s --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
       - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
       - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
-      - dlc_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
+      - dlc_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local -vv -rA -s --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
       - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
       - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
diff --git a/src/sagemaker_pytorch_serving_container/torchserve.py b/src/sagemaker_pytorch_serving_container/torchserve.py
index 4ee1129a..048a06b4 100644
--- a/src/sagemaker_pytorch_serving_container/torchserve.py
+++ b/src/sagemaker_pytorch_serving_container/torchserve.py
@@ -25,13 +25,12 @@
 import sagemaker_pytorch_serving_container
 from sagemaker_pytorch_serving_container import ts_environment
-from sagemaker_inference import default_handler_service, environment, utils
+from sagemaker_inference import environment, utils
 from sagemaker_inference.environment import code_dir
 
 logger = logging.getLogger()
 
 TS_CONFIG_FILE = os.path.join("/etc", "sagemaker-ts.properties")
-DEFAULT_HANDLER_SERVICE = default_handler_service.__name__
 DEFAULT_TS_CONFIG_FILE = pkg_resources.resource_filename(
     sagemaker_pytorch_serving_container.__name__, "/etc/default-ts.properties"
 )
@@ -41,13 +40,11 @@
 DEFAULT_TS_LOG_FILE = pkg_resources.resource_filename(
     sagemaker_pytorch_serving_container.__name__, "/etc/log4j2.xml"
 )
-DEFAULT_TS_MODEL_DIRECTORY = os.path.join(os.getcwd(), ".sagemaker", "ts", "models")
 DEFAULT_TS_MODEL_NAME = "model"
-DEFAULT_TS_CODE_DIR = "code"
 DEFAULT_HANDLER_SERVICE = "sagemaker_pytorch_serving_container.handler_service"
 
 ENABLE_MULTI_MODEL = os.getenv("SAGEMAKER_MULTI_MODEL", "false") == "true"
-MODEL_STORE = "/" if ENABLE_MULTI_MODEL else DEFAULT_TS_MODEL_DIRECTORY
+MODEL_STORE = "/" if ENABLE_MULTI_MODEL else os.path.join(os.getcwd(), ".sagemaker", "ts", "models")
 
 PYTHON_PATH_ENV = "PYTHONPATH"
 REQUIREMENTS_PATH = os.path.join(code_dir, "requirements.txt")
@@ -73,11 +70,13 @@ def start_torchserve(handler_service=DEFAULT_HANDLER_SERVICE):
     if ENABLE_MULTI_MODEL:
         if "SAGEMAKER_HANDLER" not in os.environ:
             os.environ["SAGEMAKER_HANDLER"] = handler_service
-        _set_python_path()
     else:
-        _adapt_to_ts_format(handler_service)
+        if not os.path.exists(MODEL_STORE):
+            os.makedirs(MODEL_STORE)
 
-    _create_torchserve_config_file()
+    _set_python_path()
+
+    _create_torchserve_config_file(handler_service)
 
     if os.path.exists(REQUIREMENTS_PATH):
         _install_requirements()
@@ -92,7 +91,7 @@ def start_torchserve(handler_service=DEFAULT_HANDLER_SERVICE):
         "--log-config",
         DEFAULT_TS_LOG_FILE,
         "--models",
-        "model.mar"
+        DEFAULT_TS_MODEL_NAME + "=" + environment.model_dir
     ]
 
     print(ts_torchserve_cmd)
@@ -107,30 +106,6 @@ def start_torchserve(handler_service=DEFAULT_HANDLER_SERVICE):
     ts_process.wait()
 
 
-def _adapt_to_ts_format(handler_service):
-    if not os.path.exists(DEFAULT_TS_MODEL_DIRECTORY):
-        os.makedirs(DEFAULT_TS_MODEL_DIRECTORY)
-
-    model_archiver_cmd = [
-        "torch-model-archiver",
-        "--model-name",
-        DEFAULT_TS_MODEL_NAME,
-        "--handler",
-        handler_service,
-        "--export-path",
-        DEFAULT_TS_MODEL_DIRECTORY,
-        "--version",
-        "1",
-        "--extra-files",
-        os.path.join(environment.model_dir)
-    ]
-
-    logger.info(model_archiver_cmd)
-    subprocess.check_call(model_archiver_cmd)
-
-    _set_python_path()
-
-
 def _set_python_path():
     # Torchserve handles code execution by appending the export path, provided
     # to the model archiver, to the PYTHONPATH env var.
@@ -142,19 +117,20 @@ def _set_python_path():
         os.environ[PYTHON_PATH_ENV] = environment.code_dir
 
 
-def _create_torchserve_config_file():
-    configuration_properties = _generate_ts_config_properties()
+def _create_torchserve_config_file(handler_service):
+    configuration_properties = _generate_ts_config_properties(handler_service)
 
     utils.write_file(TS_CONFIG_FILE, configuration_properties)
 
 
-def _generate_ts_config_properties():
+def _generate_ts_config_properties(handler_service):
     env = environment.Environment()
     user_defined_configuration = {
         "default_response_timeout": env.model_server_timeout,
         "default_workers_per_model": env.model_server_workers,
         "inference_address": "http://0.0.0.0:{}".format(env.inference_http_port),
         "management_address": "http://0.0.0.0:{}".format(env.management_http_port),
+        "default_service_handler": handler_service + ":handle",
     }
 
     ts_env = ts_environment.TorchServeEnvironment()
diff --git a/test/container/1.10.2/Dockerfile.dlc.cpu b/test/container/1.10.2/Dockerfile.dlc.cpu
index d3c7315c..df81de2f 100644
--- a/test/container/1.10.2/Dockerfile.dlc.cpu
+++ b/test/container/1.10.2/Dockerfile.dlc.cpu
@@ -1,6 +1,9 @@
 ARG region
 FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:1.10.2-cpu-py38-ubuntu20.04-sagemaker
 
+RUN pip uninstall torchserve -y && \
+    pip install torchserve-nightly==2022.3.23.post2
+
 COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
 RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
     rm /sagemaker_pytorch_inference.tar.gz
diff --git a/test/container/1.10.2/Dockerfile.dlc.gpu b/test/container/1.10.2/Dockerfile.dlc.gpu
index 192b6662..2970d718 100644
--- a/test/container/1.10.2/Dockerfile.dlc.gpu
+++ b/test/container/1.10.2/Dockerfile.dlc.gpu
@@ -1,6 +1,9 @@
 ARG region
 FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:1.10.2-gpu-py38-cu113-ubuntu20.04-sagemaker
 
+RUN pip uninstall torchserve -y && \
+    pip install torchserve-nightly==2022.3.23.post2
+
 COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
 RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
     rm /sagemaker_pytorch_inference.tar.gz
diff --git a/test/container/1.10.2/Dockerfile.pytorch b/test/container/1.10.2/Dockerfile.pytorch
index bbc6a83e..69424528 100644
--- a/test/container/1.10.2/Dockerfile.pytorch
+++ b/test/container/1.10.2/Dockerfile.pytorch
@@ -25,7 +25,7 @@ RUN apt-get update \
 RUN conda install -c conda-forge opencv \
     && ln -s /opt/conda/bin/pip /usr/local/bin/pip3
 
-RUN pip install torchserve==$TS_VERSION \
+RUN pip install torchserve-nightly==2022.3.23.post2 \
     && pip install torch-model-archiver==$TS_ARCHIVER_VERSION
 
 COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
diff --git a/test/unit/test_model_server.py b/test/unit/test_model_server.py
index aeaec28e..a4413c6a 100644
--- a/test/unit/test_model_server.py
+++ b/test/unit/test_model_server.py
@@ -34,9 +34,9 @@
 @patch("sagemaker_pytorch_serving_container.torchserve._install_requirements")
 @patch("os.path.exists", return_value=True)
 @patch("sagemaker_pytorch_serving_container.torchserve._create_torchserve_config_file")
-@patch("sagemaker_pytorch_serving_container.torchserve._adapt_to_ts_format")
+@patch("sagemaker_pytorch_serving_container.torchserve._set_python_path")
 def test_start_torchserve_default_service_handler(
-    adapt,
+    set_python_path,
     create_config,
     exists,
     install_requirements,
@@ -47,9 +47,8 @@ def test_start_torchserve_default_service_handler(
 ):
     torchserve.start_torchserve()
 
-    adapt.assert_called_once_with(torchserve.DEFAULT_HANDLER_SERVICE)
-    create_config.assert_called_once_with()
-    exists.assert_called_once_with(REQUIREMENTS_PATH)
+    set_python_path.assert_called_once_with()
+    create_config.assert_called_once_with(torchserve.DEFAULT_HANDLER_SERVICE)
     install_requirements.assert_called_once_with()
 
     ts_model_server_cmd = [
@@ -62,7 +61,7 @@
         "--log-config",
         torchserve.DEFAULT_TS_LOG_FILE,
         "--models",
-        "model.mar"
+        "model=/opt/ml/model"
     ]
 
     subprocess_popen.assert_called_once_with(ts_model_server_cmd)
@@ -76,9 +75,9 @@
 @patch("sagemaker_pytorch_serving_container.torchserve._install_requirements")
 @patch("os.path.exists", return_value=True)
 @patch("sagemaker_pytorch_serving_container.torchserve._create_torchserve_config_file")
-@patch("sagemaker_pytorch_serving_container.torchserve._adapt_to_ts_format")
+@patch("sagemaker_pytorch_serving_container.torchserve._set_python_path")
 def test_start_torchserve_default_service_handler_multi_model(
-    adapt,
+    set_python_path,
     create_config,
     exists,
     install_requirements,
@@ -90,7 +89,9 @@ def test_start_torchserve_default_service_handler_multi_model(
     torchserve.ENABLE_MULTI_MODEL = True
     torchserve.start_torchserve()
     torchserve.ENABLE_MULTI_MODEL = False
-    create_config.assert_called_once_with()
+
+    set_python_path.assert_called_once_with()
+    create_config.assert_called_once_with(torchserve.DEFAULT_HANDLER_SERVICE)
     exists.assert_called_once_with(REQUIREMENTS_PATH)
     install_requirements.assert_called_once_with()
 
@@ -104,74 +105,13 @@ def test_start_torchserve_default_service_handler_multi_model(
         "--log-config",
         torchserve.DEFAULT_TS_LOG_FILE,
         "--models",
-        "model.mar"
+        "model=/opt/ml/model"
     ]
 
     subprocess_popen.assert_called_once_with(ts_model_server_cmd)
     sigterm.assert_called_once_with(retrieve.return_value)
 
 
-@patch("subprocess.call")
-@patch("subprocess.Popen")
-@patch("sagemaker_pytorch_serving_container.torchserve._retrieve_ts_server_process")
-@patch("sagemaker_pytorch_serving_container.torchserve._add_sigterm_handler")
-@patch("sagemaker_pytorch_serving_container.torchserve._create_torchserve_config_file")
-@patch("sagemaker_pytorch_serving_container.torchserve._adapt_to_ts_format")
-def test_start_torchserve_custom_handler_service(
-    adapt, create_config, sigterm, retrieve, subprocess_popen, subprocess_call
-):
-    handler_service = Mock()
-
-    torchserve.start_torchserve(handler_service)
-
-    adapt.assert_called_once_with(handler_service)
-
-
-@patch("sagemaker_pytorch_serving_container.torchserve._set_python_path")
-@patch("subprocess.check_call")
-@patch("os.makedirs")
-@patch("os.path.exists", return_value=False)
-def test_adapt_to_ts_format(path_exists, make_dir, subprocess_check_call, set_python_path):
-    handler_service = Mock()
-
-    torchserve._adapt_to_ts_format(handler_service)
-
-    path_exists.assert_called_once_with(torchserve.DEFAULT_TS_MODEL_DIRECTORY)
-    make_dir.assert_called_once_with(torchserve.DEFAULT_TS_MODEL_DIRECTORY)
-
-    model_archiver_cmd = [
-        "torch-model-archiver",
-        "--model-name",
-        torchserve.DEFAULT_TS_MODEL_NAME,
-        "--handler",
-        handler_service,
-        "--export-path",
-        torchserve.DEFAULT_TS_MODEL_DIRECTORY,
-        "--version",
-        "1",
-        "--extra-files",
-        environment.model_dir
-    ]
-
-    subprocess_check_call.assert_called_once_with(model_archiver_cmd)
-    set_python_path.assert_called_once_with()
-
-
-@patch("sagemaker_pytorch_serving_container.torchserve._set_python_path")
-@patch("subprocess.check_call")
-@patch("os.makedirs")
-@patch("os.path.exists", return_value=True)
-def test_adapt_to_ts_format_existing_path(
-    path_exists, make_dir, subprocess_check_call, set_python_path
-):
-    handler_service = Mock()
-
-    torchserve._adapt_to_ts_format(handler_service)
-
-    path_exists.assert_called_once_with(torchserve.DEFAULT_TS_MODEL_DIRECTORY)
-    make_dir.assert_not_called()
-
-
 @patch.dict(os.environ, {torchserve.PYTHON_PATH_ENV: PYTHON_PATH}, clear=True)
 def test_set_existing_python_path():
     torchserve._set_python_path()
@@ -193,7 +133,7 @@
 @patch("sagemaker_pytorch_serving_container.torchserve._generate_ts_config_properties")
 @patch("sagemaker_inference.utils.write_file")
 def test_create_torchserve_config_file(write_file, generate_ts_config_props):
-    torchserve._create_torchserve_config_file()
+    torchserve._create_torchserve_config_file(torchserve.DEFAULT_HANDLER_SERVICE)
 
     write_file.assert_called_once_with(
         torchserve.TS_CONFIG_FILE, generate_ts_config_props.return_value
@@ -211,7 +151,7 @@ def test_generate_ts_config_properties(env, read_file):
     env.return_value.model_server_workers = model_server_workers
     env.return_value.inference_http_port = http_port
 
-    ts_config_properties = torchserve._generate_ts_config_properties()
+    ts_config_properties = torchserve._generate_ts_config_properties(torchserve.DEFAULT_HANDLER_SERVICE)
 
     inference_address = "inference_address=http://0.0.0.0:{}\n".format(http_port)
     server_timeout = "default_response_timeout={}\n".format(model_server_timeout)
@@ -228,7 +168,7 @@
 def test_generate_ts_config_properties_default_workers(env, read_file):
     env.return_value.model_server_workers = None
 
-    ts_config_properties = torchserve._generate_ts_config_properties()
+    ts_config_properties = torchserve._generate_ts_config_properties(torchserve.DEFAULT_HANDLER_SERVICE)
 
     workers = "default_workers_per_model={}".format(None)
@@ -244,7 +184,7 @@
 def test_generate_ts_config_properties_multi_model(env, read_file):
     env.return_value.model_server_workers = None
     torchserve.ENABLE_MULTI_MODEL = True
-    ts_config_properties = torchserve._generate_ts_config_properties()
+    ts_config_properties = torchserve._generate_ts_config_properties(torchserve.DEFAULT_HANDLER_SERVICE)
     torchserve.ENABLE_MULTI_MODEL = False
 
     workers = "default_workers_per_model={}".format(None)
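
---

Note (illustration, not part of the diff): after this change the single-model
startup path no longer runs torch-model-archiver. The handler service is
written into /etc/sagemaker-ts.properties as default_service_handler, and the
unpacked model directory is registered directly via --models. The sketch below
mirrors the touched pieces; the timeout, worker, and port values are
placeholders for what environment.Environment() would supply, and the leading
torchserve flags are assumed from TorchServe's standard CLI, since the hunk in
torchserve.py only shows the tail of the command list.

    import os

    # Roughly _generate_ts_config_properties() after this patch: the handler
    # now travels through the properties file instead of a baked .mar.
    handler_service = "sagemaker_pytorch_serving_container.handler_service"
    user_defined_configuration = {
        "default_response_timeout": 60,               # env.model_server_timeout (placeholder)
        "default_workers_per_model": 1,               # env.model_server_workers (placeholder)
        "inference_address": "http://0.0.0.0:8080",   # env.inference_http_port (placeholder)
        "management_address": "http://0.0.0.0:8081",  # env.management_http_port (placeholder)
        "default_service_handler": handler_service + ":handle",
    }
    config = "\n".join("{}={}".format(k, v) for k, v in user_defined_configuration.items())

    # Roughly the launch command whose tail is changed above: the raw model
    # directory is registered under the name "model"; no archive is built.
    ts_torchserve_cmd = [
        "torchserve",
        "--start",
        "--model-store", os.path.join(os.getcwd(), ".sagemaker", "ts", "models"),
        "--ts-config", "/etc/sagemaker-ts.properties",
        "--models", "model" + "=" + "/opt/ml/model",  # DEFAULT_TS_MODEL_NAME + "=" + environment.model_dir
    ]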
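For completeness: the default_service_handler value ends in ":handle", which
points TorchServe at a module-level callable named handle in the configured
module. A minimal sketch of such a module, using a hypothetical name
my_handler_service rather than the toolkit's actual
sagemaker_pytorch_serving_container.handler_service:

    # my_handler_service.py (hypothetical). TorchServe resolves
    # "default_service_handler=<module>:handle" to this function and calls it
    # for each request batch.
    _initialized = False

    def handle(data, context):
        global _initialized
        if not _initialized:
            # Load the model here, e.g. from
            # context.system_properties.get("model_dir").
            _initialized = True
        if data is None:
            # The server may probe the handler with no payload at load time.
            return None
        return ["OK" for _ in data]  # one response per request in the batch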