Skip to content

Commit

Permalink
Fix integration tests and update Python versions (#154)
Browse files Browse the repository at this point in the history
* Fix integration tests and update Python versions

* Fix test dependency versions, change GPU instance type, update GPU MNIST script, add environment variable to GPU tests

* Modify Dockerfiles and set default value of env_vars to None

* Remove instance type from SageMaker integration tests

* Add instance_type to SageMaker tests

* Fix buildspec.yml
  • Loading branch information
sachanub authored Oct 9, 2023
1 parent c302b64 commit 1fbd3f7
Show file tree
Hide file tree
Showing 14 changed files with 188 additions and 107 deletions.
2 changes: 1 addition & 1 deletion buildspec-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ phases:
# run unit tests
- AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION=
tox -e py36,py37 -- test/unit
tox -e py38,py39,py310 -- test/unit

# run local integ tests
#- $(aws ecr get-login --no-include-email --region us-west-2)
Expand Down
117 changes: 65 additions & 52 deletions buildspec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@ version: 0.2

env:
variables:
FRAMEWORK_VERSION: '1.10.2'
EIA_FRAMEWORK_VERSION: '1.3.1'
FRAMEWORK_VERSIONS: '2.0.0 2.0.1'
# EIA_FRAMEWORK_VERSION: '1.3.1'
CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
GPU_INSTANCE_TYPE: 'ml.p3.8xlarge'
EIA_ACCELERATOR_TYPE: 'ml.eia2.medium'
GPU_INSTANCE_TYPE: 'ml.g4dn.12xlarge'
# EIA_ACCELERATOR_TYPE: 'ml.eia2.medium'
ECR_REPO: 'sagemaker-test'
GITHUB_REPO: 'sagemaker-pytorch-serving-container'
DLC_ACCOUNT: '763104351884'
SETUP_FILE: 'setup_cmds.sh'
SETUP_CMDS: '#!/bin/bash\npython3.6 -m pip install --upgrade pip\npython3.6 -m pip install -U -e .\npython3.6 -m pip install -U -e .[test]'
SETUP_CMDS: '#!/bin/bash\npython3.8 -m pip install --upgrade pip\npython3.8 -m pip install -U -e .\npython3.8 -m pip install -U -e .[test]'


phases:
Expand All @@ -33,68 +33,81 @@ phases:
- tox -e flake8,twine

# run unit tests
- tox -e py36,py37 test/unit
- tox -e py38,py39,py310 test/unit

# define tags
- GENERIC_TAG="$FRAMEWORK_VERSION-pytorch-$BUILD_ID"
- DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"
- DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"
- DLC_EIA_TAG="$EIA_FRAMEWORK_VERSION-dlc-eia-$BUILD_ID"
# define EIA tag
# - DLC_EIA_TAG="$EIA_FRAMEWORK_VERSION-dlc-eia-$BUILD_ID"

# run local CPU integration tests (build and push the image to ECR repo)
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local -vv -rA -s --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local -vv -rA -s --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"

# launch remote GPU instance
- |
for FRAMEWORK_VERSION in $FRAMEWORK_VERSIONS;
do
DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID";
test_cmd="IGNORE_COVERAGE=- tox -e py38 -- test/integration/local -vv -rA -s --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG";
execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg";
docker system prune --all --force;
done
# launch remote GPU instance with Deep Learning AMI GPU PyTorch 1.9 (Ubuntu 20.04)
- prefix='ml.'
- instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
- create-key-pair
- launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest


# build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
- python3 setup.py sdist
- build_dir="test/container/$FRAMEWORK_VERSION"
- $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
- docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
# push DLC GPU image to ECR
- $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
- docker push $PREPROD_IMAGE:$DLC_GPU_TAG

# run GPU local integration tests
- printf "$SETUP_CMDS" > $SETUP_FILE
# no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
- generic_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local -vv -rA -s --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
- dlc_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local -vv -rA -s --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
- |
for FRAMEWORK_VERSION in $FRAMEWORK_VERSIONS;
do
create-key-pair;
launch-ec2-instance --instance-type $instance_type --ami-name ami-03e3ef8c92fdb39ad;
DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID";
build_dir="test/container/$FRAMEWORK_VERSION";
docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .;
$(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION);
docker push $PREPROD_IMAGE:$DLC_GPU_TAG;
printf "$SETUP_CMDS" > $SETUP_FILE;
dlc_cmd="IGNORE_COVERAGE=- tox -e py38 -- test/integration/local -vv -rA -s --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG";
test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --python-version \"3.8\"";
execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg";
docker system prune --all --force;
cleanup-gpu-instances;
cleanup-key-pairs;
done
# run CPU sagemaker integration tests
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
- |
for FRAMEWORK_VERSION in $FRAMEWORK_VERSIONS;
do
DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID";
test_cmd="IGNORE_COVERAGE=- tox -e py38 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG";
execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg";
docker system prune --all --force;
done
# run GPU sagemaker integration tests
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
- |
for FRAMEWORK_VERSION in $FRAMEWORK_VERSIONS;
do
DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID";
test_cmd="IGNORE_COVERAGE=- tox -e py38 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG";
execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg";
docker system prune --all --force;
done
# run EIA sagemaker integration tests
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --build-image --push-image --dockerfile-type dlc.eia --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $EIA_FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --accelerator-type $EIA_ACCELERATOR_TYPE --tag $DLC_EIA_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
# - test_cmd="IGNORE_COVERAGE=- tox -e py38 -- test/integration/sagemaker --build-image --push-image --dockerfile-type dlc.eia --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $EIA_FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --accelerator-type $EIA_ACCELERATOR_TYPE --tag $DLC_EIA_TAG"
# - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg"

finally:
# shut down remote GPU instance
- cleanup-gpu-instances
- cleanup-key-pairs

# remove ECR image
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_EIA_TAG
- |
for FRAMEWORK_VERSION in $FRAMEWORK_VERSIONS;
do
DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID";
DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID";
aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG;
aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG;
done
# - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_EIA_TAG
16 changes: 8 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,19 +45,19 @@ def read(fname):
"Natural Language :: English",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python",
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10'
],

# We don't declare our dependency on torch here because we build with
# different packages for different variants
install_requires=['numpy', 'retrying', 'sagemaker-inference>=1.3.1'],
install_requires=['numpy==1.24.4', 'retrying==1.3.4', 'sagemaker-inference==1.10.0'],
extras_require={
'test': ['boto3>=1.10.44', 'coverage==4.5.3', 'docker-compose==1.23.2', 'flake8==3.7.7', 'Flask==1.1.1',
'mock==2.0.0', 'pytest==4.4.0', 'pytest-cov==2.7.1', 'pytest-xdist==1.28.0', 'PyYAML==3.10',
'sagemaker==1.56.3', 'sagemaker-containers>=2.5.4', 'six==1.12.0', 'requests==2.20.0',
'requests_mock==1.6.0', 'torch==1.6.0', 'torchvision==0.7.0', 'tox==3.7.0']
'test': ['boto3==1.28.60', 'coverage==7.3.2', 'docker-compose==1.29.2', 'flake8==6.1.0', 'Flask==3.0.0',
'mock==5.1.0', 'pytest==7.4.2', 'pytest-cov==4.1.0', 'pytest-xdist==3.3.1', 'PyYAML==5.4.1',
'sagemaker==2.125.0', 'six==1.16.0', 'requests==2.31.0',
'requests_mock==1.11.0', 'torch==2.1.0', 'torchvision==0.16.0', 'tox==4.11.3']
},

entry_points={
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def default_output_fn(self, prediction, accept):
Returns: output data serialized
"""
if type(prediction) == torch.Tensor:
if type(prediction) is torch.Tensor:
prediction = prediction.detach().cpu().numpy().tolist()

for content_type in utils.parse_accept(accept):
Expand Down
7 changes: 4 additions & 3 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,16 @@ def pytest_addoption(parser):
parser.addoption('--build-image', '-B', action='store_true')
parser.addoption('--push-image', '-P', action='store_true')
parser.addoption('--dockerfile-type', '-T',
choices=['dlc.cpu', 'dlc.gpu', 'dlc.eia', 'pytorch'],
default='pytorch')
# choices=['dlc.cpu', 'dlc.gpu', 'dlc.eia', 'pytorch'],
choices=['dlc.cpu', 'dlc.gpu'],
default='dlc.cpu')
parser.addoption('--dockerfile', '-D', default=None)
parser.addoption('--aws-id', default=None)
parser.addoption('--instance-type')
parser.addoption('--accelerator-type')
parser.addoption('--docker-base-name', default='sagemaker-pytorch-inference')
parser.addoption('--region', default='us-west-2')
parser.addoption('--framework-version', default="1.6.0")
parser.addoption('--framework-version', default="2.0.0")
parser.addoption('--py-version', choices=['2', '3'], default='3')
# Processor is still "cpu" for EIA tests
parser.addoption('--processor', choices=['gpu', 'cpu'], default='cpu')
Expand Down
8 changes: 8 additions & 0 deletions test/container/2.0.0/Dockerfile.dlc.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:2.0.0-cpu-py310-ubuntu20.04-sagemaker

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz

RUN pip uninstall -y sagemaker_pytorch_inference && \
pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
rm /sagemaker_pytorch_inference.tar.gz
8 changes: 8 additions & 0 deletions test/container/2.0.0/Dockerfile.dlc.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:2.0.0-gpu-py310-cu118-ubuntu20.04-sagemaker

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz

RUN pip uninstall -y sagemaker_pytorch_inference && \
pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
rm /sagemaker_pytorch_inference.tar.gz
8 changes: 8 additions & 0 deletions test/container/2.0.1/Dockerfile.dlc.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:2.0.1-cpu-py310-ubuntu20.04-sagemaker

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz

RUN pip uninstall -y sagemaker_pytorch_inference && \
pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
rm /sagemaker_pytorch_inference.tar.gz
8 changes: 8 additions & 0 deletions test/container/2.0.1/Dockerfile.dlc.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:2.0.1-gpu-py310-cu118-ubuntu20.04-sagemaker

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz

RUN pip uninstall -y sagemaker_pytorch_inference && \
pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
rm /sagemaker_pytorch_inference.tar.gz
Loading

0 comments on commit 1fbd3f7

Please sign in to comment.