From 539002506aa98ca18cb4b2713617d1fcdb169279 Mon Sep 17 00:00:00 2001
From: Ankith Gunapal
Date: Fri, 14 Jul 2023 11:01:20 -0700
Subject: [PATCH] Docker Regression Tests Github action (#2403)

* Testing docker regression

* Run docker regression nightly

* Addressed review comments

* code cleanup

* clean up gpu runner

* changes based on feedback

* lint failure

---
 .github/workflows/regression_tests_docker.yml | 55 +++++++++++++++++++
 .github/workflows/regression_tests_gpu.yml    |  9 ++-
 docker/Dockerfile                             | 47 +++++++++++++++-
 docker/README.md                              |  4 +-
 docker/build_image.sh                         |  5 +-
 examples/dcgan_fashiongen/create_mar.sh       |  1 -
 ...est_example_intel_extension_for_pytorch.py | 16 ++++++
 .../test_example_scriptable_tokenzier.py      |  6 +-
 test/pytest/test_handler.py                   |  4 ++
 test/pytest/test_sm_mme_requirements.py       | 12 ++++
 test/pytest/test_torch_compile.py             | 12 ++++
 test/regression_tests.py                      | 19 +++----
 ts_scripts/spellcheck_conf/wordlist.txt       |  1 +
 13 files changed, 167 insertions(+), 24 deletions(-)
 create mode 100644 .github/workflows/regression_tests_docker.yml

diff --git a/.github/workflows/regression_tests_docker.yml b/.github/workflows/regression_tests_docker.yml
new file mode 100644
index 0000000000..4f952b2382
--- /dev/null
+++ b/.github/workflows/regression_tests_docker.yml
@@ -0,0 +1,55 @@
+name: Run Regression Tests on Docker
+
+on:
+  # run every day at 5:15am
+  schedule:
+    - cron: '15 5 * * *'
+
+concurrency:
+  group: ci-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/master' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  docker-regression:
+    strategy:
+      fail-fast: false
+      matrix:
+        hardware: [ubuntu-20.04, [self-hosted, regression-test-gpu]]
+    runs-on:
+      - ${{ matrix.hardware }}
+    steps:
+      - name: Clean up previous run
+        run: |
+          echo "Cleaning up previous run"
+          ls -la ./
+          sudo rm -rf ./* || true
+          sudo rm -rf ./.??* || true
+          ls -la ./
+          docker system prune -f
+      - name: Checkout TorchServe
+        uses: actions/checkout@v3
+      - name: Branch name
+        run: |
+          echo $GITHUB_REF_NAME
+      - name: Build CPU Docker Image
+        if: contains(matrix.hardware, 'ubuntu')
+        run: |
+          cd docker
+          ./build_image.sh -bt ci -b $GITHUB_REF_NAME -t pytorch/torchserve:ci
+      - name: Build GPU Docker Image
+        if: false == contains(matrix.hardware, 'ubuntu')
+        run: |
+          cd docker
+          ./build_image.sh -g -cv cu117 -bt ci -b $GITHUB_REF_NAME -t pytorch/torchserve:ci
+      - name: Torchserve GPU Regression Tests
+        if: false == contains(matrix.hardware, 'ubuntu')
+        run: |
+          docker run --gpus all -v $GITHUB_WORKSPACE:/home/serve pytorch/torchserve:ci
+      - name: Torchserve CPU Regression Tests
+        if: contains(matrix.hardware, 'ubuntu')
+        run: |
+          docker run -v $GITHUB_WORKSPACE:/home/serve pytorch/torchserve:ci
+      - name: Cleanup Docker Images
+        if: success()
+        run: |
+          docker system prune -f && docker rmi pytorch/torchserve:ci
diff --git a/.github/workflows/regression_tests_gpu.yml b/.github/workflows/regression_tests_gpu.yml
index ff5a2bc8ea..cdf0812230 100644
--- a/.github/workflows/regression_tests_gpu.yml
+++ b/.github/workflows/regression_tests_gpu.yml
@@ -20,11 +20,10 @@
       - name: Clean up previous run
         run: |
          echo "Cleaning up previous run"
-          cd $RUNNER_WORKSPACE
-          pwd
-          cd ..
-          pwd
-          rm -rf _tool
+          ls -la ./
+          sudo rm -rf ./* || true
+          sudo rm -rf ./.??* || true
+          ls -la ./
       - name: Update git
         run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt-get update && sudo apt-get install git -y
       - name: Check git version
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 333f585047..6d811a07db 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -85,7 +85,7 @@ RUN \
 RUN python -m pip install --no-cache-dir torchserve torch-model-archiver torch-workflow-archiver
 
 # Final image for production
-FROM ${BASE_IMAGE} AS runtime-image
+FROM ${BASE_IMAGE} AS production-image
 # Re-state ARG PYTHON_VERSION to make it active in this build-stage (uses the default defined at the top)
 ARG PYTHON_VERSION
 ENV PYTHONUNBUFFERED TRUE
@@ -130,3 +130,48 @@ WORKDIR /home/model-server
 ENV TEMP=/home/model-server/tmp
 ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"]
 CMD ["serve"]
+
+# Final image for docker regression
+FROM ${BASE_IMAGE} AS ci-image
+# Re-state ARG PYTHON_VERSION to make it active in this build-stage (uses the default defined at the top)
+ARG PYTHON_VERSION
+ARG BRANCH_NAME
+ENV PYTHONUNBUFFERED TRUE
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install software-properties-common -y && \
+    add-apt-repository -y ppa:deadsnakes/ppa && \
+    apt remove python-pip python3-pip && \
+    DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+    python$PYTHON_VERSION \
+    python3-distutils \
+    python$PYTHON_VERSION-dev \
+    python$PYTHON_VERSION-venv \
+    # using openjdk-17-jdk due to circular dependency (ca-certificates) bug in openjdk-17-jre-headless debian package
+    # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1009905
+    openjdk-17-jdk \
+    build-essential \
+    wget \
+    numactl \
+    nodejs \
+    npm \
+    zip \
+    unzip \
+    && npm install -g newman newman-reporter-htmlextra markdown-link-check \
+    && rm -rf /var/lib/apt/lists/* \
+    && cd /tmp
+
+
+COPY --from=compile-image /home/venv /home/venv
+
+ENV PATH="/home/venv/bin:$PATH"
+
+RUN python -m pip install --no-cache-dir -r https://raw.githubusercontent.com/pytorch/serve/$BRANCH_NAME/requirements/developer.txt
+
+RUN mkdir /home/serve
+ENV TS_RUN_IN_DOCKER True
+
+WORKDIR /home/serve
+CMD ["python", "test/regression_tests.py"]
diff --git a/docker/README.md b/docker/README.md
index fc65749532..b4a9e707fa 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -28,13 +28,13 @@ cd serve/docker
 # Create TorchServe docker image
 
-Use `build_image.sh` script to build the docker images. The script builds the `production`, `dev` and `codebuild` docker images.
+Use `build_image.sh` script to build the docker images. The script builds the `production`, `dev`, `ci` and `codebuild` docker images.
 
 | Parameter | Description |
 |------|------|
 |-h, --help|Show script help|
 |-b, --branch_name|Specify a branch name to use. Default: master |
 |-g, --gpu|Build image with GPU based ubuntu base image|
-|-bt, --buildtype|Which type of docker image to build. Can be one of : production, dev, codebuild|
+|-bt, --buildtype|Which type of docker image to build. Can be one of: production, dev, ci, codebuild|
 |-t, --tag|Tag name for image. If not specified, script uses torchserve default tag names.|
 |-cv, --cudaversion| Specify the cuda version to use. Supported values `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118`. Default `cu117`|
 |-ipex, --build-with-ipex| Specify to build with intel_extension_for_pytorch. If not specified, script builds without intel_extension_for_pytorch.|
diff --git a/docker/build_image.sh b/docker/build_image.sh
index 0a8f3b6a39..d652c12896 100755
--- a/docker/build_image.sh
+++ b/docker/build_image.sh
@@ -137,7 +137,10 @@ fi
 
 if [ "${BUILD_TYPE}" == "production" ]
 then
-  DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}" -t "${DOCKER_TAG}" .
+  DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}" -t "${DOCKER_TAG}" --target production-image .
+elif [ "${BUILD_TYPE}" == "ci" ]
+then
+  DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}" --build-arg BRANCH_NAME="${BRANCH_NAME}" -t "${DOCKER_TAG}" --target ci-image .
 elif [ "${BUILD_TYPE}" == "benchmark" ]
 then
   DOCKER_BUILDKIT=1 docker build --pull --no-cache --file Dockerfile.benchmark --build-arg USE_LOCAL_SERVE_FOLDER=$USE_LOCAL_SERVE_FOLDER --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg CUDA_VERSION="${CUDA_VERSION}" --build-arg MACHINE_TYPE="${MACHINE}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}" -t "${DOCKER_TAG}" .
diff --git a/examples/dcgan_fashiongen/create_mar.sh b/examples/dcgan_fashiongen/create_mar.sh
index 03c038031b..999cb31935 100755
--- a/examples/dcgan_fashiongen/create_mar.sh
+++ b/examples/dcgan_fashiongen/create_mar.sh
@@ -16,7 +16,6 @@ function cleanup {
 trap cleanup EXIT
 
 # Download and Extract model's source code
-sudo apt-get install zip unzip -y
 wget https://github.com/facebookresearch/pytorch_GAN_zoo/archive/$SRCZIP
 unzip $SRCZIP
 
diff --git a/test/pytest/test_example_intel_extension_for_pytorch.py b/test/pytest/test_example_intel_extension_for_pytorch.py
index 5ff882a800..e92a626b5e 100644
--- a/test/pytest/test_example_intel_extension_for_pytorch.py
+++ b/test/pytest/test_example_intel_extension_for_pytorch.py
@@ -88,6 +88,10 @@ def scale_workers_with_core_pinning(scaled_num_workers):
     or ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
     reason="Make sure intel-extension-for-pytorch is installed and torch.backends.xeon.run_cpu is available",
 )
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
 def test_single_worker_affinity():
     num_workers = 1
     worker_idx = 0
@@ -112,6 +116,10 @@ def test_single_worker_affinity():
     or ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
     reason="Make sure intel-extension-for-pytorch is installed and torch.backends.xeon.run_cpu is available",
 )
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
 def test_multi_worker_affinity():
     num_workers = 2
     setup_torchserve()
@@ -138,6 +146,10 @@ def test_multi_worker_affinity():
     or ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
     reason="Make sure intel-extension-for-pytorch is installed and torch.backends.xeon.run_cpu is available",
 )
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
 def test_worker_scale_up_affinity():
     initial_num_workers = 1
     setup_torchserve()
@@ -171,6 +183,10 @@ def test_worker_scale_up_affinity():
     or ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
     reason="Make sure intel-extension-for-pytorch is installed and torch.backends.xeon.run_cpu is available",
 )
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
 def test_worker_scale_down_affinity():
     initial_num_workers = 2
     setup_torchserve()
diff --git a/test/pytest/test_example_scriptable_tokenzier.py b/test/pytest/test_example_scriptable_tokenzier.py
index ca1909edc6..8c1d617270 100644
--- a/test/pytest/test_example_scriptable_tokenzier.py
+++ b/test/pytest/test_example_scriptable_tokenzier.py
@@ -318,8 +318,10 @@ def test_inference_with_pretrained_model(model_store, test_file, torchserve):
     assert "Positive" in result_entries
 
     assert float(result_entries["Negative"]) == pytest.approx(
-        0.0001851904089562595, 1e-3
+        0.0001851904089562595, abs=1e-6
+    )
+    assert float(result_entries["Positive"]) == pytest.approx(
+        0.9998148083686829, abs=1e-6
     )
-    assert float(result_entries["Positive"]) == pytest.approx(0.9998148083686829, 1e-3)
 
     test_utils.unregister_model(model_name)
diff --git a/test/pytest/test_handler.py b/test/pytest/test_handler.py
index 5f46ba2275..a14af050a4 100644
--- a/test/pytest/test_handler.py
+++ b/test/pytest/test_handler.py
@@ -285,6 +285,10 @@ def test_kserve_mnist_model_register_and_inference_on_valid_model_explain():
     test_utils.unregister_model("mnist")
 
 
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
 def test_huggingface_bert_batch_inference():
     batch_size = 2
     batch_delay = 10000  # 10 seconds
diff --git a/test/pytest/test_sm_mme_requirements.py b/test/pytest/test_sm_mme_requirements.py
index a667b72883..f933efcfae 100644
--- a/test/pytest/test_sm_mme_requirements.py
+++ b/test/pytest/test_sm_mme_requirements.py
@@ -15,6 +15,10 @@
 )
 
 
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
 def test_no_model_loaded():
     """
     Validates that TorchServe returns response code 404 if no model is loaded.
@@ -34,6 +38,10 @@ def test_no_model_loaded():
     not ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
     reason="Test to be run on GPU only",
 )
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
 def test_oom_on_model_load():
     """
     Validates that TorchServe returns response code 507 if there is OOM on model loading.
@@ -63,6 +71,10 @@ def test_oom_on_model_load():
     not ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
     reason="Test to be run on GPU only",
 )
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
 def test_oom_on_invoke():
     # Create model store directory
     pathlib.Path(test_utils.MODEL_STORE).mkdir(parents=True, exist_ok=True)
diff --git a/test/pytest/test_torch_compile.py b/test/pytest/test_torch_compile.py
index 9983707c8b..7666b3a4e7 100644
--- a/test/pytest/test_torch_compile.py
+++ b/test/pytest/test_torch_compile.py
@@ -64,6 +64,10 @@ def test_start_torchserve(self):
         assert len(glob.glob("logs/model_log.log")) == 1
         assert len(glob.glob("logs/ts_log.log")) == 1
 
+    @pytest.mark.skipif(
+        os.environ.get("TS_RUN_IN_DOCKER", False),
+        reason="Test to be run outside docker",
+    )
     def test_server_status(self):
         result = subprocess.run(
             "curl http://localhost:8080/ping",
@@ -75,6 +79,10 @@ def test_server_status(self):
         expected_server_status = json.loads(expected_server_status_str)
         assert json.loads(result.stdout) == expected_server_status
 
+    @pytest.mark.skipif(
+        os.environ.get("TS_RUN_IN_DOCKER", False),
+        reason="Test to be run outside docker",
+    )
     def test_registered_model(self):
         result = subprocess.run(
             "curl http://localhost:8081/models",
@@ -86,6 +94,10 @@ def test_registered_model(self):
         expected_registered_model = json.loads(expected_registered_model_str)
         assert json.loads(result.stdout) == expected_registered_model
 
+    @pytest.mark.skipif(
+        os.environ.get("TS_RUN_IN_DOCKER", False),
+        reason="Test to be run outside docker",
+    )
     def test_serve_inference(self):
         request_data = {"instances": [[1.0], [2.0], [3.0]]}
         request_json = json.dumps(request_data)
diff --git a/test/regression_tests.py b/test/regression_tests.py
index 2f7a6221a7..a4f0bdd468 100644
--- a/test/regression_tests.py
+++ b/test/regression_tests.py
@@ -1,28 +1,23 @@
-import sys
 import os
-from pygit2 import Repository
+import sys
 
 # To help discover local modules
 REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
 sys.path.append(REPO_ROOT)
 
+import datetime
+
+from ts_scripts import marsgen as mg
+from ts_scripts.api_utils import test_api
 from ts_scripts.install_from_src import install_from_src
 from ts_scripts.regression_utils import test_regression
-from ts_scripts.api_utils import test_api
-from ts_scripts import print_env_info as build_hdr_printer
 from ts_scripts.utils import check_python_version
-from ts_scripts import marsgen as mg
-
-import datetime
 
 now = datetime.datetime.now()
 print("Current date and time : " + now.strftime("%Y-%m-%d %H:%M:%S"))
 
 check_python_version()
 
-git_branch = Repository('.').head.shorthand
-build_hdr_printer.main(git_branch)
-
 # Install from source
 install_from_src()
 
@@ -32,10 +27,10 @@
 # Run newman api tests
 test_api(
     "all"
-) #"all" > management, inference, increased_timeout_inference, https collections
+)  # "all" > management, inference, increased_timeout_inference, https collections
 
 # Run regression tests
 test_regression()
 
 # delete mar_gen_dir
-mg.delete_model_store_gen_dir()
\ No newline at end of file
+mg.delete_model_store_gen_dir()
diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt
index 5c5e1f4b88..f6474c9370 100644
--- a/ts_scripts/spellcheck_conf/wordlist.txt
+++ b/ts_scripts/spellcheck_conf/wordlist.txt
@@ -1062,4 +1062,5 @@ XLA
 inferentia
 ActionSLAM
 statins
+ci
 chatGPT
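
The CI image introduced here can also be exercised locally. The commands below mirror the workflow steps above and are a sketch rather than part of the patch: the `pytorch/torchserve:ci` tag matches the workflow, while `master` stands in for whichever branch is under test.

    # From the docker/ directory of a TorchServe checkout: build the CPU CI image
    ./build_image.sh -bt ci -b master -t pytorch/torchserve:ci

    # Or the GPU variant (CUDA 11.7 base image)
    ./build_image.sh -g -cv cu117 -bt ci -b master -t pytorch/torchserve:ci

    # Run the regression suite from the repo root; the checkout is mounted at
    # /home/serve, where the ci-image CMD runs test/regression_tests.py
    docker run -v $(pwd):/home/serve pytorch/torchserve:ci             # CPU
    docker run --gpus all -v $(pwd):/home/serve pytorch/torchserve:ci  # GPU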
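
A note on the `TS_RUN_IN_DOCKER` guard used by the new skip markers: `os.environ.get("TS_RUN_IN_DOCKER", False)` returns the variable's raw string when it is set (the ci-image sets "True") and `False` otherwise, and `pytest.mark.skipif` only checks truthiness, so any non-empty value, even "0" or "false", triggers the skip. A sketch of flipping the guard outside the CI image; the `-e` override and the bare-host invocation are illustrations, not part of the patch:

    # Force the in-docker skips in any container of this image
    docker run -e TS_RUN_IN_DOCKER=True -v $(pwd):/home/serve pytorch/torchserve:ci

    # Equivalent when running pytest directly on a host checkout with the
    # developer requirements installed
    TS_RUN_IN_DOCKER=True python -m pytest test/pytest/test_handler.py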