diff --git a/docker/Dockerfile b/docker/Dockerfile
index 94f4a1ba99..05cfa9fe77 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -37,12 +37,12 @@ ARG BRANCH_NAME
 ARG REPO_URL=https://github.com/pytorch/serve.git
 ENV PYTHONUNBUFFERED TRUE
 
-RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
+RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \
     apt-get update && \
     apt-get upgrade -y && \
     apt-get install software-properties-common -y && \
     add-apt-repository -y ppa:deadsnakes/ppa && \
-    apt remove python-pip python3-pip && \
+    apt remove -y python-pip python3-pip && \
     DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
     ca-certificates \
     g++ \
@@ -55,6 +55,13 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
     git \
     && rm -rf /var/lib/apt/lists/*
 
+RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \
+    if [ "$USE_ROCM_VERSION" ]; then \
+        apt-get update && \
+        DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y rocm-dev amd-smi-lib \
+        && rm -rf /var/lib/apt/lists/* ; \
+    fi
+
 # Make the virtual environment and "activating" it by adding it first to the path.
 # From here on the python$PYTHON_VERSION interpreter is used and the packages
 # are installed in /home/venv which is what we need for the "runtime-image"
@@ -67,6 +74,7 @@ RUN python -m pip install -U pip setuptools
 RUN export USE_CUDA=1
 
 ARG USE_CUDA_VERSION=""
+ARG USE_ROCM_VERSION=""
 
 COPY ./ serve
 
@@ -76,7 +84,6 @@ RUN \
     git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
     fi
 
-
 WORKDIR "serve"
 RUN cp docker/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh
 
@@ -90,6 +97,14 @@ RUN \
     else \
         python ./ts_scripts/install_dependencies.py;\
         fi; \
+    elif echo "${BASE_IMAGE}" | grep -q "rocm/"; then \
+        # Install ROCm version specific binary when ROCm version is specified as a build arg
+        if [ "$USE_ROCM_VERSION" ]; then \
+            python ./ts_scripts/install_dependencies.py --rocm $USE_ROCM_VERSION;\
+        # Install the binary with the latest CPU image on a ROCm base image
+        else \
+            python ./ts_scripts/install_dependencies.py; \
+        fi; \
     # Install the CPU binary
     else \
         python ./ts_scripts/install_dependencies.py; \
@@ -111,13 +126,14 @@ FROM ${BASE_IMAGE} AS production-image
 # Re-state ARG PYTHON_VERSION to make it active in this build-stage (uses default define at the top)
 ARG PYTHON_VERSION
 ENV PYTHONUNBUFFERED TRUE
+ARG USE_ROCM_VERSION
 
-RUN --mount=type=cache,target=/var/cache/apt \
+RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \
     apt-get update && \
     apt-get upgrade -y && \
     apt-get install software-properties-common -y && \
     add-apt-repository ppa:deadsnakes/ppa -y && \
-    apt remove python-pip python3-pip && \
+    apt remove -y python-pip python3-pip && \
     DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
     python$PYTHON_VERSION \
     python3-distutils \
@@ -130,6 +146,13 @@ RUN --mount=type=cache,target=/var/cache/apt \
     && rm -rf /var/lib/apt/lists/* \
    && cd /tmp
 
+RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \
+    if [ "$USE_ROCM_VERSION" ]; then \
+        apt-get update && \
+        DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y rocm-dev amd-smi-lib \
+        && rm -rf /var/lib/apt/lists/* ; \
+    fi
+
 RUN useradd -m model-server \
     && mkdir -p /home/model-server/tmp
 
@@ -137,6 +160,11 @@ COPY --chown=model-server --from=compile-image /home/venv /home/venv
 COPY --from=compile-image /usr/local/bin/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh
 
 ENV PATH="/home/venv/bin:$PATH"
 
+RUN \
+    if [ "$USE_ROCM_VERSION" ]; then \
+        python -m pip install /opt/rocm/share/amd_smi; \
+    fi
+
 RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh \
     && chown -R model-server /home/model-server
@@ -157,13 +185,14 @@ FROM ${BASE_IMAGE} AS ci-image
 ARG PYTHON_VERSION
 ARG BRANCH_NAME
 ENV PYTHONUNBUFFERED TRUE
+ARG USE_ROCM_VERSION
 
-RUN --mount=type=cache,target=/var/cache/apt \
+RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \
     apt-get update && \
     apt-get upgrade -y && \
     apt-get install software-properties-common -y && \
     add-apt-repository -y ppa:deadsnakes/ppa && \
-    apt remove python-pip python3-pip && \
+    apt remove -y python-pip python3-pip && \
     DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
     python$PYTHON_VERSION \
     python3-distutils \
@@ -183,6 +212,12 @@ RUN --mount=type=cache,target=/var/cache/apt \
     && rm -rf /var/lib/apt/lists/* \
     && cd /tmp
 
+RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \
+    if [ "$USE_ROCM_VERSION" ]; then \
+        apt-get update && \
+        DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y rocm-dev amd-smi-lib \
+        && rm -rf /var/lib/apt/lists/* ; \
+    fi
 
 COPY --from=compile-image /home/venv /home/venv
 
@@ -190,6 +225,11 @@ ENV PATH="/home/venv/bin:$PATH"
 
 RUN python -m pip install --no-cache-dir -r https://raw.githubusercontent.com/pytorch/serve/$BRANCH_NAME/requirements/developer.txt
 
+RUN \
+    if [ "$USE_ROCM_VERSION" ]; then \
+        python -m pip install /opt/rocm/share/amd_smi; \
+    fi
+
 RUN mkdir /home/serve
 ENV TS_RUN_IN_DOCKER True
 
@@ -203,11 +243,13 @@ ARG PYTHON_VERSION
 ARG BRANCH_NAME
 ARG BUILD_FROM_SRC
 ARG LOCAL_CHANGES
+ARG USE_ROCM_VERSION
 ARG BUILD_WITH_IPEX
 ARG IPEX_VERSION=1.11.0
 ARG IPEX_URL=https://software.intel.com/ipex-whl-stable
 ENV PYTHONUNBUFFERED TRUE
-RUN --mount=type=cache,target=/var/cache/apt \
+
+RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \
     apt-get update && \
     apt-get upgrade -y && \
     apt-get install software-properties-common -y && \
@@ -227,9 +269,15 @@ RUN --mount=type=cache,target=/var/cache/apt \
     # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1009905
     openjdk-17-jdk \
     build-essential \
+    wget \
     curl \
     vim \
     numactl \
+    nodejs \
+    npm \
+    zip \
+    unzip \
+    && npm install -g newman@5.3.2 newman-reporter-htmlextra markdown-link-check \
     && if [ "$BUILD_WITH_IPEX" = "true" ]; then apt-get update && apt-get install -y libjemalloc-dev libgoogle-perftools-dev libomp-dev && ln -s /usr/lib/x86_64-linux-gnu/libjemalloc.so /usr/lib/libjemalloc.so && ln -s /usr/lib/x86_64-linux-gnu/libtcmalloc.so /usr/lib/libtcmalloc.so && ln -s /usr/lib/x86_64-linux-gnu/libiomp5.so /usr/lib/libiomp5.so; fi \
     && rm -rf /var/lib/apt/lists/*
 
@@ -243,10 +291,17 @@ RUN \
 
 COPY --from=compile-image /home/venv /home/venv
 ENV PATH="/home/venv/bin:$PATH"
+
+RUN \
+    if [ "$USE_ROCM_VERSION" ]; then \
+        python -m pip install /opt/rocm/share/amd_smi; \
+    fi
+
 WORKDIR "serve"
+
 RUN python -m pip install -U pip setuptools \
     && python -m pip install --no-cache-dir -r requirements/developer.txt \
-    && python ts_scripts/install_from_src.py \
+    && python ts_scripts/install_from_src.py --environment=dev\
     && useradd -m model-server \
     && mkdir -p /home/model-server/tmp \
     && cp docker/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh \
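For orientation (not part of the patch): the new `USE_ROCM_VERSION` build arg only triggers the `rocm-dev`/`amd-smi-lib` installs and the `--rocm` dependency path when it is set, so a direct build against a ROCm base image might look like the sketch below. The tag is illustrative; `build_image.sh` (changed later in this diff) normally assembles this command for you.

```bash
# Hypothetical manual build from the repository root; build_image.sh usually
# selects the matching rocm/dev-ubuntu-22.04 base image and tag for you.
DOCKER_BUILDKIT=1 docker build \
  --file docker/Dockerfile \
  --build-arg BASE_IMAGE=rocm/dev-ubuntu-22.04:6.2.4 \
  --build-arg USE_ROCM_VERSION=rocm62 \
  --target production-image \
  -t torchserve:rocm62 .
```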
diff --git a/docker/README.md b/docker/README.md
index 9e5ca8a229..9232f6294a 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -44,6 +44,7 @@ Use `build_image.sh` script to build the docker images. The script builds the `p
 |-bt, --buildtype|Which type of docker image to build. Can be one of : production, dev, ci|
 |-t, --tag|Tag name for image. If not specified, script uses torchserve default tag names.|
 |-cv, --cudaversion| Specify to cuda version to use. Supported values `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118`. `cu121`, Default `cu121`|
+|-rv, --rocmversion| Specify the ROCm version to use. Supported values `rocm60`, `rocm61`, `rocm62` |
 |-ipex, --build-with-ipex| Specify to build with intel_extension_for_pytorch. If not specified, script builds without intel_extension_for_pytorch.|
 |-cpp, --build-cpp specify to build TorchServe CPP|
 |-n, --nightly| Specify to build with TorchServe nightly.|
@@ -62,9 +63,9 @@ Creates a docker image with publicly available `torchserve` and `torch-model-arc
 ./build_image.sh
 ```
 
-  - To create a GPU based image with cuda 10.2. Options are `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118`
+  - To create a GPU based image with cuda 10.2. Options are `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118` for CUDA and `rocm60`, `rocm61`, `rocm62` for ROCm.
 
-  - GPU images are built with NVIDIA CUDA base image. If you want to use ONNX, please specify the base image as shown in the next section.
+  - GPU images are built with either an NVIDIA CUDA base image or an AMD ROCm base image. If you want to use ONNX, please specify the base image as shown in the next section.
 
 ```bash
 ./build_image.sh -g -cv cu117
@@ -132,6 +133,24 @@ Creates a docker image with `torchserve` and `torch-model-archiver` installed fr
 ./build_image.sh -bt dev -g -cv cu92
 ```
 
+- For creating GPU based image with ROCm version 6.0:
+
+```bash
+./build_image.sh -bt dev -g -rv rocm60
+```
+
+- For creating GPU based image with ROCm version 6.1:
+
+```bash
+./build_image.sh -bt dev -g -rv rocm61
+```
+
+- For creating GPU based image with ROCm version 6.2:
+
+```bash
+./build_image.sh -bt dev -g -rv rocm62
+```
+
 - For creating GPU based image with a different branch:
 
 ```bash
@@ -164,7 +183,6 @@ Creates a docker image with `torchserve` and `torch-model-archiver` installed fr
 ./build_image.sh -bt dev -g [-cv cu121|cu118] -cpp
 ```
 
-- For ROCm support (*experimental*), refer to [this documentation](../docs/hardware_support/amd_support.md).
 
 ## Start a container with a TorchServe image
 
@@ -204,6 +222,12 @@ For GPU latest image with gpu devices 1 and 2:
 docker run --rm -it --gpus '"device=1,2"' -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:7070:7070 -p 127.0.0.1:7071:7071 pytorch/torchserve:latest-gpu
 ```
 
+For a GPU image with ROCm support, using gpu devices 1 and 2:
+
+```bash
+docker run --rm -it --device=/dev/kfd --device=/dev/dri -e HIP_VISIBLE_DEVICES=1,2 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:7070:7070 -p 127.0.0.1:7071:7071 pytorch/torchserve:latest-gpu
+```
+
 For specific versions you can pass in the specific tag to use (ex: `0.1.1-cuda10.1-cudnn7-runtime`):
 
 ```bash
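To tie the README additions together, a hedged end-to-end sketch (the tag name is illustrative; the flags are the ones documented in the table above):

```bash
# Build a production image against ROCm 6.2 with an explicit tag ...
./build_image.sh -bt production -g -rv rocm62 -t torchserve:rocm62-production

# ... and run it, passing the AMD device nodes through as shown above.
docker run --rm -it --device=/dev/kfd --device=/dev/dri \
  -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 \
  torchserve:rocm62-production
```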
diff --git a/docker/build_image.sh b/docker/build_image.sh
index b5b9f8e87e..a6ddac2fee 100755
--- a/docker/build_image.sh
+++ b/docker/build_image.sh
@@ -11,6 +11,7 @@ BASE_IMAGE="ubuntu:20.04"
 UPDATE_BASE_IMAGE=false
 USE_CUSTOM_TAG=false
 CUDA_VERSION=""
+ROCM_VERSION=""
 USE_LOCAL_SERVE_FOLDER=false
 BUILD_WITH_IPEX=false
 BUILD_CPP=false
@@ -33,6 +34,7 @@ do
           echo "-bi, --baseimage specify base docker image. Example: nvidia/cuda:11.7.0-cudnn8-runtime-ubuntu20.04 "
           echo "-bt, --buildtype specify for type of created image. Possible values: production, dev, ci."
           echo "-cv, --cudaversion specify to cuda version to use"
+          echo "-rv, --rocmversion specify the rocm version to use"
           echo "-t, --tag specify tag name for docker image"
           echo "-lf, --use-local-serve-folder specify this option for the benchmark image if the current 'serve' folder should be used during automated benchmarks"
           echo "-ipex, --build-with-ipex specify to build with intel_extension_for_pytorch"
@@ -167,6 +169,24 @@ do
            shift
            shift
            ;;
+        -rv|--rocmversion)
+            ROCM_VERSION="$2"
+            if [ "${ROCM_VERSION}" == "rocm60" ];
+            then
+                BASE_IMAGE="rocm/dev-ubuntu-22.04:6.0.2"
+            elif [ "${ROCM_VERSION}" == "rocm61" ];
+            then
+                BASE_IMAGE="rocm/dev-ubuntu-22.04:6.1.2"
+            elif [ "${ROCM_VERSION}" == "rocm62" ];
+            then
+                BASE_IMAGE="rocm/dev-ubuntu-22.04:6.2.4"
+            else
+                echo "ROCm version not supported"
+                exit 1
+            fi
+            shift
+            shift
+            ;;
     esac
 done
 
@@ -218,6 +238,23 @@ then
       exit 1
     fi
   fi
+
+  if [[ "${MACHINE}" == "gpu" || "${ROCM_VERSION}" != "" ]];
+  then
+    if [ "${ROCM_VERSION}" == "rocm60" ];
+    then
+      BASE_IMAGE="rocm/dev-ubuntu-22.04:6.0.2-complete"
+    elif [ "${ROCM_VERSION}" == "rocm61" ];
+    then
+      BASE_IMAGE="rocm/dev-ubuntu-22.04:6.1.2-complete"
+    elif [ "${ROCM_VERSION}" == "rocm62" ];
+    then
+      BASE_IMAGE="rocm/dev-ubuntu-22.04:6.2.4-complete"
+    else
+      echo "ROCm version $ROCM_VERSION is not supported for CPP"
+      exit 1
+    fi
+  fi
 fi
 
 if [ "${BUILD_TYPE}" == "production" ]; then
@@ -232,16 +269,16 @@ if [ "${BUILD_TYPE}" == "production" ]; then
  fi
 
elif [ "${BUILD_TYPE}" == "ci" ]; then
-  DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\
+  DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\
   --build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}"\
   --build-arg LOCAL_CHANGES="${LOCAL_CHANGES}" -t "${DOCKER_TAG}" --target ci-image ../
 else
  if [ "${BUILD_CPP}" == "true" ]
  then
-    DOCKER_BUILDKIT=1 docker build --file Dockerfile.cpp --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\
+    DOCKER_BUILDKIT=1 docker build --file Dockerfile.cpp --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\
     --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" -t "${DOCKER_TAG}" --target cpp-dev-image .
  else
-    DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\
+    DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\
     --build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}" --build-arg LOCAL_CHANGES="${LOCAL_CHANGES}"\
     --build-arg BUILD_WITH_IPEX="${BUILD_WITH_IPEX}" -t "${DOCKER_TAG}" --target dev-image ../
  fi
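Roughly, the new `-rv` branch pins the base image and forwards the version, so `./build_image.sh -bt dev -g -rv rocm62` should end up invoking something like the sketch below. The tag is illustrative; the script also forwards `PYTHON_VERSION`, `BRANCH_NAME` and the other build args shown above, and runs from the `docker/` directory, hence the `../` build context.

```bash
DOCKER_BUILDKIT=1 docker build --file Dockerfile \
  --build-arg BASE_IMAGE="rocm/dev-ubuntu-22.04:6.2.4" \
  --build-arg USE_ROCM_VERSION="rocm62" \
  -t torchserve:dev-rocm62 --target dev-image ../
```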
diff --git a/docs/hardware_support/amd_support.md b/docs/hardware_support/amd_support.md
index 55de40f6d4..bc2090917d 100644
--- a/docs/hardware_support/amd_support.md
+++ b/docs/hardware_support/amd_support.md
@@ -5,7 +5,7 @@ TorchServe can be run on any combination of operating system and device that is
 
 ## Supported Versions of ROCm
 
-The current stable `major.patch` version of ROCm and the previous path version will be supported. For example version `N.2` and `N.1` where `N` is the current major version.
+The current stable `major.patch` version of ROCm and the previous patch version will be supported. For example version `N.2` and `N.1` where `N` is the current major version.
 
 ## Installation
 
@@ -35,7 +35,7 @@ The current stable `major.patch` version of ROCm and the previous path version w
   - install the dependencies needed for ROCm support.
 
     ```bash
-    python ./ts_scripts/install_dependencies.py --rocm=rocm61
+    python ./ts_scripts/install_dependencies.py --rocm=rocm62
     python ./ts_scripts/install_from_src.py
     ```
   - enable amd-smi in the python virtual environment
@@ -60,20 +60,6 @@ If you have 8 accelerators but only want TorchServe to see the last four of them
 > ⚠️ Setting both `CUDA_VISIBLE_DEVICES` and `HIP_VISIBLE_DEVICES` may cause unintended behaviour and should be avoided.
 > Doing so may cause an exception in the future.
 
-## Docker
-
-**In Development**
-
-`Dockerfile.rocm` provides preliminary ROCm support for TorchServe.
-
-Building and running `dev-image`:
-
-```bash
-docker build --file docker/Dockerfile.rocm --target dev-image -t torch-serve-dev-image-rocm --build-arg USE_ROCM_VERSION=rocm62 --build-arg BUILD_FROM_SRC=true .
-
-docker run -it --rm --device=/dev/kfd --device=/dev/dri torch-serve-dev-image-rocm bash
-```
-
 ## Example Usage
 
 After installing TorchServe with the required dependencies for ROCm you should be ready to serve your model.
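With the standalone `Dockerfile.rocm` section removed above, the Docker route for ROCm now goes through `build_image.sh`; a minimal sketch of the replacement workflow (the image name is illustrative):

```bash
# Build a dev image against ROCm 6.2 and start a shell in it,
# passing the AMD device nodes through to the container.
./build_image.sh -bt dev -g -rv rocm62 -t torchserve:dev-rocm62
docker run -it --rm --device=/dev/kfd --device=/dev/dri torchserve:dev-rocm62 bash
```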
diff --git a/ts_scripts/api_utils.py b/ts_scripts/api_utils.py
index 02e1fa4bc3..80db6c8548 100755
--- a/ts_scripts/api_utils.py
+++ b/ts_scripts/api_utils.py
@@ -2,6 +2,7 @@
 import os
 import shutil
 import sys
+import time
 
 REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
 sys.path.append(REPO_ROOT)
@@ -108,10 +109,35 @@ def cleanup_model_store():
         os.remove(f)
 
 
-def move_logs(log_file, artifact_dir):
+def move_logs(log_file, artifact_dir, retries=5):
     logs_dir = os.path.join("logs")
-    os.rename(log_file, os.path.join(logs_dir, log_file))  # mv file logs/
-    os.rename(logs_dir, os.path.join(artifact_dir, logs_dir))  # mv logs/ dir
+
+    if not os.path.exists(logs_dir):
+        os.makedirs(logs_dir)
+
+    shutil.move(log_file, os.path.join(logs_dir, log_file))  # mv file logs/
+
+    destination_dir = os.path.join(artifact_dir, logs_dir)
+
+    for attempt in range(retries):
+        try:
+            if os.path.exists(destination_dir):
+                # Merge contents if destination directory already exists
+                for root, dirs, files in os.walk(logs_dir):
+                    for file in files:
+                        shutil.move(
+                            os.path.join(root, file),
+                            os.path.join(destination_dir, file),
+                        )
+                shutil.rmtree(logs_dir)  # Remove the empty logs directory
+            else:
+                shutil.move(logs_dir, destination_dir)  # mv logs/ dir
+            break
+        except:
+            if attempt < retries - 1:
+                time.sleep(2)
+            else:
+                raise
 
 
 def trigger_management_tests():