diff --git a/.github/workflows/image.yml b/.github/workflows/image.yml deleted file mode 100644 index 4afa1eb..0000000 --- a/.github/workflows/image.yml +++ /dev/null @@ -1,171 +0,0 @@ -name: "Build image and test" - -on: - schedule: - - cron: "20 4 * * 1" # once a week - workflow_dispatch: - - push: - branches: [main] - - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -env: - KIND_CLUSTER_NAME: "kserve-testing" - ISVC_NAME: "vllm-tgis-isvc" - TEST_NS: "kserve-demo" - # note: knative serving will try to resolve the image tag unless the registry is kind.local - # See the deployment configmap: - # registries-skipping-tag-resolving: "kind.local,ko.local,dev.local" - DEV_IMAGE: "kind.local/vllm-tgis:dev" - QUAY_IMAGE: "quay.io/dtrifiro/vllm-tgis" - -jobs: - build-image: - timeout-minutes: 40 - name: "Build image" - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Free Disk Space - uses: jlumbroso/free-disk-space@v1.3.1 - with: - tool-cache: false - large-packages: false - docker-images: false - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - # TODO: setup caching for mount=type=cache ? - # - .nox (nox envs) - # - /root/.cache/pip - # https://docs.docker.com/build/ci/github-actions/cache/#cache-mounts - - - name: Login to Quay - uses: docker/login-action@v3 - with: - registry: quay.io - username: ${{ secrets.QUAY_USERNAME }} - password: ${{ secrets.QUAY_TOKEN }} - - - name: Build and export - uses: docker/build-push-action@v5 - with: - context: . 
- file: ./Dockerfile - tags: ${{ env.DEV_IMAGE }} - # outputs: type=oci,dest=/tmp/image.tar - outputs: type=docker,dest=/tmp/image.tar - cache-from: type=gha - cache-to: type=gha,mode=max - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.QUAY_IMAGE }} - tags: | - type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'main') }} - type=sha - type=ref,event=branch - type=ref,event=pr - - - name: Push image - uses: docker/build-push-action@v5 - with: - context: . - file: ./Dockerfile - load: true - cache-from: type=gha - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - - - name: "Push to quay" # NOTE: we need this step since build and push hangs on push - run: | - docker push --all-tags "${QUAY_IMAGE}" - - # - name: Upload artifact - # uses: actions/upload-artifact@v4 - # with: - # name: vllm-tgis - # path: /tmp/image.tar - - # Temporarily disabled: need to make sure the runner can run vLLM (AVX512 instruction set or GPU) - # kserve-smoke-test: - # needs: build-image - - # name: Kserve Smoke Test - # runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v4 - - # - name: Setup kind/kserve - # uses: dtrifiro/setup-kserve@v0.0.2 - # with: - # cluster_name: ${{ env.KIND_CLUSTER_NAME }} - # namespace: ${{ env.TEST_NS }} - - # - name: Download built image - # uses: actions/download-artifact@v4 - - # - name: Load built image into kind - # run: | - # kind load image-archive --name ${{ env.KIND_CLUSTER_NAME }} vllm-tgis/image.tar - - # - name: Free Disk Space - # uses: jlumbroso/free-disk-space@v1.3.1 - # with: - # tool-cache: false - - # - name: Setup flan-t5-small model volume - # run: | - # sed 's|quay.io/dtrifiro/vllm-tgis:fast|${{ env.DEV_IMAGE }}|g' .github/test/kserve/setup.yaml | \ - # kubectl apply -f - - - # max_retries=10 - # wait_time=60s - # until kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/setup-flan-t5-small --timeout ${wait_time}; do - # echo 
"Current status:" - # kubectl describe pod,pv,pvc - # max_retries=$((max_retries-1)) - # if [[ $max_retries -le 0 ]]; then - # echo "Failed to setup model" - # kubectl logs pod/setup-flan-t5-small --all-containers - # exit 1 - # fi - # echo "-------------------" - # done - - # - name: Deploy ServingRuntime/InferenceService - # run: | - # sed 's|quay.io/opendatahub/vllm:fast|${{ env.DEV_IMAGE }}|g' .github/test/kserve/vllm-tgis.yaml | \ - # kubectl apply -f - - - # max_retries=10 - # wait_time=60s - # until kubectl wait isvc/${ISVC_NAME} --for=condition=Ready --timeout=${wait_time}; do - # echo "Current status:" - # kubectl describe isvc,servingruntime,pod - # max_retries=$((max_retries-1)) - # if [[ $max_retries -le 0 ]]; then - # exit 1 - # fi - # echo "-------------------" - # done - - # - name: Perform test inference (http) - # run: | - # export ISVC_URL="$(oc get isvc ${ISVC_NAME} -o jsonpath='{.status.components.predictor.url}')" - # export ISVC_HOSTNAME=$(echo $ISVC_URL | cut -d/ -f3-) - # echo "Querying ISVC at: ${ISVC_URL}" - - # # We can't query the service via hostname, we need to add the entry to /etc/hosts - # echo "127.0.0.1 ${ISVC_HOSTNAME}" | sudo tee -a /etc/hosts - - # python examples/inference.py diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 49ce02e..0000000 --- a/Dockerfile +++ /dev/null @@ -1,122 +0,0 @@ -ARG BASE_UBI_IMAGE_TAG=9.3-1612 -ARG PYTHON_VERSION=3.11 - -FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base - -ENV LANG=C.UTF-8 \ - LC_ALL=C.UTF-8 - -RUN microdnf install -y \ - python3.11-pip python3.11-wheel \ - && microdnf clean all - -WORKDIR /workspace - - -FROM base as python-base - -ARG PYTHON_VERSION - -ENV VIRTUAL_ENV=/opt/vllm -ENV PATH="$VIRTUAL_ENV/bin:$PATH" - -# hadolint ignore=DL3041 -RUN microdnf install -y \ - git \ - python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \ - python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && microdnf clean 
all - -# hadolint ignore=DL3042,DL3013 -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -U pip - -# TODO: add flash attention build - -FROM python-base AS build -ARG PYTHON_VERSION - -ENV VIRTUAL_ENV=/opt/vllm -ENV PATH="$VIRTUAL_ENV/bin:$PATH" - -COPY --from=python-base /opt/vllm /opt/vllm - -# hadolint ignore=DL3042 -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install nox==2023.4.22 # TODO: setup renovate - -COPY README.md . - -COPY pyproject.toml . -COPY noxfile.py . - -COPY src src - -# setuptools scm requires the git directory to infer version from git tags, so we bind-mount the git dir when building -RUN --mount=type=bind,source=.git,target=.git \ - --mount=type=cache,target=.nox \ - --mount=type=cache,target=/root/.cache/pip \ - nox -s build-${PYTHON_VERSION} - - -FROM base AS deploy -ARG flash_attn_version=2.5.8 -ARG cuda_version_flashattn=122 -ARG torch_version=2.1 - -WORKDIR /workspace - -COPY --from=python-base /opt/vllm /opt/vllm - -ENV VIRTUAL_ENV=/opt/vllm -ENV PATH=$VIRTUAL_ENV/bin/:$PATH - -# Triton needs a CC compiler -# hadolint ignore=DL3041 -RUN microdnf install -y gcc \ - && microdnf clean all - -ENV FLASH_ATTN_VERSION=${flash_attn_version} -ENV CUDA_VERSION_FLASHATTN=${cuda_version_flashattn} -ENV CUDA_VERSION="12.0.0" -ENV TORCH_VERSION=${torch_version} - - -# make sure that the python version in the flash-attn wheel install below matches ${PYTHON_VERSION} -# hadolint ignore=DL3042 -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,from=build,src=/workspace/dist/,target=/workspace/dist/ \ - pip install \ - https://github.com/Dao-AILab/flash-attention/releases/download/v${FLASH_ATTN_VERSION}/flash_attn-${FLASH_ATTN_VERSION}+cu${CUDA_VERSION_FLASHATTN}torch${TORCH_VERSION}cxx11abiFALSE-cp311-cp311-linux_x86_64.whl \ - dist/*whl - -# vllm requires a specific nccl version built from source distribution -# See https://github.com/NVIDIA/nccl/issues/1234 -RUN pip install \ - -v \ - --force-reinstall \ - 
--no-binary="all" \ - --no-cache-dir \ - "vllm-nccl-cu12==2.18.1.0.4.0" && \ - mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/lib/ && \ - chmod 0755 /opt/vllm/lib/libnccl.so.2.18.1 - -ENV HF_HUB_OFFLINE=1 \ - PORT=8000 \ - GRPC_PORT=8033 \ - HOME=/home/vllm \ - VLLM_NCCL_SO_PATH=/opt/vllm/lib/libnccl.so.2.18.1 \ - VLLM_USAGE_SOURCE=production-docker-image \ - VLLM_WORKER_MULTIPROC_METHOD=fork - -# setup non-root user for OpenShift -RUN microdnf install -y shadow-utils \ - && umask 002 \ - && useradd --uid 2000 --gid 0 vllm \ - && microdnf remove -y shadow-utils \ - && microdnf clean all \ - && chmod g+rwx $HOME /usr/src /workspace - -COPY LICENSE /licenses/vllm.md - -USER 2000 -CMD ["python", "-m", "vllm_tgis_adapter"] diff --git a/README.md b/README.md index e7694ec..42c33fa 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,8 @@ vLLM adapter for a TGIS-compatible grpc server. [![PyPi](https://img.shields.io/pypi/v/vllm-tgis-adapter?label=pip)](https://pypi.org/project/vllm-tgis-adapter) -[![Tests](https://github.com/dtrifiro/vllm-tgis-adapter/actions/workflows/tests.yaml/badge.svg)](https://github.com/dtrifiro/vllm-tgis-adapter/actions/workflows/tests.yaml) -[![Docker Image Build](https://github.com/dtrifiro/vllm-tgis-adapter/actions/workflows/image.yml/badge.svg)](https://github.com/dtrifiro/vllm-tgis-adapter/actions/workflows/image.yml) -[![quay.io/dtrifiro/vllm-tgis](https://img.shields.io/badge/quay.io-dtrifiro/vllm--tgis-darkred)](https://quay.io/repository/dtrifiro/vllm-tgis?tab=tags) +[![Tests](https://github.com/opendatahub-io/vllm-tgis-adapter/actions/workflows/tests.yaml/badge.svg)](https://github.com/opendatahub-io/vllm-tgis-adapter/actions/workflows/tests.yaml) +[![quay.io/opendatahub/vllm](https://img.shields.io/badge/quay.io-opendatahub/vllm-darkred)](https://quay.io/repository/opendatahub/vllm?tab=tags) ## Install @@ -49,10 +48,10 @@ bash examples/inference.sh ### Docker -Image available at 
[quay.io/dtrifiro/vllm-tgis](https://quay.io/dtrifiro/vllm-tgis?tab=tags) +Image available at [quay.io/opendatahub/vllm](https://quay.io/repository/opendatahub/vllm?tab=tags), built from [opendatahub-io/vllm](https://github.com/opendatahub-io/vllm)'s [Dockerfile.ubi](https://github.com/opendatahub-io/vllm/tree/main/Dockerfile.ubi) ```bash -docker pull quay.io/dtrifiro/vllm-tgis +docker pull quay.io/opendatahub/vllm ``` ### Inference diff --git a/pyproject.toml b/pyproject.toml index fd15591..23aafa6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,8 +37,8 @@ dependencies = [ ] [project.urls] -Issues = "https://github.com/dtrifiro/vllm_tgis_adapter/issues" -Source = "https://github.com/dtrifiro/vllm_tgis_adapter" +Issues = "https://github.com/opendatahub-io/vllm-tgis-adapter/issues" +Source = "https://github.com/opendatahub-io/vllm-tgis-adapter" [project.scripts] grpc_healthcheck = "vllm_tgis_adapter.healthcheck:cli"