# Add example of helm chart for vllm deployment on k8s #190
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Lints the Helm chart under examples/chart-helm on every pull request, then
# installs it into a kind cluster and sends completion requests to the service.
# The install uses a locally built CPU image and a MinIO container that holds
# the facebook/opt-125m model files in an S3 bucket.
name: Lint and Test Charts

on: pull_request

jobs:
  lint-test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          # Fetch the full history rather than a shallow clone; presumably
          # needed so `ct lint --target-branch` can compare against the base.
          fetch-depth: 0

      - name: Set up Helm
        # NOTE(review): ref restored from a redacted "[email protected]"
        # placeholder; confirm the tag (v4.2.0 assumed).
        uses: azure/setup-helm@v4.2.0
        with:
          version: v3.14.4

      # Python is required because ct lint runs Yamale and yamllint which require Python.
      - uses: actions/setup-python@v3
        with:
          # Quoted so the version is always read as a string, never a float.
          python-version: '3.7'

      - name: Set up chart-testing
        # NOTE(review): ref restored from a redacted "[email protected]"
        # placeholder; confirm the tag (v2.6.1 assumed).
        uses: helm/chart-testing-action@v2.6.1
        with:
          version: v3.10.1

      - name: Run chart-testing (lint)
        run: >-
          ct lint
          --target-branch ${{ github.event.repository.default_branch }}
          --chart-dirs examples/chart-helm
          --charts examples/chart-helm

      # Starts a MinIO container on a dedicated Docker network, downloads the
      # opt-125m model files and uploads them to s3://testbucket/opt-125m.
      - name: Setup minio
        run: |
          docker network create vllm-net
          docker run -d -p 9000:9000 --name minio --net vllm-net \
            -e "MINIO_ACCESS_KEY=minioadmin" \
            -e "MINIO_SECRET_KEY=minioadmin" \
            -v /tmp/data:/data \
            -v /tmp/config:/root/.minio \
            minio/minio server /data
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          export AWS_EC2_METADATA_DISABLED=true
          mkdir opt-125m
          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive

      - name: Create kind cluster
        # NOTE(review): ref restored from a redacted "[email protected]"
        # placeholder; confirm the tag (v1.10.0 assumed).
        uses: helm/kind-action@v1.10.0

      - name: Build the Docker image vllm cpu
        run: >-
          docker buildx build
          --file Dockerfile.cpu
          --tag vllm-cpu-env
          --build-arg VLLM_CPU_DISABLE_AVX512="true"
          .

      # Loads the images into the kind cluster named "chart-testing", attaches
      # the control-plane container to the MinIO network (presumably so pods
      # can resolve http://minio:9000), and creates the target namespace.
      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
          docker pull amazon/aws-cli:2.6.4
          kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
          kind load docker-image vllm-cpu-env:latest --name chart-testing
          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
          kubectl create ns ns-vllm

      # Installs the chart as release "test-vllm" with CPU-only overrides
      # (GPU resources set to 0, locally built image, MinIO as the S3 endpoint).
      - name: Run chart-testing (install)
        run: |
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          helm install --wait --wait-for-jobs --timeout 5m0s --debug \
            --create-namespace --namespace=ns-vllm \
            test-vllm examples/chart-helm \
            -f examples/chart-helm/values.yaml \
            --set secrets.s3endpoint=http://minio:9000 \
            --set secrets.s3bucketname=testbucket \
            --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID \
            --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY \
            --set resources.requests.cpu=1 \
            --set resources.requests.memory=4Gi \
            --set resources.limits.cpu=2 \
            --set resources.limits.memory=5Gi \
            --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE \
            --set image.env[1].name=DTYPE \
            --set image.env[2].name=VLLM_RPC_TIMEOUT \
            --set image.env[3].name=VLLM_LOGGING_LEVEL \
            --set image.env[4].name=ONEDNN_VERBOSE \
            --set-string image.env[0].value="1" \
            --set image.env[1].value=bfloat16 \
            --set-string image.env[2].value="100000" \
            --set-string image.env[3].value="DEBUG" \
            --set image.env[4].value=all \
            --set-string extraInit.s3modelpath="opt-125m/" \
            --set-string 'resources.limits.nvidia\.com/gpu=0' \
            --set-string 'resources.requests.nvidia\.com/gpu=0' \
            --set-string image.repository="vllm-cpu-env"

      # Port-forwards the service to localhost:8001, streams the pod logs in
      # the background and sends two completion requests.
      # NOTE(review): in the second request the ":$CODE" suffix expands a
      # variable that is not set earlier in this script, so the echoed value
      # is the response body followed by a bare ":"; confirm the intent.
      - name: curl test
        run: |
          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
          sleep 10
          (kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods -o name | awk '/deployment/ {print $1;exit}')) &
          curl -v --fail-with-body --show-error http://localhost:8001/v1/completions \
            --header "Content-Type: application/json" \
            --data '{
              "model": "opt-125m",
              "prompt": "San Francisco is a",
              "max_tokens": 7,
              "temperature": 0
            }'
          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
            --header "Content-Type: application/json" \
            --data '{
              "model": "opt-125m",
              "prompt": "San Francisco is a",
              "max_tokens": 7,
              "temperature": 0
            }'):$CODE"
          echo "$CODE"