From afdbd51782e1208f813dd2a671685211a275db57 Mon Sep 17 00:00:00 2001
From: Yuan Zhou
Date: Thu, 10 Oct 2024 13:27:03 +0800
Subject: [PATCH] refine nginx doc

Signed-off-by: Yuan Zhou
---
 docs/source/getting_started/cpu-installation.rst   |  2 +-
 docs/source/getting_started/nginx-loadbalancer.rst | 14 +++-----------
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst
index 1e157c57e009f..bd60af69796aa 100644
--- a/docs/source/getting_started/cpu-installation.rst
+++ b/docs/source/getting_started/cpu-installation.rst
@@ -162,4 +162,4 @@ CPU Backend Considerations

     $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp

-* Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like `Nginx `_ or HAProxy are recommended. Anyscale Ray project provides the feature on LLM `serving `_. Here is the example to setup a scalable LLM serving with `Ray Serve `_.
\ No newline at end of file
+* Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like `Nginx `_ or HAProxy are recommended. Anyscale Ray project provides the feature on LLM `serving `_. Here is the example to setup a scalable LLM serving with `Ray Serve `_.
\ No newline at end of file
diff --git a/docs/source/getting_started/nginx-loadbalancer.rst b/docs/source/getting_started/nginx-loadbalancer.rst
index 0362973fe8ae2..4b7dd1fd8447d 100644
--- a/docs/source/getting_started/nginx-loadbalancer.rst
+++ b/docs/source/getting_started/nginx-loadbalancer.rst
@@ -30,9 +30,6 @@ Create a file named ``Dockerfile.nginx``:

 .. code-block:: console

-    # Copyright (C) 2024 Intel Corporation
-    # SPDX-License-Identifier: Apache-2.0
-
     FROM nginx:latest
     RUN rm /etc/nginx/conf.d/default.conf
     EXPOSE 80
@@ -74,15 +71,9 @@ Create a file named ``nginx_conf/nginx.conf``. Note that you can add as many ser
 Build vLLM Container
 --------------------

-Notes:
-
-* Adjust the model name that you want to use in your vLLM servers if you don't want to use ``Llama-2-7b-hf``.
-
 .. code-block:: console

     cd $vllm_root
-    model=meta-llama/Llama-2-7b-hf
-    sed -i "s|ENTRYPOINT \[\"python3\", \"-m\", \"vllm.entrypoints.openai.api_server\"\]|ENTRYPOINT [\"python3\", \"-m\", \"vllm.entrypoints.openai.api_server\", \"--model\", \"$model\"]|" Dockerfile.cpu
     docker build -f Dockerfile.cpu . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy

 .. _nginxloadbalancer_nginx_docker_network:
@@ -105,6 +96,7 @@ Notes:
 * If you have your HuggingFace models cached somewhere else, update ``hf_cache_dir`` below.
 * If you don't have an existing HuggingFace cache you will want to start ``vllm0`` and wait for the model to complete downloading and the server to be ready. This will ensure that ``vllm1`` can leverage the model you just downloaded and it won't have to be downloaded again.
 * The below example assumes a machine where socket 0 has cores 0-47 and socket 1 has cores 48-95. Adjust as needed for your application.
+* Adjust the model name that you want to use in your vLLM servers if you don't want to use ``Llama-2-7b-chat-hf``.

 .. code-block:: console

@@ -112,8 +104,8 @@ Notes:
     hf_cache_dir=~/.cache/huggingface/
     SVR_0_CORES=0-47
     SVR_1_CORES=48-95
-    docker run -itd --ipc host --privileged --network vllm_nginx --cap-add=SYS_ADMIN --shm-size=10.24gb -e VLLM_CPU_KVCACHE_SPACE=40 -e VLLM_CPU_OMP_THREADS_BIND=$SVR_0_CORES -e http_proxy=$http_proxy -e https_proxy=$https_proxy -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm
-    docker run -itd --ipc host --privileged --network vllm_nginx --cap-add=SYS_ADMIN --shm-size=10.24gb -e VLLM_CPU_KVCACHE_SPACE=40 -e VLLM_CPU_OMP_THREADS_BIND=$SVR_1_CORES -e http_proxy=$http_proxy -e https_proxy=$https_proxy -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm
+    docker run -itd --ipc host --privileged --network vllm_nginx --cap-add=SYS_ADMIN --shm-size=10.24gb -e VLLM_CPU_KVCACHE_SPACE=40 -e VLLM_CPU_OMP_THREADS_BIND=$SVR_0_CORES -e http_proxy=$http_proxy -e https_proxy=$https_proxy -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf
+    docker run -itd --ipc host --privileged --network vllm_nginx --cap-add=SYS_ADMIN --shm-size=10.24gb -e VLLM_CPU_KVCACHE_SPACE=40 -e VLLM_CPU_OMP_THREADS_BIND=$SVR_1_CORES -e http_proxy=$http_proxy -e https_proxy=$https_proxy -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf

 .. _nginxloadbalancer_nginx_launch_nginx:
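
The two ``docker run`` commands above publish the backends on host ports 8081 and 8082, so each one can be sanity-checked directly before nginx is placed in front of them. Below is a minimal sketch, assuming the port mappings and the ``meta-llama/Llama-2-7b-chat-hf`` model name used in the patch; it relies on the OpenAI-compatible ``/v1/models`` and ``/v1/completions`` endpoints that the vLLM API server exposes.

.. code-block:: console

    # confirm each backend is up and reports the served model
    curl http://localhost:8081/v1/models
    curl http://localhost:8082/v1/models

    # illustrative completion request against one backend; adjust the model
    # name here if you changed it in the docker run commands above
    curl http://localhost:8081/v1/completions \
        -H "Content-Type: application/json" \
        -d '{"model": "meta-llama/Llama-2-7b-chat-hf", "prompt": "Hello", "max_tokens": 16}'

Once both backends respond, the nginx container covered in the following "Launch Nginx" section can balance requests across them.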