skypilot-org · cblmemo · Nov 16, 2023 · Nov 16, 2023
diff --git a/llm/vicuna/README.md b/llm/vicuna/README.md
@@ -41,6 +41,11 @@ sky launch -c vicuna-serve-v100 -s serve.yaml --gpus V100
 sky launch -c vicuna-serve -s serve.yaml --env MODEL_SIZE=13
 ```
 
+5. [Optional] Serve the OpenAI API-compatible endpoint:
+```bash
+sky launch -c vicuna-openai-api -s serve-openai-api-endpoint.yaml
+```
+
 
 ## Training Vicuna with SkyPilot
 Currently, training requires GPUs with 80GB memory.  See `sky show-gpus --all` for supported GPUs.

diff --git a/llm/vicuna/serve-openai-api-endpoint.yaml b/llm/vicuna/serve-openai-api-endpoint.yaml
@@ -0,0 +1,36 @@
+resources:
+  ports: 8080  # exposed port; must match --port passed to openai_api_server in `run`
+  accelerators: A100:1  # accelerator request in name:count form
+  disk_size: 1024  # NOTE(review): presumably GiB per SkyPilot's task spec — confirm
+  disk_tier: high
+
+setup: |
+  conda activate chatbot
+  if [ $? -ne 0 ]; then  # activation failed (presumably the env does not exist yet): create it
+    conda create -n chatbot python=3.9 -y
+    conda activate chatbot
+  fi
+
+  # Install dependencies
+  pip install "fschat[model_worker,webui]==0.2.24"  # version pinned with ==
+  pip install protobuf
+
+run: |
+  conda activate chatbot
+  # Launch order: controller, then model worker, then the OpenAI API server on port 8080.
+  echo 'Starting controller...'
+  python -u -m fastchat.serve.controller > ~/controller.log 2>&1 &
+  sleep 10  # fixed pause before launching the worker
+  echo 'Starting model worker...'
+  python -u -m fastchat.serve.model_worker \
+            --model-path lmsys/vicuna-${MODEL_SIZE}b-v1.3 2>&1 \
+            | tee model_worker.log &
+  # Poll the worker log with grep directly; 2>/dev/null covers the moment before tee creates the file.
+  echo 'Waiting for model worker to start...'
+  until grep -q 'Uvicorn running on' model_worker.log 2>/dev/null; do sleep 1; done
+  # Runs in the foreground; 2>&1 so anything written to stderr also lands in the log file.
+  echo 'Starting openai api server...'
+  python -u -m fastchat.serve.openai_api_server --host 0.0.0.0 --port 8080 2>&1 | tee ~/openai_api_server.log
+
+envs:
+  MODEL_SIZE: 13  # substituted into lmsys/vicuna-${MODEL_SIZE}b-v1.3 in `run`; override with --env MODEL_SIZE=<n>
diff --git a/llm/vicuna/serve.yaml b/llm/vicuna/serve.yaml
@@ -11,7 +11,8 @@ setup: |
   fi
 
   # Install dependencies
-  pip install git+https://github.com/lm-sys/FastChat.git
+  pip install "fschat[model_worker,webui]==0.2.24"
+  pip install protobuf
 
 
 run: |