From 1e5246dac657330d0c4c561bb0526ad351cbafa1 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 26 Apr 2024 13:14:17 -0700 Subject: [PATCH] [Examples] Add qwen-110B and fix GUI (#3489) * Add qwen-110B * Add qwen-110B * fix GUI * minor * minor * use vllm gradio * Add gif and to ai gallery * add new label * add llama3 and qwen to readmes --- README.md | 2 + docs/source/_gallery_original/index.rst | 1 + docs/source/_gallery_original/llms/qwen.md | 1 + docs/source/_static/custom.js | 1 + docs/source/docs/index.rst | 1 + llm/qwen/README.md | 23 +++++++----- llm/qwen/gui.yaml | 27 +++++--------- llm/qwen/serve-110b.yaml | 43 ++++++++++++++++++++++ 8 files changed, 71 insertions(+), 28 deletions(-) create mode 120000 docs/source/_gallery_original/llms/qwen.md create mode 100644 llm/qwen/serve-110b.yaml diff --git a/README.md b/README.md index 5708a660f0e..07eff4e0104 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ ---- :fire: *News* :fire: - [Apr, 2024] Serve and finetune [**Llama 3**](https://skypilot.readthedocs.io/en/latest/gallery/llms/llama-3.html) on any cloud or Kubernetes: [**example**](./llm/llama-3/) +- [Apr, 2024] Serve [**Qwen-110B**](https://qwenlm.github.io/blog/qwen1.5-110b/) on your infra: [**example**](./llm/qwen/) - [Apr, 2024] Using [**Ollama**](https://github.com/ollama/ollama) to deploy quantized LLMs on CPUs and GPUs: [**example**](./llm/ollama/) - [Mar, 2024] Serve and deploy [**Databricks DBRX**](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) on your infra: [**example**](./llm/dbrx/) - [Feb, 2024] Deploying and scaling [**Gemma**](https://blog.google/technology/developers/gemma-open-models/) with SkyServe: [**example**](./llm/gemma/) @@ -153,6 +154,7 @@ To learn more, see our [Documentation](https://skypilot.readthedocs.io/en/latest Runnable examples: - LLMs on SkyPilot - [Llama 3](./llm/llama-3/) + - [Qwen](./llm/qwen/) - [Databricks DBRX](./llm/dbrx/) - [Gemma](./llm/gemma/) - [Mixtral 8x7B](./llm/mixtral/); [Mistral 7B](https://docs.mistral.ai/self-deployment/skypilot/) (from official Mistral team) diff --git a/docs/source/_gallery_original/index.rst b/docs/source/_gallery_original/index.rst index 61fb9c342b8..67f4eef11dc 100644 --- a/docs/source/_gallery_original/index.rst +++ b/docs/source/_gallery_original/index.rst @@ -37,6 +37,7 @@ Contents DBRX (Databricks) Llama-2 (Meta) Llama-3 (Meta) + Qwen (Alibaba) CodeLlama (Meta) Gemma (Google) diff --git a/docs/source/_gallery_original/llms/qwen.md b/docs/source/_gallery_original/llms/qwen.md new file mode 120000 index 00000000000..2a6d513503f --- /dev/null +++ b/docs/source/_gallery_original/llms/qwen.md @@ -0,0 +1 @@ +../../../../llm/qwen/README.md \ No newline at end of file diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 26b55b6e5e4..00683c82f6b 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -30,6 +30,7 @@ document.addEventListener('DOMContentLoaded', () => { { selector: '.toctree-l1 > a', text: 'DBRX (Databricks)' }, { selector: '.toctree-l1 > a', text: 'Ollama' }, { selector: '.toctree-l1 > a', text: 'Llama-3 (Meta)' }, + { selector: '.toctree-l1 > a', text: 'Qwen (Alibaba)' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index c73ebc59bb7..412e3284372 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -70,6 +70,7 @@ Runnable examples: * **LLMs on SkyPilot** * 
`Llama 3 `_ + * `Qwen `_ * `Databricks DBRX `_ * `Gemma `_ * `Mixtral 8x7B `_; `Mistral 7B `_ (from official Mistral team) diff --git a/llm/qwen/README.md b/llm/qwen/README.md index dac1852ff79..6ab9bb22ffc 100644 --- a/llm/qwen/README.md +++ b/llm/qwen/README.md @@ -3,6 +3,11 @@ [Qwen1.5](https://github.com/QwenLM/Qwen1.5) is one of the top open LLMs. As of Feb 2024, Qwen1.5-72B-Chat is ranked higher than Mixtral-8x7b-Instruct-v0.1 on the LMSYS Chatbot Arena Leaderboard. +📰 **Update (26 April 2024) -** SkyPilot now also supports the [**Qwen1.5-110B**](https://qwenlm.github.io/blog/qwen1.5-110b/) model! It performs competitively with Llama-3-70B across a [series of evaluations](https://qwenlm.github.io/blog/qwen1.5-110b/#model-quality). Use [serve-110b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/qwen/serve-110b.yaml) to serve the 110B model. + +

+<p align="center"><img alt="qwen" /></p>
## References * [Qwen docs](https://qwen.readthedocs.io/en/latest/) @@ -20,10 +25,10 @@ As of Feb 2024, Qwen1.5-72B-Chat is ranked higher than Mixtral-8x7b-Instruct-v0. After [installing SkyPilot](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html), run your own Qwen model on vLLM with SkyPilot in 1-click: -1. Start serving Qwen 72B on a single instance with any available GPU in the list specified in [serve-72b.yaml](serve-72b.yaml) with a vLLM powered OpenAI-compatible endpoint (You can also switch to [serve-7b.yaml](serve-7b.yaml) for a smaller model): +1. Start serving Qwen 110B on a single instance with any available GPU in the list specified in [serve-110b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/qwen/serve-110b.yaml) with a vLLM powered OpenAI-compatible endpoint (You can also switch to [serve-72b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/qwen/serve-72b.yaml) or [serve-7b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/qwen/serve-7b.yaml) for a smaller model): ```console -sky launch -c qwen serve-72b.yaml +sky launch -c qwen serve-110b.yaml ``` 2. Send a request to the endpoint for completion: ```bash @@ -32,7 +37,7 @@ IP=$(sky status --ip qwen) curl -L http://$IP:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "Qwen/Qwen1.5-72B-Chat", + "model": "Qwen/Qwen1.5-110B-Chat", "prompt": "My favorite food is", "max_tokens": 512 }' | jq -r '.choices[0].text' @@ -43,7 +48,7 @@ curl -L http://$IP:8000/v1/completions \ curl -L http://$IP:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "Qwen/Qwen1.5-72B-Chat", + "model": "Qwen/Qwen1.5-110B-Chat", "messages": [ { "role": "system", @@ -110,13 +115,13 @@ curl -L http://$ENDPOINT/v1/chat/completions \ ``` -## **Optional:** Accessing Code Llama with Chat GUI +## **Optional:** Accessing Qwen with Chat GUI -It is also possible to access the Code Llama service with a GUI using [FastChat](https://github.com/lm-sys/FastChat). +It is also possible to access the Qwen service with a GUI using [vLLM](https://github.com/vllm-project/vllm). -1. Start the chat web UI: +1. Start the chat web UI (change the `--env` flag to the model you are running): ```bash -sky launch -c qwen-gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint qwen) +sky launch -c qwen-gui ./gui.yaml --env MODEL_NAME='Qwen/Qwen1.5-72B-Chat' --env ENDPOINT=$(sky serve status --endpoint qwen) ``` 2. Then, we can access the GUI at the returned gradio link: @@ -124,5 +129,3 @@ sky launch -c qwen-gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint q | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live ``` -Note that you may get better results to use a higher temperature and top_p value. - diff --git a/llm/qwen/gui.yaml b/llm/qwen/gui.yaml index df4fc6665d9..bf3c01abf76 100644 --- a/llm/qwen/gui.yaml +++ b/llm/qwen/gui.yaml @@ -13,7 +13,8 @@ # you can click on it to open the GUI. envs: - ENDPOINT: x.x.x.x:3031 # Address of the API server running qwen. + ENDPOINT: x.x.x.x:3031 # Address of the API server running qwen. + MODEL_NAME: Qwen/Qwen1.5-72B-Chat resources: cpus: 2 @@ -25,8 +26,8 @@ setup: | conda activate qwen fi - pip install "fschat[model_worker,webui]" - pip install "openai<1" + # Install Gradio for web UI. 
+ pip install gradio openai run: | conda activate qwen @@ -35,19 +36,9 @@ run: | CONTROLLER_PORT=21001 WORKER_PORT=21002 - cat < ~/model_info.json - { - "Qwen/Qwen1.5-72B-Chat": { - "model_name": "Qwen/Qwen1.5-72B-Chat", - "api_base": "http://${ENDPOINT}/v1", - "api_key": "empty", - "model_path": "Qwen/Qwen1.5-72B-Chat" - } - } - EOF - - python3 -m fastchat.serve.controller --host 0.0.0.0 --port ${CONTROLLER_PORT} > ~/controller.log 2>&1 & - echo 'Starting gradio server...' - python -u -m fastchat.serve.gradio_web_server --share \ - --register ~/model_info.json | tee ~/gradio.log + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://$ENDPOINT/v1 | tee ~/gradio.log diff --git a/llm/qwen/serve-110b.yaml b/llm/qwen/serve-110b.yaml new file mode 100644 index 00000000000..857f37370b4 --- /dev/null +++ b/llm/qwen/serve-110b.yaml @@ -0,0 +1,43 @@ +envs: + MODEL_NAME: Qwen/Qwen1.5-110B-Chat + +service: + # Specifying the path to the endpoint to check the readiness of the replicas. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + initial_delay_seconds: 1200 + # How many replicas to manage. + replicas: 2 + + +resources: + accelerators: {A100:8, A100-80GB:4, A100-80GB:8} + disk_size: 1024 + disk_tier: best + memory: 32+ + ports: 8000 + +setup: | + conda activate qwen + if [ $? -ne 0 ]; then + conda create -n qwen python=3.10 -y + conda activate qwen + fi + pip install -U vllm==0.4.1 + pip install -U transformers==4.38.0 + +run: | + conda activate qwen + export PATH=$PATH:/sbin + python -u -m vllm.entrypoints.openai.api_server \ + --host 0.0.0.0 \ + --model $MODEL_NAME \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + --max-num-seqs 16 | tee ~/openai_api_server.log +
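
For readers trying out the new `serve-110b.yaml`, the vLLM OpenAI-compatible endpoint it exposes on port 8000 can also be queried from Python. The following is a minimal illustrative sketch, not part of the diff above: it assumes the `qwen` cluster launched per the README, a local `openai>=1` client, and that you substitute the IP returned by `sky status --ip qwen`.

```python
# Minimal sketch: query the vLLM OpenAI-compatible server started by serve-110b.yaml.
# Assumes `pip install "openai>=1"` locally and an endpoint IP from `sky status --ip qwen`.
from openai import OpenAI

ENDPOINT_IP = "x.x.x.x"  # replace with the output of `sky status --ip qwen`

client = OpenAI(base_url=f"http://{ENDPOINT_IP}:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen1.5-110B-Chat",  # matches MODEL_NAME in serve-110b.yaml
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who are you?"},
    ],
    max_tokens=256,
)
print(response.choices[0].message.content)
```

The same call should work against a SkyServe deployment by swapping the base URL for the address returned by `sky serve status --endpoint qwen`, which already includes the port.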