diff --git a/qwen/qwen-110B-chat/config.yaml b/qwen/qwen-110B-chat/config.yaml
new file mode 100644
index 00000000..ed7daef4
--- /dev/null
+++ b/qwen/qwen-110B-chat/config.yaml
@@ -0,0 +1,16 @@
+external_package_dirs: []
+model_metadata:
+  example_model_input: {"prompt": "How long would it take to reach the sun?"}
+model_name: Qwen1.5-vllm-streaming
+python_version: py310
+requirements:
+- torch==2.2.1
+- transformers==4.40.0
+- vllm==0.4.1
+- asyncio==3.4.3
+- ray
+resources:
+  accelerator: A100:4
+  use_gpu: true
+secrets: {}
+system_packages: []
diff --git a/qwen/qwen-110B-chat/model/__init__.py b/qwen/qwen-110B-chat/model/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/qwen/qwen-110B-chat/model/model.py b/qwen/qwen-110B-chat/model/model.py
new file mode 100644
index 00000000..a58fcd07
--- /dev/null
+++ b/qwen/qwen-110B-chat/model/model.py
@@ -0,0 +1,65 @@
+import subprocess
+import uuid
+from transformers import AutoTokenizer
+
+from vllm import SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+
+
+class Model:
+    def __init__(self, model_name="Qwen/Qwen1.5-110B-Chat"):
+        self.model_name = model_name
+        self.tokenizer = None
+        self.sampling_params = None
+
+        # vLLM uses Ray for multi-GPU tensor parallelism; start a local head node.
+        command = "ray start --head"
+        subprocess.check_output(command, shell=True, text=True)
+
+    def load(self):
+        self.model_args = AsyncEngineArgs(
+            model=self.model_name,
+            dtype="auto",
+            enforce_eager=True,  # skip CUDA graph capture
+            tensor_parallel_size=4,  # shard across the four A100s requested in config.yaml
+        )
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        self.sampling_params = SamplingParams(  # Qwen1.5's default generation settings
+            temperature=0.7,
+            top_p=0.8,
+            repetition_penalty=1.05,
+            max_tokens=512,
+        )
+
+        self.llm_engine = AsyncLLMEngine.from_engine_args(self.model_args)
+
+    async def predict(self, model_input):
+        message = model_input.pop("prompt")
+
+        prompt = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": message},
+        ]
+
+        text = self.tokenizer.apply_chat_template(
+            prompt,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+        idx = uuid.uuid4().hex  # unique request id for the async engine
+        vllm_generator = self.llm_engine.generate(text, self.sampling_params, idx)
+
+        async def generator():
+            full_text = ""
+            async for output in vllm_generator:
+                # vLLM yields the full text generated so far; emit only the new delta.
+                output_text = output.outputs[0].text
+                delta = output_text[len(full_text):]
+                full_text = output_text
+                yield delta
+
+        return generator()
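
A minimal local smoke test of the streaming predict() method added above could look like the sketch below. It is not part of the diff: it assumes the requirements from config.yaml are installed, four GPUs are visible, and that it is run from qwen/qwen-110B-chat/ so that model.model resolves; in a real deployment the serving runtime, not this script, calls load() and predict().

import asyncio

from model.model import Model


async def main():
    # Construct and load the model the same way the serving runtime would.
    model = Model()
    model.load()

    # predict() returns an async generator of text deltas; print them as they arrive.
    stream = await model.predict({"prompt": "How long would it take to reach the sun?"})
    async for delta in stream:
        print(delta, end="", flush=True)


if __name__ == "__main__":
    asyncio.run(main())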