update base_model deploy example (#2803)
Jintao-Huang authored Dec 30, 2024
1 parent 132adc7 commit 0af291d
Showing 7 changed files with 77 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -139,6 +139,7 @@ my_model/
 /data
 result/
 images
+/custom/
 
 # Pytorch
 *.pth
41 changes: 41 additions & 0 deletions examples/deploy/client/llm/base/openai_client.py
@@ -0,0 +1,41 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os

from openai import OpenAI

os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # restrict the demo server spawned below to GPU 0


def infer(client, model: str, messages):
    query = messages[0]['content']
    print(f'query: {query}')
    resp = client.completions.create(model=model, prompt=query, max_tokens=64, temperature=0)
    response = resp.choices[0].text
    print(f'response: {response}')
    # Alternatively, use the chat endpoint (the two call styles are equivalent here):
    resp = client.chat.completions.create(model=model, messages=messages, max_tokens=64, temperature=0)
    response = resp.choices[0].message.content
    print(f'response: {response}')
    return response


def run_client(host: str = '127.0.0.1', port: int = 8000):
    client = OpenAI(
        api_key='EMPTY',
        base_url=f'http://{host}:{port}/v1',
    )
    model = client.models.list().data[0].id
    print(f'model: {model}')

    messages = [{'role': 'user', 'content': '浙江 -> 杭州\n安徽 -> 合肥\n四川 ->'}]
    infer(client, model, messages)


if __name__ == '__main__':
    from swift.llm import run_deploy, DeployArguments
    # NOTE: run_deploy spawns a demo server; in a real deployment, comment out
    # this context manager and point run_client at your already-running server.
    with run_deploy(
            DeployArguments(
                model='Qwen/Qwen2.5-1.5B', verbose=False, log_interval=-1, infer_backend='pt',
                use_chat_template=False)) as port:
        run_client(port=port)
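The client above blocks until the full completion returns. Since the endpoint is OpenAI-compatible, the same prompt can also be streamed token by token; a minimal sketch, assuming a server already running on the example's default host and port (streaming support depends on the deployed backend):

# Hedged streaming variant of infer() above; host/port are assumptions
# matching the example's defaults, not part of this commit.
from openai import OpenAI

client = OpenAI(api_key='EMPTY', base_url='http://127.0.0.1:8000/v1')
model = client.models.list().data[0].id
stream = client.completions.create(
    model=model,
    prompt='浙江 -> 杭州\n安徽 -> 合肥\n四川 ->',
    max_tokens=64,
    temperature=0,
    stream=True)  # request incremental chunks instead of one blocking response
for chunk in stream:
    print(chunk.choices[0].text, end='', flush=True)
print()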
33 changes: 33 additions & 0 deletions examples/deploy/client/llm/base/swift_client.py
@@ -0,0 +1,33 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import List

os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # restrict the demo server spawned below to GPU 0


def infer_batch(engine: 'InferEngine', infer_requests: List['InferRequest']):
    request_config = RequestConfig(max_tokens=64, temperature=0)

    resp_list = engine.infer(infer_requests, request_config)

    query0 = infer_requests[0].messages[0]['content']
    print(f'query0: {query0}')
    print(f'response0: {resp_list[0].choices[0].message.content}')


def run_client(host: str = '127.0.0.1', port: int = 8000):
    engine = InferClient(host=host, port=port)
    print(f'models: {engine.models}')

    infer_requests = [InferRequest(messages=[{'role': 'user', 'content': '浙江 -> 杭州\n安徽 -> 合肥\n四川 ->'}])]
    infer_batch(engine, infer_requests)


if __name__ == '__main__':
    from swift.llm import InferEngine, InferRequest, InferClient, RequestConfig, run_deploy, DeployArguments
    # NOTE: run_deploy spawns a demo server; in a real deployment, comment out
    # this context manager and point run_client at your already-running server.
    with run_deploy(
            DeployArguments(
                model='Qwen/Qwen2.5-1.5B', verbose=False, log_interval=-1, infer_backend='pt',
                use_chat_template=False)) as port:
        run_client(port=port)
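Because engine.infer takes a list of requests, the pattern above extends directly to batches, with responses returned in request order (the file itself indexes resp_list[0] against infer_requests[0]). A minimal sketch using only the names imported in this file; the extra prompts are illustrative:

# Hedged batching sketch; assumes the demo server from this example is running.
from swift.llm import InferClient, InferRequest, RequestConfig

def infer_many(host: str = '127.0.0.1', port: int = 8000):
    engine = InferClient(host=host, port=port)
    request_config = RequestConfig(max_tokens=64, temperature=0)
    infer_requests = [
        InferRequest(messages=[{'role': 'user', 'content': '浙江 -> 杭州\n广东 ->'}]),
        InferRequest(messages=[{'role': 'user', 'content': '1 + 1 = 2\n2 + 2 ='}]),
    ]
    # One call sends the whole batch; resp_list is aligned with infer_requests.
    resp_list = engine.infer(infer_requests, request_config)
    for req, resp in zip(infer_requests, resp_list):
        print(f'query: {req.messages[0]["content"]!r}')
        print(f'response: {resp.choices[0].message.content!r}')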
@@ -7,7 +7,7 @@


 def infer(client, model: str, messages):
-    resp = client.chat.completions.create(model=model, messages=messages, temperature=0)
+    resp = client.chat.completions.create(model=model, messages=messages, max_tokens=512, temperature=0)
     query = messages[0]['content']
     response = resp.choices[0].message.content
     print(f'query: {query}')
@@ -52,7 +52,6 @@ def run_client(host: str = '127.0.0.1', port: int = 8000):
     from swift.llm import (InferEngine, InferRequest, InferClient, RequestConfig, load_dataset, run_deploy,
                            DeployArguments)
     from swift.plugin import InferStats
-    # TODO: The current 'pt' deployment does not support automatic batch.
     # NOTE: In a real deployment scenario, please comment out the context of run_deploy.
     with run_deploy(
             DeployArguments(model='Qwen/Qwen2.5-1.5B-Instruct', verbose=False, log_interval=-1,
2 changes: 1 addition & 1 deletion examples/deploy/client/mllm/openai_client.py
@@ -8,7 +8,7 @@


 def infer(client, model: str, messages):
-    resp = client.chat.completions.create(model=model, messages=messages, temperature=0)
+    resp = client.chat.completions.create(model=model, messages=messages, max_tokens=512, temperature=0)
     query = messages[0]['content']
     response = resp.choices[0].message.content
     print(f'query: {query}')
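For the multimodal client, only the message content changes: it mixes image and text parts. A hedged sketch of one such message in the standard OpenAI vision format; the image URL is a placeholder, and the exact content schema the server accepts is an assumption, not taken from this diff:

# Hypothetical multimodal request body; URL and schema are illustrative only.
messages = [{
    'role': 'user',
    'content': [
        {'type': 'image_url', 'image_url': {'url': 'https://example.com/demo.png'}},
        {'type': 'text', 'text': 'Describe the image.'},
    ],
}]
# Then call infer(client, model, messages) exactly as in the diff above.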
1 change: 0 additions & 1 deletion examples/deploy/client/mllm/swift_client.py
@@ -117,7 +117,6 @@ def run_client(host: str = '127.0.0.1', port: int = 8000):
     from swift.llm import (InferEngine, InferRequest, InferClient, RequestConfig, load_dataset, run_deploy,
                            DeployArguments)
     from swift.plugin import InferStats
-    # TODO: The current 'pt' deployment does not support automatic batch.
     # NOTE: In a real deployment scenario, please comment out the context of run_deploy.
     with run_deploy(
             DeployArguments(model='Qwen/Qwen2-VL-2B-Instruct', verbose=False, log_interval=-1,
