diff --git a/docs/en/llm/api_server.md b/docs/en/llm/api_server.md
index 2c281da5ad..fa92d6bca6 100644
--- a/docs/en/llm/api_server.md
+++ b/docs/en/llm/api_server.md
@@ -259,11 +259,8 @@ Following are two steps to launch multiple api servers through torchrun. Just cr
 ```python
 import os
 import socket
-from typing import List
-
+from typing import List, Literal
 import fire
-
-
 def get_host_ip():
     try:
         s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
@@ -272,22 +269,26 @@ def get_host_ip():
     finally:
         s.close()
     return ip
-
-
 def main(model_path: str,
+         tp: int = 1,
          proxy_url: str = 'http://0.0.0.0:8000',
-         port: int = 23333):
+         port: int = 23333,
+         backend: Literal['turbomind', 'pytorch'] = 'turbomind'):
     local_rank = int(os.environ.get('LOCAL_RANK', -1))
+    world_size = int(os.environ.get('WORLD_SIZE', -1))
     local_ip = get_host_ip()
     if isinstance(port, List):
-        assert len(port) == int(os.environ.get('WORLD_SIZE', -1))
+        assert len(port) == world_size
         port = port[local_rank]
     else:
         port += local_rank * 10
-    command = f'CUDA_VISIBLE_DEVICES={local_rank} lmdeploy serve api_server {model_path} --server-name {local_ip} --server-port {port} --proxy-url {proxy_url}'
-    os.system(command)
-
-
+    if (world_size - local_rank) % tp == 0:
+        rank_list = ','.join([str(local_rank + i) for i in range(tp)])
+        command = f'CUDA_VISIBLE_DEVICES={rank_list} lmdeploy serve api_server {model_path} ' \
+                  f'--server-name {local_ip} --server-port {port} --tp {tp} ' \
+                  f'--proxy-url {proxy_url} --backend {backend}'
+        print(f'running command: {command}')
+        os.system(command)
 if __name__ == '__main__':
     fire.Fire(main)
 ```
diff --git a/docs/zh_cn/llm/api_server.md b/docs/zh_cn/llm/api_server.md
index 2a415b5768..40514691a6 100644
--- a/docs/zh_cn/llm/api_server.md
+++ b/docs/zh_cn/llm/api_server.md
@@ -268,11 +268,8 @@ curl http://{server_ip}:{server_port}/v1/chat/interactive \
 ```python
 import os
 import socket
-from typing import List
-
+from typing import List, Literal
 import fire
-
-
 def get_host_ip():
     try:
         s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
@@ -281,22 +278,26 @@ def get_host_ip():
     finally:
         s.close()
     return ip
-
-
 def main(model_path: str,
+         tp: int = 1,
          proxy_url: str = 'http://0.0.0.0:8000',
-         port: int = 23333):
+         port: int = 23333,
+         backend: Literal['turbomind', 'pytorch'] = 'turbomind'):
     local_rank = int(os.environ.get('LOCAL_RANK', -1))
+    world_size = int(os.environ.get('WORLD_SIZE', -1))
     local_ip = get_host_ip()
     if isinstance(port, List):
-        assert len(port) == int(os.environ.get('WORLD_SIZE', -1))
+        assert len(port) == world_size
        port = port[local_rank]
     else:
         port += local_rank * 10
-    command = f'CUDA_VISIBLE_DEVICES={local_rank} lmdeploy serve api_server {model_path} --server-name {local_ip} --server-port {port} --proxy-url {proxy_url}'
-    os.system(command)
-
-
+    if (world_size - local_rank) % tp == 0:
+        rank_list = ','.join([str(local_rank + i) for i in range(tp)])
+        command = f'CUDA_VISIBLE_DEVICES={rank_list} lmdeploy serve api_server {model_path} ' \
+                  f'--server-name {local_ip} --server-port {port} --tp {tp} ' \
+                  f'--proxy-url {proxy_url} --backend {backend}'
+        print(f'running command: {command}')
+        os.system(command)
 if __name__ == '__main__':
     fire.Fire(main)
 ```
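
For reference, a minimal launch sketch for the updated script, under these assumptions: the script is saved as `launch_server.py` (a hypothetical name), a `lmdeploy serve proxy` instance is started first at the default `proxy_url`, and the node has 4 GPUs; the model path is a placeholder.

```shell
# Assumption: start the proxy that the api servers will register with.
lmdeploy serve proxy --server-name 0.0.0.0 --server-port 8000

# torchrun spawns one process per GPU; with 4 GPUs and --tp 2, only
# ranks 0 and 2 pass the (world_size - local_rank) % tp == 0 check,
# so two api servers start on ports 23333 and 23353 respectively.
torchrun --nproc_per_node 4 launch_server.py \
    --model_path internlm/internlm2-chat-7b \
    --tp 2 \
    --proxy_url http://0.0.0.0:8000 \
    --backend turbomind
```

Each selected rank pins its server to `tp` consecutive GPUs via `CUDA_VISIBLE_DEVICES` and offsets its port by `local_rank * 10`, which keeps the servers from colliding on devices or ports while the proxy load-balances across them.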