fix swift deploy log error (repeat log) (#2808)
Jintao-Huang authored Dec 30, 2024
1 parent 9b078a1 commit 307dd05
Showing 12 changed files with 42 additions and 38 deletions.
examples/deploy/client/llm/chat/swift_client.py (4 changes: 2 additions & 2 deletions)
@@ -38,8 +38,8 @@ def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
 def run_client(host: str = '127.0.0.1', port: int = 8000):
     engine = InferClient(host=host, port=port)
     print(f'models: {engine.models}')
-
-    dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], strict=False, seed=42)[0]
+    # Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset.
+    dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], seed=42)[0]
     print(f'dataset: {dataset}')
     infer_requests = [InferRequest(**data) for data in dataset]
     infer_batch(engine, infer_requests)
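Note: as the added comment says, `load_dataset` is only a convenience here; `infer_batch` just needs a list of `InferRequest` objects. A minimal sketch of building the requests by hand (the host, port, queries, and sampling settings below are illustrative assumptions, not part of this commit):

from swift.llm import InferClient, InferRequest, RequestConfig

engine = InferClient(host='127.0.0.1', port=8000)
# Build requests directly instead of sampling them from a dataset.
infer_requests = [
    InferRequest(messages=[{'role': 'user', 'content': query}])
    for query in ['What is machine learning?', 'Explain LoRA in one sentence.']
]
resp_list = engine.infer(infer_requests, RequestConfig(max_tokens=256, temperature=0))
for resp in resp_list:
    print(resp.choices[0].message.content)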
examples/deploy/client/mllm/swift_client.py (3 changes: 2 additions & 1 deletion)
@@ -103,7 +103,8 @@ def get_data(mm_type: Literal['text', 'image', 'video', 'audio']):
 def run_client(host: str = '127.0.0.1', port: int = 8000):
     engine = InferClient(host=host, port=port)
     print(f'models: {engine.models}')
-    dataset = load_dataset(['AI-ModelScope/LaTeX_OCR:small#1000'], strict=False, seed=42)[0]
+    # Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset.
+    dataset = load_dataset(['AI-ModelScope/LaTeX_OCR:small#1000'], seed=42)[0]
     print(f'dataset: {dataset}')
     infer_requests = [InferRequest(**data) for data in dataset]
     infer_batch(engine, infer_requests)
examples/infer/demo.py (3 changes: 2 additions & 1 deletion)
@@ -43,7 +43,8 @@ def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
         from swift.llm import LmdeployEngine
         engine = LmdeployEngine(model)
 
-    dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], strict=False, seed=42)[0]
+    # Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset.
+    dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], seed=42)[0]
     print(f'dataset: {dataset}')
     infer_requests = [InferRequest(**data) for data in dataset]
     infer_batch(engine, infer_requests)
examples/infer/demo_mllm.py (3 changes: 2 additions & 1 deletion)
@@ -119,7 +119,8 @@ def get_data(mm_type: Literal['text', 'image', 'video', 'audio']):
         dataset = 'AI-ModelScope/LaTeX_OCR:small#1000'
         engine = LmdeployEngine(model, vision_batch_size=8)
 
-    dataset = load_dataset([dataset], strict=False, seed=42)[0]
+    # Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset.
+    dataset = load_dataset([dataset], seed=42)[0]
     print(f'dataset: {dataset}')
     infer_requests = [InferRequest(**data) for data in dataset]
     infer_batch(engine, infer_requests)
requirements/install_all.sh (4 changes: 2 additions & 2 deletions)
@@ -1,7 +1,7 @@
 # please use python=3.10, cuda12.*
 # sh requirements/install_all.sh
-pip install vllm -U
-pip install lmdeploy -U --no-deps
+pip install "vllm>=0.5.1" -U
+pip install "lmdeploy>=0.5" -U --no-deps
 pip install autoawq!=0.2.7.post3 -U --no-deps
 pip install auto_gptq optimum bitsandbytes -U
 pip install git+https://github.com/modelscope/ms-swift.git#egg=ms-swift[all]
swift/llm/argument/infer_args.py (1 change: 0 additions & 1 deletion)
@@ -112,7 +112,6 @@ class InferArguments(MergeArguments, VllmArguments, LmdeployArguments, BaseArgum
     infer_backend: Literal['vllm', 'pt', 'lmdeploy'] = 'pt'
 
     result_path: Optional[str] = None
-    writer_buffer_size: int = 65536
     # for pt engine
     max_batch_size: int = 1
     ddp_backend: Optional[str] = None
swift/llm/infer/deploy.py (7 changes: 5 additions & 2 deletions)
@@ -5,6 +5,7 @@
 import time
 from contextlib import contextmanager
 from dataclasses import asdict
+from functools import partial
 from http import HTTPStatus
 from threading import Thread
 from typing import List, Optional, Union
@@ -153,10 +154,12 @@ def pre_infer_hook(kwargs):
             logger.info(request_info)
             return kwargs
 
-        self.infer_engine.pre_infer_hooks = [pre_infer_hook]
+        infer_kwargs['pre_infer_hook'] = pre_infer_hook
         try:
             res_or_gen = await self.infer_async(infer_request, request_config, template=self.template, **infer_kwargs)
-        except ValueError as e:
+        except Exception as e:
+            import traceback
+            print(traceback.format_exc())
             return self.create_error_response(HTTPStatus.BAD_REQUEST, str(e))
         if request_config.stream:
 
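Note: the two hunks above are the core of the fix. Previously the hook was stored on the shared engine object (`self.infer_engine.pre_infer_hooks = [pre_infer_hook]`), so it outlived the request it was created for and its logging could fire again on later calls; now it is passed per call via `infer_kwargs['pre_infer_hook']`, and a broader class of exceptions is printed with a traceback before the error response is returned. A minimal sketch of the new per-call hook shape, assuming a local `PtEngine` and an illustrative model id (neither is part of this commit):

import asyncio
from swift.llm import InferRequest, PtEngine, RequestConfig

def pre_infer_hook(kwargs):
    # Runs only for this request; nothing is stored on the engine,
    # so later requests do not re-trigger (and re-log) earlier hooks.
    print(f'pre-infer kwargs: {sorted(kwargs)}')
    return kwargs

engine = PtEngine('Qwen/Qwen2.5-0.5B-Instruct')  # assumed model id
request = InferRequest(messages=[{'role': 'user', 'content': 'Hello!'}])
resp = asyncio.run(
    engine.infer_async(request, RequestConfig(max_tokens=16), pre_infer_hook=pre_infer_hook))
print(resp.choices[0].message.content)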
swift/llm/infer/infer.py (3 changes: 1 addition & 2 deletions)
@@ -75,8 +75,7 @@ def get_infer_engine(args: InferArguments, **kwargs):

     def main(self):
         args = self.args
-        context = open_jsonl_writer(
-            args.result_path, buffer_size=args.writer_buffer_size) if args.result_path else nullcontext()
+        context = open_jsonl_writer(args.result_path) if args.result_path else nullcontext()
         with context as json_writer:
             self.jsonl_writer = json_writer
             return super().main()
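Note: with `writer_buffer_size` gone from `InferArguments`, the writer is opened with its defaults. The surrounding pattern, falling back to `nullcontext()` so the same `with` block works whether or not a result file was requested, is sketched below with a plain file standing in for the repository's `open_jsonl_writer`:

from contextlib import nullcontext

result_path = None  # e.g. from CLI arguments; None means "do not write results"
context = open(result_path, 'w') if result_path else nullcontext()
with context as json_writer:
    # json_writer is a real file object when result_path is set, otherwise None.
    if json_writer is not None:
        json_writer.write('{"response": "..."}\n')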
swift/llm/infer/infer_engine/infer_engine.py (9 changes: 6 additions & 3 deletions)
@@ -30,7 +30,6 @@ def _post_init(self):
         self.model_name = self.model_info.model_name
         self.max_model_len = self.model_info.max_model_len
         self.config = self.model_info.config
-        self.pre_infer_hooks = []
         if getattr(self, 'default_template', None) is None:
             self.default_template = get_template(self.model_meta.template, self.processor)
         self._adapters_pool = {}
@@ -60,7 +59,9 @@ async def _run_infer(i, task, queue, stream: bool = False):
                         queue.put((i, stream_response))
                 else:
                     queue.put((i, await task))
-            finally:
+            except Exception as e:
+                queue.put((i, e))
+            else:
                 queue.put((i, None))
 
         async def _batch_run(tasks):
@@ -78,7 +79,9 @@ async def _batch_run(tasks):

             while n_finished < len(new_tasks):
                 i, output = queue.get()
-                if output is None:  # is_finished
+                if isinstance(output, Exception):
+                    raise output
+                elif output is None:  # is_finished
                     n_finished += 1
                     prog_bar.update()
                 else:
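Note: before this change a failing task only put the `None` sentinel on the queue (via `finally`), so the consumer loop treated a failed task like a finished one; now the exception object itself travels through the queue and is re-raised by the consumer. A self-contained sketch of that pattern (the names and toy tasks are illustrative, not the library's code):

import asyncio
import queue

async def _run_infer(i, task, q):
    try:
        q.put((i, await task))
    except Exception as e:
        q.put((i, e))     # hand the failure to the consumer
    else:
        q.put((i, None))  # sentinel: task i finished cleanly

async def main():
    q = queue.Queue()

    async def ok():
        return 'hello'

    async def bad():
        raise ValueError('boom')

    await asyncio.gather(_run_infer(0, ok(), q), _run_infer(1, bad(), q))
    while not q.empty():
        i, output = q.get()
        if isinstance(output, Exception):
            raise output          # surface the original error
        elif output is None:      # is_finished
            print(f'task {i} finished')
        else:
            print(f'task {i} -> {output}')

try:
    asyncio.run(main())
except ValueError as e:
    print(f're-raised by the consumer: {e}')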
swift/llm/infer/infer_engine/lmdeploy_engine.py (3 changes: 2 additions & 1 deletion)
@@ -255,6 +255,7 @@ async def infer_async(self,
                           request_config: Optional[RequestConfig] = None,
                           *,
                           template: Optional[Template] = None,
+                          pre_infer_hook=None,
                           **kwargs) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionStreamResponse]]:
         request_config = deepcopy(request_config or RequestConfig())
         if template is None:
@@ -275,7 +276,7 @@
         generation_config = self._prepare_generation_config(request_config)
         self._add_stop_words(generation_config, request_config, template.template_meta)
         kwargs.update({'template': template, 'inputs': inputs, 'generation_config': generation_config})
-        for pre_infer_hook in self.pre_infer_hooks:
+        if pre_infer_hook:
             kwargs = pre_infer_hook(kwargs)
         if request_config.stream:
             return self._infer_stream_async(**kwargs)
swift/llm/infer/infer_engine/pt_engine.py (14 changes: 8 additions & 6 deletions)
@@ -348,15 +348,16 @@ async def infer_async(
         *,
         template: Optional[Template] = None,
         adapter_request: Optional[AdapterRequest] = None,
+        pre_infer_hook=None,
     ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionStreamResponse]]:
         # TODO:auto batch
         if request_config is None:
             request_config = RequestConfig()
-        res_or_gen = self.infer([infer_request],
-                                request_config,
-                                template=template,
-                                use_tqdm=False,
-                                adapter_request=adapter_request)
+        res_or_gen = self._infer([infer_request],
+                                 request_config,
+                                 template=template,
+                                 adapter_request=adapter_request,
+                                 pre_infer_hook=pre_infer_hook)
         if request_config.stream:
 
             async def _gen_wrapper():
@@ -376,6 +377,7 @@ def _infer(
         *,
         template: Optional[Template] = None,
         adapter_request: Optional[AdapterRequest] = None,
+        pre_infer_hook=None,
     ) -> Union[List[ChatCompletionResponse], Iterator[List[Optional[ChatCompletionStreamResponse]]]]:
         self.model.eval()
         request_config = deepcopy(request_config)
@@ -414,7 +416,7 @@ def _infer(
             'adapter_request': adapter_request,
             'template_inputs': template_inputs
         }
-        for pre_infer_hook in self.pre_infer_hooks:
+        if pre_infer_hook:
             kwargs = pre_infer_hook(kwargs)
         if request_config.stream:
 
swift/llm/infer/infer_engine/vllm_engine.py (26 changes: 10 additions & 16 deletions)
@@ -7,10 +7,8 @@
 from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Union
 
 import torch
-import vllm
 from packaging import version
 from transformers import GenerationConfig, PreTrainedTokenizerBase
-from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
 
 from swift.llm import InferRequest, Template, TemplateMeta, get_model_tokenizer
 from swift.plugin import Metric
@@ -21,6 +19,14 @@
 from .patch import patch_auto_config, patch_auto_tokenizer
 from .utils import AdapterRequest, InferStreamer
 
+try:
+    # After setting the environment variables, import vllm. This way of writing allows lint to pass.
+    os.environ['VLLM_ENGINE_ITERATION_TIMEOUT_S'] = '3600'
+    import vllm
+    from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
+except Exception:
+    raise
+
 logger = get_logger()
 dtype_mapping = {torch.float16: 'float16', torch.bfloat16: 'bfloat16', torch.float32: 'float32'}
 
@@ -50,7 +56,6 @@ def __init__(
             max_loras: int = 1,
             max_lora_rank: int = 16,
             engine_kwargs: Optional[Dict[str, Any]] = None) -> None:
-        self._init_env()
         self.processor = get_model_tokenizer(
             model_id_or_path,
             torch_dtype,
@@ -137,18 +142,6 @@ def _prepare_engine_kwargs(self,
         if max_model_len is not None:
             model_info.max_model_len = max_model_len
 
-    @staticmethod
-    def _init_env() -> None:
-        try:
-            from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel
-            destroy_model_parallel()
-        except ImportError:
-            pass
-        # fix HTTPError bug (use model_dir)
-        os.environ.pop('VLLM_USE_MODELSCOPE', None)
-        if version.parse(vllm.__version__) >= version.parse('0.5.1'):
-            os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
-
     def _fix_vllm_bug(self) -> None:
         # fix vllm==0.4 bug (very slow)
         tokenizer = self.tokenizer
@@ -364,6 +357,7 @@ async def infer_async(
         *,
         template: Optional[Template] = None,
         adapter_request: Optional[AdapterRequest] = None,
+        pre_infer_hook=None,
     ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionStreamResponse]]:
         request_config = deepcopy(request_config or RequestConfig())
         if template is None:
@@ -381,7 +375,7 @@
             'generation_config': generation_config,
             'adapter_request': adapter_request
         }
-        for pre_infer_hook in self.pre_infer_hooks:
+        if pre_infer_hook:
             kwargs = pre_infer_hook(kwargs)
         if request_config.stream:
             return self._infer_stream_async(**kwargs)
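Note: the new import block at the top of the file sets `VLLM_ENGINE_ITERATION_TIMEOUT_S` before vllm is imported, so the value is already in the environment when vllm reads it at import time; keeping the assignment and the import in one `try` block also satisfies import-order linting, as the in-code comment explains, and it replaces the old post-import `_init_env()` setup removed below it. A generic sketch of the same set-then-import pattern with placeholder names (the variable and the stand-in module are not real vllm details):

import os

try:
    # Must happen before the import: the library reads the variable at import time.
    os.environ['EXAMPLE_ITERATION_TIMEOUT_S'] = '3600'  # placeholder variable name
    import json as heavy_dependency                     # stand-in for `vllm`
except Exception:
    # Re-raise unchanged; the block exists for import-ordering reasons, not to hide errors.
    raise

print(heavy_dependency.dumps({'timeout': os.environ['EXAMPLE_ITERATION_TIMEOUT_S']}))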
