Check-in turbomind engine config (#909)
* add EngineConfig for turbomind
* add EngineGenerationConfig for turbomind
* add deprecated params warning
* update TurbomindModelConfig
* fix comments
* update prepare_inputs
* update TurbomindModelConfig
* update EngineConfig
* use default bad/stop words
* use default bad/stop words
* fix comments
* typo
* fix bad words
* add engine_config to turbomind.chat
* update EngineConfig
* rename generation_config -> gen_config
* update config
Showing 6 changed files with 285 additions and 201 deletions.
```diff
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from .engine_config import EngineConfig
 from .turbomind import TurboMind

-__all__ = ['TurboMind']
+__all__ = ['TurboMind', 'EngineConfig']
```
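The re-export above makes `EngineConfig` importable from the package root alongside `TurboMind`. A minimal sketch of the resulting import, assuming this hunk is the package `__init__.py` under `lmdeploy/turbomind/` (the relative imports and the OpenMMLab header suggest so, but the file path is not shown in this view):

```python
# Hypothetical usage enabled by the re-export; the package path
# `lmdeploy.turbomind` is an assumption, only the relative imports are shown.
from lmdeploy.turbomind import EngineConfig, TurboMind
```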
```diff
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from pydantic.dataclasses import dataclass
+
+
+@dataclass
+class EngineConfig:
+    """TurboMind Engine config.
+
+    Args:
+        model_name (str): the name of the deployed model
+        model_format (str): the layout of the deployed model. It can be one of the following values [hf, llama, awq], `hf` meaning `hf_llama`, `llama` meaning `meta_llama`, `awq` meaning the quantized model by AWQ.
+        group_size (int): the group size used when quantizing weights to 4bit, default to 128
+        tp (int): the number of GPU cards used in tensor parallelism, default to 1
+        session_len (int): the max session length of a sequence, default to None
+        max_batch_size (int): the max batch size during inference, default to 128
+        max_context_token_num (int): the max number of tokens to be processed in each forward pass, default to 1
+        cache_max_entry_count (float): the percentage of gpu memory occupied by the k/v cache, default to 0.5
+        cache_block_seq_len (int): the length of a sequence in a k/v block, default to 128
+        cache_chunk_size (int): the number of blocks each time TurboMind engine tries to realloc from gpu memory, default to -1. When it is -1,
+        num_tokens_per_iter (int): number of tokens to be processed per iteration, default to 0
+        max_prefill_iters (int): max prefill iters for a single request, default to 1
+        use_context_fmha (int): whether or not to use fmha in context decoding, default to 1
+        quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4
+        rope_scaling_factor (float): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformer LlamaAttention
+        use_dynamic_ntk (bool): whether or not to use dynamic ntk, default to False
+        use_logn_attn (bool): whether or not to use logn attn, default to False
+        kv_bits (int): the number of bits of k/v after quantization, default to 8
+    """ # noqa: E501
+
+    model_name: str = None
+    model_format: str = None
+    tp: int = 1
+    session_len: int = None
+    max_batch_size: int = 128
+    group_size: int = 128
+    kv_bits: int = 8
+    max_context_token_num: int = 1
+    cache_max_entry_count: float = 0.5
+    cache_block_seq_len: int = 128
+    cache_chunk_size: int = -1
+    num_tokens_per_iter: int = 0
+    max_prefill_iters: int = 1
+    use_context_fmha: int = 1
+    quant_policy: int = 0
+    rope_scaling_factor: float = 0.0
+    use_dynamic_ntk: bool = False
+    use_logn_attn: bool = False
```
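Since `EngineConfig` is declared with `pydantic.dataclasses.dataclass`, field values are validated (and coerced where possible) at construction time. Below is a minimal usage sketch, assuming only the fields defined above; the model name and the parameter values are illustrative, and how the config is handed to `TurboMind` lives in `turbomind.py`, which is not part of this hunk:

```python
from lmdeploy.turbomind import EngineConfig  # package path assumed, see note above

# Override a few engine knobs; every other field keeps the defaults shown above.
config = EngineConfig(
    model_name='llama2',        # hypothetical deployed model name
    model_format='hf',          # one of: hf, llama, awq
    tp=2,                       # tensor parallelism across 2 GPUs
    session_len=4096,           # max session length of a sequence
    cache_max_entry_count=0.5,  # fraction of GPU memory for the k/v cache
    quant_policy=4,             # 4 => 8-bit quantized k/v cache
)

# pydantic coerces compatible inputs (e.g. tp='2' becomes the int 2) and
# raises a validation error for values it cannot convert.
print(config)
```

Per the commit message, the same object is also threaded through `turbomind.chat` via the new `engine_config` argument.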