Model: Add Tensor Parallel support
Use the tensor parallel loader when the flag is enabled. The new loader
has its own autosplit implementation, so gpu_split_auto isn't valid
here.

Also simplify cache selection: choose the cache class once instead of
duplicating the constructor across multiple if/else branches.

Signed-off-by: kingbri <[email protected]>
bdashore3 committed Aug 22, 2024
1 parent 5002617 commit 871c890
Showing 4 changed files with 109 additions and 53 deletions.
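
As rough orientation for the diff below, the way the GPU split options interact after this change can be sketched as follows. This is a hypothetical condensation of the new logic in backends/exllamav2/model.py, not the shipped code; the parameter names mirror the kwargs used there.

# Hypothetical condensation of the new split handling (illustrative only).
def resolve_split(gpu_count, use_tp, has_tp, gpu_split, gpu_split_auto):
    tp_enabled = False
    manual_split = None
    autosplit = False

    if gpu_count == 1:
        # One GPU in use: every split option is disabled
        return tp_enabled, manual_split, autosplit

    if use_tp and has_tp:
        # The TP loader has its own autosplit, so gpu_split_auto never applies to it
        tp_enabled = True

    if gpu_split:
        # An explicit split disables autosplit; the TP loader can also consume it
        manual_split = gpu_split
    elif gpu_split_auto and not tp_enabled:
        # Otherwise fall back to exl2 autosplit with per-GPU VRAM reserves
        autosplit = True

    return tp_enabled, manual_split, autosplit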
147 changes: 94 additions & 53 deletions backends/exllamav2/model.py
@@ -50,6 +50,14 @@
from common.transformers_utils import GenerationConfig, HuggingFaceConfig
from common.utils import coalesce, unwrap

# Dynamic imports
try:
from exllamav2 import ExLlamaV2Cache_TP

has_tp = True
except ImportError:
has_tp = False


class ExllamaV2Container:
"""The model container class for ExLlamaV2 models."""
@@ -78,6 +86,7 @@ class ExllamaV2Container:
gpu_split: Optional[list] = None
gpu_split_auto: bool = True
autosplit_reserve: List[float] = [96 * 1024**2]
use_tp: bool = False

# Load state
model_is_loading: bool = False
@@ -144,30 +153,52 @@ def progress(loaded_modules: int, total_modules: int,
# Turn off GPU split if the user is using 1 GPU
gpu_count = torch.cuda.device_count()
gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
use_tp = unwrap(kwargs.get("tensor_parallel"), False)
gpu_split = kwargs.get("gpu_split")
gpu_device_list = list(range(0, gpu_count))

if gpu_count > 1 and gpu_split_auto:
# Auto GPU split parameters
self.gpu_split_auto = gpu_split_auto

autosplit_reserve_megabytes = unwrap(kwargs.get("autosplit_reserve"), [96])
self.autosplit_reserve = [
int(math.ceil(value * 1024**2)) for value in autosplit_reserve_megabytes
]
elif gpu_count > 1:
# Manual GPU split
self.gpu_split = kwargs.get("gpu_split")
self.gpu_split_auto = False

gpu_device_list = [
device_idx
for device_idx, memory in enumerate(self.gpu_split)
if memory > 0
]
else:
# One GPU setup
# Set GPU split options
if gpu_count == 1:
self.gpu_split_auto = False
logger.info("Disabling GPU split because one GPU is in use.")
else:
# Set tensor parallel
if use_tp:
if has_tp:
self.use_tp = True

# TP has its own autosplit loader
self.gpu_split_auto = False
else:
# TODO: Remove conditional with exl2 v0.1.9 release
logger.warning(
"Tensor parallelism is not supported in the "
"current ExllamaV2 version."
)

# Enable manual GPU split if provided
if gpu_split:
self.gpu_split_auto = False
self.gpu_split = gpu_split

gpu_device_list = [
device_idx
for device_idx, memory in enumerate(self.gpu_split)
if memory > 0
]
elif gpu_split_auto and not self.use_tp:
# Otherwise fallback to autosplit settings
self.gpu_split_auto = gpu_split_auto

autosplit_reserve_megabytes = unwrap(
kwargs.get("autosplit_reserve"), [96]
)

# Reserve VRAM for each GPU
self.autosplit_reserve = [
int(math.ceil(value * 1024**2))
for value in autosplit_reserve_megabytes
]

self.config = ExLlamaV2Config()
self.config.model_dir = str(model_directory.resolve())
@@ -182,10 +213,7 @@ def progress(loaded_modules: int, total_modules: int,
self.config.prepare()

# Check if the model arch is compatible with various exl2 features
try:
self.config.arch_compat_overrides()
except AttributeError:
pass
self.config.arch_compat_overrides()

# Create the hf_config
self.hf_config = HuggingFaceConfig.from_file(model_directory)
@@ -548,9 +576,11 @@ def progress(loaded_modules: int, total_modules: int)
if not self.quiet:
logger.info("Loading draft model: " + self.draft_config.model_dir)

# Draft uses the autosplit loader, so create a cache that reflects this
self.draft_cache = self.create_cache(
cache_mode=self.draft_cache_mode,
autosplit=True,
use_tp=False,
)

for value in self.draft_model.load_autosplit_gen(
@@ -572,7 +602,17 @@ def progress(loaded_modules: int, total_modules: int)

# Load model with manual split
# Entrypoint for single GPU users
if not self.gpu_split_auto:
if self.use_tp:
logger.info("Loading with tensor parallel")

for value in self.model.load_tp_gen(
self.gpu_split,
callback_gen=progress_callback,
expect_cache_tokens=self.cache_size,
):
if value:
yield value
elif not self.gpu_split_auto:
logger.info("Loading with a manual GPU split (or a one GPU setup)")

for value in self.model.load_gen(
@@ -582,13 +622,15 @@ def progress(loaded_modules: int, total_modules: int)
if value:
yield value

# Create the model cache
self.cache = self.create_cache(
cache_mode=self.cache_mode,
autosplit=self.gpu_split_auto,
use_tp=self.use_tp,
)

# Load model with autosplit
if self.gpu_split_auto:
# Load model with autosplit (without TP)
if self.gpu_split_auto and not self.use_tp:
logger.info("Loading with autosplit")

for value in self.model.load_autosplit_gen(
@@ -604,36 +646,35 @@ def progress(loaded_modules: int, total_modules: int)
input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
self.model.forward(input_ids, cache=self.cache, preprocess_only=True)

def create_cache(self, cache_mode: str, autosplit: bool):
def create_cache(self, cache_mode: str, autosplit: bool, use_tp: bool):
if has_tp and use_tp:
if self.cache_mode != "FP16":
logger.warning(
"Tensor parallel does not currently allow for use of "
"a quantized K/V cache. Using the specialized TP cache."
)

return ExLlamaV2Cache_TP(
self.model,
max_seq_len=self.cache_size,
batch_size=1,
)

cache_type = ExLlamaV2Cache
match cache_mode:
case "Q4":
return ExLlamaV2Cache_Q4(
self.model,
max_seq_len=self.cache_size,
lazy=autosplit,
batch_size=1,
)
cache_type = ExLlamaV2Cache_Q4
case "Q6":
return ExLlamaV2Cache_Q6(
self.model,
max_seq_len=self.cache_size,
lazy=self.gpu_split_auto,
batch_size=1,
)
cache_type = ExLlamaV2Cache_Q6
case "Q8":
return ExLlamaV2Cache_Q8(
self.model,
max_seq_len=self.cache_size,
lazy=autosplit,
batch_size=1,
)
case _:
return ExLlamaV2Cache(
self.model,
max_seq_len=self.cache_size,
lazy=self.gpu_split_auto,
batch_size=1,
)
cache_type = ExLlamaV2Cache_Q8

return cache_type(
self.model,
max_seq_len=self.cache_size,
lazy=autosplit,
batch_size=1,
)

async def create_generator(self):
try:
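Design note on the create_cache rewrite above: the FP16, Q4, Q6, and Q8 cache classes are constructed with the same arguments, so the method only needs to pick a class and call one constructor. A minimal sketch of the same selection written as a lookup table, purely illustrative (the commit itself uses match/case; the import mirrors the ones already used in this module):

from exllamav2 import (
    ExLlamaV2Cache,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Cache_Q6,
    ExLlamaV2Cache_Q8,
)

# Map a cache_mode string to a cache class; anything unrecognized means FP16.
CACHE_CLASSES = {
    "Q4": ExLlamaV2Cache_Q4,
    "Q6": ExLlamaV2Cache_Q6,
    "Q8": ExLlamaV2Cache_Q8,
}

def pick_cache_class(cache_mode: str):
    return CACHE_CLASSES.get(cache_mode, ExLlamaV2Cache)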
5 changes: 5 additions & 0 deletions common/args.py
@@ -107,6 +107,11 @@ def add_model_args(parser: argparse.ArgumentParser):
type=str_to_bool,
help="Overrides base model context length",
)
model_group.add_argument(
"--tensor-parallel",
type=str_to_bool,
help="Use tensor parallelism to load models",
)
model_group.add_argument(
"--gpu-split-auto",
type=str_to_bool,
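Because --tensor-parallel is typed with str_to_bool, it takes an explicit value on the command line. A minimal, self-contained sketch of how such a flag parses; the str_to_bool below is a stand-in, not the project's own helper:

import argparse

def str_to_bool(value: str) -> bool:
    # Stand-in for the project's helper (assumption, not the real implementation)
    return value.strip().lower() in ("1", "true", "yes", "on")

parser = argparse.ArgumentParser()
parser.add_argument(
    "--tensor-parallel",
    type=str_to_bool,
    help="Use tensor parallelism to load models",
)

args = parser.parse_args(["--tensor-parallel", "True"])
print(args.tensor_parallel)  # -> True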
7 changes: 7 additions & 0 deletions config_sample.yml
@@ -109,6 +109,12 @@ model:
# Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
#override_base_seq_len:

# Load model with tensor parallelism
# If a GPU split isn't provided, the TP loader will fall back to autosplit
# Enabling this ignores the gpu_split_auto and autosplit_reserve values
# NOTE: Requires a development build of exllamav2
#tensor_parallel: False

# Automatically allocate resources to GPUs (default: True)
# NOTE: Not parsed for single GPU users
#gpu_split_auto: True
@@ -118,6 +124,7 @@ model:
#autosplit_reserve: [96]

# An integer array of GBs of vram to split between GPUs (default: [])
# Used with tensor parallelism
# NOTE: Not parsed for single GPU users
#gpu_split: [20.6, 24]

3 changes: 3 additions & 0 deletions endpoints/core/types/model.py
@@ -96,6 +96,9 @@ class ModelLoadRequest(BaseModel):
default_factory=lambda: get_config_default("cache_size"),
examples=[4096],
)
tensor_parallel: Optional[bool] = Field(
default_factory=lambda: get_config_default("tensor_parallel", False)
)
gpu_split_auto: Optional[bool] = Field(
default_factory=lambda: get_config_default("gpu_split_auto", True)
)
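With tensor_parallel on ModelLoadRequest, a client can opt into TP (optionally alongside an explicit gpu_split) when asking the server to load a model. A hedged sketch using the requests library; the URL, endpoint path, and model name are placeholders and are not taken from this commit; only the field names mirror ModelLoadRequest:

import requests

payload = {
    "name": "my-model",        # placeholder model name
    "tensor_parallel": True,   # field added in this commit
    "gpu_split": [20.6, 24],   # optional; without it the TP loader falls back to its own autosplit
}

response = requests.post(
    "http://localhost:5000/v1/model/load",  # placeholder host and path
    json=payload,
    timeout=600,
)
print(response.status_code)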
