From 4ee042a8e64a00ad070e9321c712cd3b77ae2de4 Mon Sep 17 00:00:00 2001
From: saber <3082548039@qq.com>
Date: Sun, 26 Mar 2023 13:44:10 +0800
Subject: [PATCH] =?UTF-8?q?=E5=B0=86=E9=85=8D=E7=BD=AEdevice=5Fmap?=
 =?UTF-8?q?=E7=9A=84=E9=80=BB=E8=BE=91=E6=8A=BD=E7=A6=BB,=20=E6=A0=B9?=
 =?UTF-8?q?=E6=8D=AEgpu=E6=95=B0=E9=87=8F=E8=87=AA=E5=8A=A8=E9=85=8D?=
 =?UTF-8?q?=E7=BD=AEdevice=5Fmap=EF=BC=8C=E5=B9=B6=E4=B8=94=E8=87=AA?=
 =?UTF-8?q?=E5=8A=A8=E9=80=82=E9=85=8D=E6=89=80=E6=9C=89=E6=A8=A1=E5=9E=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 chatglm_parallel.py | 34 ++++++++++++++++++++++------------
 requirements.txt    |  1 +
 web_demo.py         |  2 +-
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/chatglm_parallel.py b/chatglm_parallel.py
index e500d64c..da3f776f 100644
--- a/chatglm_parallel.py
+++ b/chatglm_parallel.py
@@ -3,28 +3,38 @@
 Date: 2023-03-23 09:18:13
 Description: 将模型加载到多张GPU卡中,根据gpu的数量自动分配平均的显存占用
 '''
+from typing import Dict
 
-from transformers import AutoModel, AutoTokenizer
 from accelerate import load_checkpoint_and_dispatch
+from transformers import AutoModel
 
 
-def load_model_on_gpus(checkpoint_path, num_gpus=2):
-    # 总共占用13GB显存,28层transformer每层0.39GB左右
-    # 第一层 word_embeddings和最后一层 lm_head 层各占用1.2GB左右
+def auto_configure_device_map(num_gpus) -> Dict[str, int]:
+    # transformer.word_embeddings 占用1层
+    # transformer.final_layernorm 和 lm_head 占用1层
+    # transformer.layers 占用 28 层
+    # 总共30层分配到num_gpus张卡上
     num_trans_layers = 28
-    vram_per_layer = 0.39
-    average = 13/num_gpus
-    used = 1.2
+    per_gpu_layers = 30 / num_gpus
+
     device_map = {'transformer.word_embeddings': 0,
-                  'transformer.final_layernorm': num_gpus-1, 'lm_head': num_gpus-1}
+                  'transformer.final_layernorm': num_gpus - 1, 'lm_head': num_gpus - 1}
+
+    used = 1
     gpu_target = 0
     for i in range(num_trans_layers):
-        if used > average-vram_per_layer/2 and gpu_target < num_gpus:
+        if used >= per_gpu_layers:
             gpu_target += 1
             used = 0
-        else:
-            used += vram_per_layer
-        device_map['transformer.layers.%d' % i] = gpu_target
+        assert gpu_target < num_gpus
+        device_map[f'transformer.layers.{i}'] = gpu_target
+        used += 1
+
+    return device_map
+
+
+def load_model_on_gpus(checkpoint_path, num_gpus=2):
+    device_map = auto_configure_device_map(num_gpus)
 
     model = AutoModel.from_pretrained(
         checkpoint_path, trust_remote_code=True)
diff --git a/requirements.txt b/requirements.txt
index 29484801..f311c857 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ icetk
 cpm_kernels
 torch>=1.10
 gradio
+accelerate
\ No newline at end of file
diff --git a/web_demo.py b/web_demo.py
index 07ddc339..6f4f34d7 100644
--- a/web_demo.py
+++ b/web_demo.py
@@ -1,4 +1,4 @@
-from transformers import AutoModel, AutoTokenizer
+from transformers import AutoTokenizer
 import gradio as gr
 from chatglm_parallel import load_model_on_gpus