Dev chat_glm npu #544

Draft · wants to merge 8 commits into base: main
2 changes: 1 addition & 1 deletion configs/common/train.py
@@ -44,7 +44,7 @@
# Enable automatic mixed precision for training which does not
# change model's inference behavior.
amp=dict(enabled=False),

train_with_fp16=False,
# Enable activation checkpointing to allow for training
# with larger models, sequences, and batch sizes.
# If enabled, checkpoint the input activations of each transformer layers by default.
23 changes: 23 additions & 0 deletions configs/loadder_mapping.py
@@ -0,0 +1,23 @@
loader_mapping_models = dict(

llama=dict(
loader_prefix="projects.Llama.utils.llama_loader",
huggingface_loader="LlamaLoaderHuggerFace",
),

chatglm=dict(
loader_prefix="projects.ChatGLM.utils.chatglm_loader",
huggingface_loader="ChatGLMLoaderHuggerFace",
),

qwen2=dict(
loader_prefix="projects.Qwen2.utils.qwen_loader",
huggingface_loader="Qwen2LoaderHuggerFace",
),

aquila=dict(
loader_prefix="projects.Aquila.utils.aquila_loader",
huggingface_loader="AquilaLoaderHuggerFace",
)

)
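
For context, a minimal sketch of how a mapping like this could be resolved into a loader class at runtime; the resolve_huggingface_loader helper below is illustrative only and not part of this PR, and the actual wiring inside LiBai may differ:

import importlib

from configs.loadder_mapping import loader_mapping_models


def resolve_huggingface_loader(model_type: str):
    """Look up the registered entry for `model_type` and import its loader class."""
    entry = loader_mapping_models[model_type]
    module = importlib.import_module(entry["loader_prefix"])
    return getattr(module, entry["huggingface_loader"])


# Example (assumes the LiBai repo root is on PYTHONPATH so
# `projects.ChatGLM.utils.chatglm_loader` is importable):
# loader_cls = resolve_huggingface_loader("chatglm")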
4 changes: 2 additions & 2 deletions libai/data/build.py
@@ -153,7 +153,7 @@ def build_nlp_train_loader(
train_batch_size,
test_batch_size=None,
sampler=LazyCall(CyclicSampler)(shuffle=True),
num_workers=4,
num_workers=0,
consumed_samples=0,
seed=0,
collate_fn=None,
@@ -223,7 +223,7 @@ def build_nlp_test_loader(
dataset,
test_batch_size,
sampler=LazyCall(SingleRoundSampler)(shuffle=False, drop_last=False),
num_workers=4,
num_workers=0,
seed=0,
collate_fn=None,
):
2 changes: 1 addition & 1 deletion libai/data/structures.py
@@ -99,7 +99,7 @@ def stack(distTensor_lists: List["DistTensorData"]) -> "DistTensorData":
assert (
data.placement_idx == placement_idx
), f"placement_idx is not equal, {data.placement_idx} != {placement_idx}"
tensors.append(data.tensor)
tensors.append(data.tensor.to(flow.int64))
tensors = flow.stack(tensors, dim=0)
ret = DistTensorData(tensors, sbp_list=sbp_list, placement_idx=placement_idx)
return ret
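
A standalone illustration of the cast above: each tensor is normalized to int64 before flow.stack so the stacked batch has a uniform integer dtype (plain local tensors here, no SBP or placement):

import oneflow as flow

tensors = [
    flow.tensor([1, 2, 3], dtype=flow.int32),
    flow.tensor([4, 5, 6], dtype=flow.int64),
]
batch = flow.stack([t.to(flow.int64) for t in tensors], dim=0)
print(batch.dtype)  # oneflow.int64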
7 changes: 4 additions & 3 deletions libai/engine/hooks.py
@@ -341,9 +341,10 @@ def _do_eval(self):
def after_step(self):
next_iter = self.trainer.iter + 1
if self._period > 0 and next_iter % self._period == 0:
# do the last eval in after_train
if next_iter != self.trainer.max_iter:
self._do_eval()
# # do the last eval in after_train
# if next_iter != self.trainer.max_iter:
# self._do_eval()
pass

def after_train(self):
# This condition is to prevent the eval from running after a failed training
3 changes: 2 additions & 1 deletion libai/engine/trainer.py
@@ -132,6 +132,7 @@ def train(self, start_iter: int, max_iter: int):
Args:
start_iter, max_iter (int): See docs above
"""
# start_iter = 9980 # for profiling
logger = logging.getLogger(__name__)
logger.info("Starting training from iteration {}".format(start_iter))

@@ -283,7 +284,7 @@ def run_step(self, get_batch: Callable, input_placement_device: str = "cuda"):
if (self.iter + 1) % self.grad_acc_steps == 0:
self.optimizer.clip_grad()
self.optimizer.step()
self.optimizer.zero_grad()
self.optimizer.zero_grad(set_to_none=True)


class GraphTrainer(TrainerBase):
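
A small eager-mode sketch of what set_to_none=True changes: gradients are dropped rather than zero-filled, so the next backward pass allocates fresh gradient tensors. This assumes OneFlow's optimizer accepts the flag in the same way the wrapped optimizer above does:

import oneflow as flow

param = flow.nn.Parameter(flow.ones(2, 2))
opt = flow.optim.SGD([param], lr=0.1)

param.sum().backward()
opt.step()
opt.zero_grad(set_to_none=True)
print(param.grad)  # None instead of a zero-filled tensor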
4 changes: 2 additions & 2 deletions libai/evaluation/evaluator.py
@@ -203,12 +203,12 @@ def inference_on_dataset(

# get valid sample
valid_data = {
key: dist.tensor_to_rank0(value, to_local=True)[:valid_sample]
key: dist.tensor_to_rank0(value, to_local=True, device=input_placement_device)[:valid_sample]
for key, value in data.items()
}
valid_outputs = {}
for key, value in outputs.items():
value = dist.tensor_to_rank0(value, to_local=True)
value = dist.tensor_to_rank0(value, to_local=True, device=input_placement_device)
if value.ndim > 1:
valid_outputs[key] = value[:valid_sample] # Slice if it's batched output
else:
4 changes: 4 additions & 0 deletions libai/inference/basic.py
@@ -41,6 +41,7 @@ def __init__(
pipeline_parallel=None,
pipeline_stage_id=None,
pipeline_num_layers=None,
device_type="npu",
model_path=None,
mode="libai",
**kwargs,
@@ -59,6 +60,7 @@
pipeline_parallel,
pipeline_stage_id,
pipeline_num_layers,
device_type,
)
dist.setup_dist_util(self.cfg.train.dist)
logger.info(self.cfg.train.dist)
@@ -90,11 +92,13 @@ def update_cfg(
pipeline_parallel=1,
pipeline_stage_id=None,
pipeline_num_layers=None,
device_type="npu",
):
self.cfg.train.dist.data_parallel_size = data_parallel
self.cfg.train.dist.tensor_parallel_size = tensor_parallel
self.cfg.train.dist.pipeline_parallel_size = pipeline_parallel
self.cfg.train.dist.custom_pipeline_stage_id = pipeline_stage_id
self.cfg.train.dist.device_type = device_type
if pipeline_num_layers is not None:
self.cfg.train.dist.pipeline_num_layers = pipeline_num_layers

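
A minimal sketch of what the new device_type argument ends up recording; the real pipeline later reads these fields via dist.setup_dist_util. OmegaConf is used directly here just to show the resulting config shape:

from omegaconf import OmegaConf

cfg = OmegaConf.create({"train": {"dist": {}}})
cfg.train.dist.data_parallel_size = 1
cfg.train.dist.tensor_parallel_size = 1
cfg.train.dist.pipeline_parallel_size = 1
cfg.train.dist.device_type = "npu"  # the new field set by update_cfg
print(OmegaConf.to_yaml(cfg.train.dist))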
23 changes: 17 additions & 6 deletions libai/layers/cross_entropy.py
@@ -36,13 +36,24 @@ def forward(self, logits: flow.Tensor, target: flow.Tensor):
assert target.ndim == 2
assert logits.shape[0:2] == target.shape

target = target.to_global(placement=logits.placement)

# Change -1 in target to 0 because sparse_softmax_cross_entropy don't accept -1
target = target * (target >= 0)

lm_loss = flow._C.sparse_softmax_cross_entropy(
target = target.to(flow.int32) # NOTE:npu nll target only support int32 for now
target = target.to_global(placement=logits.placement)
lm_loss = flow._C.cross_entropy(
logits.view(-1, logits.shape[-1]),
target.view(-1),
None,
-100,
"none",
0.0
)

# target = target.to_global(placement=logits.placement)

# # Change -1 in target to 0 because sparse_softmax_cross_entropy don't accept -1
# target = target * (target >= 0)

# lm_loss = flow._C.sparse_softmax_cross_entropy(
# logits.view(-1, logits.shape[-1]),
# target.view(-1),
# )
return lm_loss
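
A local-tensor sketch of the new loss path, using the nn.CrossEntropyLoss module as a stand-in for the flow._C.cross_entropy call above: ignore_index=-100 replaces the old clamp-negative-targets-to-zero workaround, and reduction="none" keeps per-token losses the way sparse_softmax_cross_entropy did. The int32 cast in the diff is an NPU-specific constraint and is omitted here:

import oneflow as flow

logits = flow.randn(4, 10)             # (batch * seq_len, vocab_size)
target = flow.tensor([1, 2, -100, 3])  # -100 marks positions to ignore

loss_fn = flow.nn.CrossEntropyLoss(ignore_index=-100, reduction="none")
per_token_loss = loss_fn(logits, target)
print(per_token_loss)                  # the ignored position contributes zero loss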
5 changes: 4 additions & 1 deletion libai/models/gpt_model.py
@@ -244,7 +244,10 @@ def forward(self, input_ids, past_length=0):
bsz, seq_length = input_ids.size()

position_ids = self.position_ids[:, past_length : past_length + seq_length]
position_ids = position_ids.expand_as(input_ids).to_global(sbp=input_ids.sbp)
# position_ids = position_ids.expand_as(input_ids).to_global(sbp=input_ids.sbp)
position_ids = position_ids.expand_as(input_ids).to_global(
sbp=input_ids.sbp, placement=input_ids.placement
)

token_embeds = self.token_embeddings(input_ids)
position_embeds = self.position_embeddings(position_ids)
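
The fix above matters because to_global with only sbp keeps the source tensor's placement, so position_ids would not follow input_ids onto the NPU; passing placement explicitly moves it as well. A single-rank sketch of the pattern, with "cpu" substituted so it runs without NPU hardware:

import oneflow as flow

placement = flow.placement("cpu", ranks=[0])
input_ids = flow.zeros(2, 8, dtype=flow.int64).to_global(
    placement=placement, sbp=flow.sbp.broadcast
)

position_ids = flow.arange(8, dtype=flow.int64).unsqueeze(0).expand(2, 8).contiguous()
position_ids = position_ids.to_global(sbp=input_ids.sbp, placement=input_ids.placement)
print(position_ids.placement)  # same placement as input_ids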
6 changes: 5 additions & 1 deletion libai/models/utils/graph_base.py
@@ -102,8 +102,12 @@ def __init__(
def build(self, **kwargs):
if self.is_train:
placement_sbp_dict = (
# dict(
# placement=flow.env.all_device_placement("cuda"),
# sbp=flow.sbp.split(0),
# )
dict(
placement=flow.env.all_device_placement("cuda"),
placement=flow.env.all_device_placement("npu"),
sbp=flow.sbp.split(0),
)
if self.global_mode.enabled
7 changes: 7 additions & 0 deletions libai/models/utils/model_loader/base_loader.py
@@ -384,6 +384,10 @@ def _convert_tensor(self, tensor):
Returns:
flow.Tensor: The target tensor.
"""
import torch
if tensor.dtype == torch.bfloat16:
data = tensor.detach().half().cpu().numpy()
return flow.Tensor(data)
return flow.Tensor(tensor.detach().cpu().numpy())

def _convert_tensors(self, torch_state_dict):
@@ -490,6 +494,9 @@ def _load_torch_state_dict(self, state_dict_file, use_safetensors=False):
merged_state_dict = {}
for file in state_dict_file:
state_dict = torch.load(file, map_location="cpu")
# NOTE: align to libai oneflow_xpu
for k in state_dict.keys():
state_dict[k] = state_dict[k].to(torch.float)
merged_state_dict.update(state_dict)
return merged_state_dict

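
The bfloat16 branch above exists because NumPy has no bfloat16 dtype, so a torch bfloat16 weight has to be cast (to float16 here) before .numpy() and the oneflow.Tensor construction; the later loop similarly up-casts whole checkpoints to float32 before conversion. A minimal sketch of the tensor-level path, assuming both torch and oneflow are installed:

import torch
import oneflow as flow

t = torch.randn(2, 3, dtype=torch.bfloat16)
# t.numpy() would fail for bfloat16, so go through half precision first.
data = t.detach().half().cpu().numpy()
flow_tensor = flow.Tensor(data)
print(flow_tensor.shape, flow_tensor.dtype)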
6 changes: 3 additions & 3 deletions libai/tokenizer/tokenization_base.py
@@ -782,9 +782,9 @@ def convert_to_tensors(self, token_ids, return_tensors=None, is_global=False, **
return_token_ids = flow.tensor(token_ids, dtype=flow.long)
elif is_global:
sbp = kwargs.get("sbp", dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
placement = kwargs.get(
"placement", flow.placement("cuda", list(range(dist.get_world_size())))
)
placement = kwargs.get("placement")
if placement is None:
placement = flow.placement("npu", list(range(dist.get_world_size())))
return_token_ids = flow.tensor(
token_ids, sbp=sbp, placement=placement, dtype=flow.long
)
22 changes: 18 additions & 4 deletions libai/utils/distributed.py
@@ -228,7 +228,7 @@ def device_type(self):
return self._device_type

def set_device_type(self, device_type):
assert device_type in ["cpu", "cuda"], f"not supported for {device_type}"
assert device_type in ["cpu", "cuda", "npu"], f"not supported for {device_type}"
self._device_type = device_type

def get_layer_ranks(self, layer_idx):
@@ -431,14 +431,28 @@ def convert_to_distributed_default_setting(t):
)
else:
dist_util = get_dist_util()
if dist_util.device_type != "npu":
from omegaconf import DictConfig

setup_dist_util(
DictConfig(
dict(
data_parallel_size=1,
tensor_parallel_size=1,
pipeline_parallel_size=1,
device_type="npu",
)
)
)
dist_util = get_dist_util()
device_type = dist_util.device_type
return t.to_global(placement=flow.placement(device_type, ranks=t.placement.ranks))


def ttol(tensor, pure_local=False, ranks=None):
"""Global tensor to local tensor."""
if tensor.is_global:
placement = tensor.placement if not ranks else flow.placement("cuda", ranks)
placement = tensor.placement if not ranks else flow.placement(tensor.placement.type, ranks)
if pure_local:
tensor = tensor.to_global(placement=placement).to_local()
else:
@@ -457,9 +471,9 @@ def tton(tensor, local_only=False, ranks=None):
return tensor.numpy()


def tensor_to_rank0(tensor, device="cuda", to_local=False):
def tensor_to_rank0(tensor, device="npu", to_local=False):
"""Global tensor to rank0."""
assert device in ["cpu", "cuda"], f"not supported for device:{device}"
assert device in ["cpu", "cuda", "npu"], f"not supported for device:{device}"
if tensor.is_global:
# Consider if it's 2d mesh, ranks should be [[0]] instead of [0]
placement = flow.placement(device, ranks=[0] if tensor.placement.ranks.ndim == 1 else [[0]])
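
Taken together, the distributed.py changes extend the accepted device types to "npu" and make convert_to_distributed_default_setting re-initialize the dist util on NPU when it was configured for something else. A sketch of that re-initialization path, with "cpu" substituted for "npu" so it can run anywhere, assuming a single-process launch:

from omegaconf import DictConfig
from libai.utils import distributed as dist

dist.setup_dist_util(
    DictConfig(
        dict(
            data_parallel_size=1,
            tensor_parallel_size=1,
            pipeline_parallel_size=1,
            device_type="cpu",  # the PR hard-codes "npu" at this point
        )
    )
)
print(dist.get_dist_util().device_type)  # "cpu"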
2 changes: 2 additions & 0 deletions libai/version.py
@@ -0,0 +1,2 @@
__version__ = '0.2.0'
git_version = '229c4d9ee2bf6f881a9883176f1ea067254b3583'
6 changes: 4 additions & 2 deletions projects/ChatGLM/chatglm.py
@@ -22,8 +22,10 @@ def apply_rotary_pos_emb(x: flow.Tensor, rope_cache: flow.Tensor) -> flow.Tensor
x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
# truncate to support variable sizes
rope_cache = rope_cache[:sq]
xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
# xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
# rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
xshaped = dist.convert_to_distributed_default_setting(x.reshape(sq, -1, np, rot_dim // 2, 2))
rope_cache = dist.convert_to_distributed_default_setting(rope_cache.view(sq, -1, 1, xshaped.size(3), 2))
x_out2 = flow.cat(
[
(xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1]).unsqueeze(
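
For reference, a simplified local-tensor sketch of the rotary rotation this function applies, with shapes following the code above: x is (sq, b, np, hn) and the last dimension of rope_cache holds (cos, sin) pairs. The PR's change only wraps the reshaped tensors with dist.convert_to_distributed_default_setting so both operands share the same NPU placement:

import oneflow as flow

sq, b, np_, hn = 4, 1, 2, 8
rot_dim = hn // 2
x = flow.randn(sq, b, np_, hn)
rope_cache = flow.randn(sq, b, rot_dim // 2, 2)  # last dim holds (cos, sin)

x_rot, x_pass = x[..., :rot_dim], x[..., rot_dim:]
xshaped = x_rot.reshape(sq, -1, np_, rot_dim // 2, 2)
rc = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
x_out = flow.stack(
    [
        xshaped[..., 0] * rc[..., 0] - xshaped[..., 1] * rc[..., 1],
        xshaped[..., 1] * rc[..., 0] + xshaped[..., 0] * rc[..., 1],
    ],
    dim=-1,
).flatten(3)
out = flow.cat([x_out, x_pass], dim=-1)
print(out.shape)  # (4, 1, 2, 8)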
9 changes: 6 additions & 3 deletions projects/ChatGLM/configs/chatglm_config.py
@@ -5,10 +5,11 @@
from projects.ChatGLM.chatglm import ChatGLMForConditionalGeneration
from projects.ChatGLM.tokenizer import ChatGLMTokenizer
from configs.common.train import train

# from configs.train import train

cfg = dict(
# Model
model_type='chatglm',
add_bias_linear=False,
add_qkv_bias=True,
apply_query_key_layer_scaling=True,
@@ -61,7 +62,8 @@
output_scores=False,
output_hidden_states=False,
# train
pretrained_model_path=os.environ["CHATGLM_HF_DIR"],
# pretrained_model_path=os.environ["CHATGLM_HF_DIR"],
pretrained_model_path='/data0/hf_models/chatglm/chatglm2-6b',
# lora_cfg
lora_enable=False,
lora_cfg=dict(
@@ -87,5 +89,6 @@
tokenization = OmegaConf.create()
tokenization.make_vocab_size_divisible_by = 1
tokenization.tokenizer = LazyCall(ChatGLMTokenizer)(
vocab_file=f"{os.environ['CHATGLM_HF_DIR']}/tokenizer.model"
# vocab_file=f"{os.environ['CHATGLM_HF_DIR']}/tokenizer.model"
vocab_file=cfg.pretrained_model_path+"/tokenizer.model"
)
22 changes: 16 additions & 6 deletions projects/ChatGLM/configs/chatglm_sft.py
@@ -21,11 +21,14 @@
max_source_len = 128
max_target_len = 128
max_length = 256
dataset_path = os.environ["DATA_DIR"]
pretrained_model_path = os.environ["CHATGLM_HF_DIR"]
# dataset_path = os.environ["DATA_DIR"]
# pretrained_model_path = os.environ["CHATGLM_HF_DIR"]
dataset_path = './data/libai_xpu_alpaca'
pretrained_model_path = '/data0/hf_models/chatglm/chatglm2-6b'

# graph & optim
graph["enabled"] = True
# graph["enabled"] = True
graph["enabled"] = False

optim.update(
dict(
@@ -76,21 +79,28 @@
test_micro_batch_size=1,
train_epoch=3,
train_iter=1,
log_period=10,
# log_period=10,
log_period=1,
warmup_ratio=2 / 5,
num_accumulation_steps=8,
rdma_enabled=True,
amp=dict(enabled=True),
# amp=dict(enabled=True),
amp=dict(enabled=False),
# train_with_fp16=True,
train_with_fp16=False,
activation_checkpoint=dict(enabled=True),
input_placement_device='npu',
checkpointer=dict(
period=5000,
max_to_keep=1,
),
dist=dict(
data_parallel_size=1,
tensor_parallel_size=1,
pipeline_parallel_size=4,
# pipeline_parallel_size=4,
pipeline_parallel_size=1,
pipeline_num_layers=cfg.num_layers,
device_type='npu',
),
evaluation=dict(
enabled=False,
3 changes: 2 additions & 1 deletion projects/ChatGLM/dataset.py
@@ -24,7 +24,8 @@
from libai.utils.logger import setup_logger

IGNORE_INDEX = -100
logger = setup_logger()
# logger = setup_logger()
logger = setup_logger(name=__name__)


class ChatGLMTrainDataset(Dataset):