
Commit

Reworked the chatglm model to be pinned on a specific upstream commit id for #3.

ljsabc committed Apr 5, 2023
1 parent 53471c1 commit 1becf4b
Showing 7 changed files with 51 additions and 1,990 deletions.
29 changes: 15 additions & 14 deletions README.md
@@ -40,9 +40,10 @@ Colab demo (GPU required): [![Open In Colab](https://colab.research.google.com
- [x] LoRA finetuning with multiple GPUs
- [x] Basic prompt engineering for original posts
- [x] Hyperparameter tuning (incl. LoRA rank, batch size, learning rate, etc.)
-- [ ] Allow in-reply-to and quoted tweets to be downloaded, for now it can only generate random tweets/replies/quotes
+- [x] Allow in-reply-to tweets to be downloaded.
- [x] Advanced prompt engineering from OpenAI
- [x] Colab notebook for easy deployment (I believe this code can surely run on T4 as we are expecting much shortened tokens)
+- [ ] Download quoted tweets.
- [ ] Support other datasets (e.g. Reddit, Weibo, etc. Future plan, let's discuss in #2 .)

## Environment and Installation
@@ -104,45 +105,45 @@ Colab demo (GPU required): [![Open In Colab](https://colab.research.google.com
finetune.py \
    --dataset_path tweets.tokens \
    --lora_rank 8 \
-    --per_device_train_batch_size 2 \
+    --per_device_train_batch_size 8 \
    --gradient_accumulation_steps 1 \
-    --num_train_epoch 1 \
+    --num_train_epoch 2 \
    --save_steps 2000 \
    --save_total_limit 2 \
-    --learning_rate 2e-4 \
+    --learning_rate 6e-4 \
    --fp16 \
    --remove_unused_columns false \
-    --logging_steps 20 \
+    --logging_steps 50 \
    --output_dir output \
    --ddp_find_unused_parameters false \
    --warmup_steps 50

-to launch multi-GPU training. If multi-GPU training throws an error,
+to launch multi-GPU training. If multi-GPU training throws an error, simply running it again will usually fix it; it is a minor caching bug.
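
As a rough illustration of how the numbers above interact, the effective batch size is the product of GPU count, per-device batch size, and gradient accumulation steps, and the "2e-4 per ~8 samples" learning rate quoted further below can be scaled from it. The GPU count here is an assumption for the example, not a requirement of the repo:

```python
# Hypothetical setup: 4 GPUs is only an example; adjust to your hardware.
num_gpus = 4
per_device_train_batch_size = 8
gradient_accumulation_steps = 1

effective_batch = num_gpus * per_device_train_batch_size * gradient_accumulation_steps
base_lr = 2e-4  # README default, quoted per ~8 samples
linear_scaled_lr = base_lr * effective_batch / 8

print(effective_batch, linear_scaled_lr)  # 32 samples/step -> 8e-4 under pure linear scaling
```

Linear scaling is only a starting point: the multi-GPU command above settles on a more conservative 6e-4, and the notes below recommend lowering the rate further if the loss spikes.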

Single-GPU training:

python finetune.py \
    --dataset_path tweets.tokens \
    --lora_rank 8 \
-    --per_device_train_batch_size 2 \
-    --gradient_accumulation_steps 4 \
-    --num_train_epoch 1 \
+    --per_device_train_batch_size 8 \
+    --gradient_accumulation_steps 1 \
+    --num_train_epoch 2 \
    --save_steps 2000 \
    --save_total_limit 2 \
    --learning_rate 2e-4 \
    --fp16 \
    --remove_unused_columns false \
-    --logging_steps 20 \
+    --logging_steps 50 \
    --output_dir output \
-    --warmup_steps 50
+    --warmup_steps 100

-Hyperparameter tuning for this project is still being explored. The current parameters are very similar to [ChatGLM+LoRa](https://github.com/mymusise/ChatGLM-Tuning/), but the learning rate can be adjusted according to the number of GPUs. The default learning rate is `2e-4` (per 8 samples; if the loss spikes you may need to lower it further), so please test and tune it yourself according to your batch size and GPU capability. The LoRA rank can be adjusted to the model quality you want: the default of 8 is sufficient, but you can raise it to 12 or even higher. After some testing, pushing `lora_rank` up to 16 lifts the results by a clear step, at the cost of slightly longer training and inference time, though not by much.
+The current parameters are very similar to [ChatGLM+LoRa](https://github.com/mymusise/ChatGLM-Tuning/), but the learning rate can be adjusted according to the number of GPUs. The default learning rate is `2e-4` (per 8 samples; if the loss spikes you may need to lower it further), so please test and tune it yourself according to your batch size and GPU capability. Now that upstream has added gradient checkpointing, a 3090 can even run a batch size of 8, which is a much better experience. The LoRA rank can be adjusted to the model quality you want: the default of 8 is sufficient, but you can raise it to 12 or even higher. After some testing, pushing `lora_rank` up to 16 lifts the results by a clear step, at the cost of slightly longer training and inference time, though not by much.
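
For reference, the rank discussed above ends up in the `LoraConfig` that `finetune.py` builds with peft. A minimal sketch; `target_modules` is an assumption here (ChatGLM attention projections are commonly addressed as `query_key_value`) and may differ from what the script actually passes:

```python
from peft import LoraConfig, TaskType

# r is the LoRA rank: 8 is the default discussed above; 12-16 trades a bit
# more training/inference time for noticeably better results.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query_key_value"],  # assumption, see note above
)
```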

The trained model is saved under the `output` folder; you can find the corresponding model files in `output/`.

## Inference (and interaction)

-Call `infer.py` to chat with the model. You can type in any question (though for now that is not very useful), and even with no input at all it will generate a tweet that reads a lot like mine.
+Call `infer.py` to chat with the model. You can type in any question (since the data is tweets, declarative statements work better than questions), and even with no input at all it will generate a tweet that reads a lot like mine.

```python3 ./infer.py <path_to_model>```
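
A minimal sketch of the kind of loading `infer.py` has to perform, assuming the adapter was written by the `save_model` override in this commit (`output/adapter_model.bin`) and that the LoRA hyperparameters are rebuilt to match training; the actual script may differ:

```python
import torch
from transformers import AutoTokenizer, AutoModel
from peft import LoraConfig, TaskType, get_peft_model, set_peft_model_state_dict

tokenizer = AutoTokenizer.from_pretrained(
    "THUDM/chatglm-6b", trust_remote_code=True, revision="fdb7a60"
)
model = AutoModel.from_pretrained(
    "THUDM/chatglm-6b", trust_remote_code=True, revision="fdb7a60"
).half().cuda()

# Rebuild the same LoRA shape used at training time (r and target_modules must match).
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=32, lora_dropout=0.1,
    target_modules=["query_key_value"],  # assumption, see note above
)
model = get_peft_model(model, peft_config)
set_peft_model_state_dict(model, torch.load("output/adapter_model.bin", map_location="cpu"))
model.eval()

# ChatGLM's remote code exposes chat(); even an empty prompt yields a tweet-like reply.
response, _history = model.chat(tokenizer, "", history=[])
print(response)
```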

@@ -164,4 +165,4 @@ Inspired by the following projects:

tloen/alpaca-lora
HuggingFace: KBlueLeaf/guanaco-7B-lora-embed
-(potentially) twint-fork
+(potentially) twint-fork
92 changes: 0 additions & 92 deletions configuration_chatglm.py

This file was deleted.

84 changes: 21 additions & 63 deletions finetune.py
@@ -1,7 +1,6 @@
from transformers import TrainingArguments
from transformers import Trainer, HfArgumentParser
-from transformers import AutoTokenizer
-from modeling_chatglm import ChatGLMForConditionalGeneration
+from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_int8_training
@@ -10,7 +9,7 @@
import os


-tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, revision="fdb7a60")


@dataclass
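
Because `trust_remote_code=True` executes modeling and tokenization code fetched from the Hub, pinning a revision keeps that code from changing underneath the script, which is the point of this commit. A small sketch of the same idea applied to both halves (the model half of the pinning appears further down in this diff):

```python
from transformers import AutoTokenizer, AutoModel

REVISION = "fdb7a60"  # upstream THUDM/chatglm-6b commit targeted by this change

tokenizer = AutoTokenizer.from_pretrained(
    "THUDM/chatglm-6b", trust_remote_code=True, revision=REVISION
)
model = AutoModel.from_pretrained(
    "THUDM/chatglm-6b", trust_remote_code=True, revision=REVISION
)
```
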
@@ -25,90 +24,45 @@ def forward(self, x):
        return super().forward(x).to(torch.float32)


-def get_masks_and_position_ids(
-    seq, seq_len, context_length, device, gmask=False, position_encoding_2d=True
-):
-    mask_position = (
-        seq_len - 2
-    )  # is equal to `seq.index(mask_token)` or `seq.index(150001)`
-    attention_mask = torch.ones((1, context_length, context_length), device=device)
-    attention_mask.tril_()
-    attention_mask[..., : mask_position - 1] = 1
-    attention_mask = (attention_mask < 0.5).bool()
-
-    if position_encoding_2d:
-        seq_length = seq_len - 1  # is equal to `seq_length = seq.index(150004)`
-        position_ids = torch.arange(context_length, dtype=torch.long, device=device)
-        if not gmask:
-            position_ids[seq_length:] = mask_position
-        block_position_ids = torch.cat(
-            (
-                torch.zeros(seq_length, dtype=torch.long, device=device),
-                torch.arange(
-                    context_length - seq_length, dtype=torch.long, device=device
-                )
-                + 1,
-            )
-        )
-        position_ids = torch.stack((position_ids, block_position_ids), dim=0)
-    else:
-        position_ids = torch.arange(context_length, dtype=torch.long, device=device)
-        if not gmask:
-            position_ids[context_length - 1 :] = mask_position
-    return attention_mask, position_ids
-
-
def data_collator(features: list) -> dict:
    len_ids = [len(feature["input_ids"]) for feature in features]
    longest = max(len_ids)
    input_ids = []
-    attention_mask_list = []
-    position_ids_list = []
    labels_list = []
    for ids_l, feature in sorted(zip(len_ids, features), key=lambda x: -x[0]):
        ids = feature["input_ids"]
        seq_len = feature["seq_len"]
        labels = (
-            [-100] * (seq_len - 1)
-            + ids[(seq_len - 1) :]
-            + [-100] * (longest - ids_l)
+            [-100] * (seq_len - 1) + ids[(seq_len - 1) :] + [-100] * (longest - ids_l)
        )
        ids = ids + [tokenizer.pad_token_id] * (longest - ids_l)
        _ids = torch.LongTensor(ids)
-        attention_mask, position_ids = get_masks_and_position_ids(
-            ids, seq_len, longest, _ids.device, gmask=False
-        )
        labels_list.append(torch.LongTensor(labels))
        input_ids.append(_ids)
-        attention_mask_list.append(attention_mask)
-        position_ids_list.append(position_ids)
    input_ids = torch.stack(input_ids)
    labels = torch.stack(labels_list)
-    attention_mask = torch.stack(attention_mask_list)
-    position_ids = torch.stack(position_ids_list)
    return {
        "input_ids": input_ids,
        "labels": labels,
-        "attention_mask": attention_mask,
-        "position_ids": position_ids,
    }


class ModifiedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        return model(
            input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            position_ids=inputs["position_ids"],
            labels=inputs["labels"],
        ).loss

+    def save_model(self, output_dir=None, _internal_call=False):
+        from transformers.trainer import TRAINING_ARGS_NAME

-def save_tunable_parameters(model, path):
-    saved_params = {
-        k: v.to("cpu") for k, v in model.named_parameters() if v.requires_grad
-    }
-    torch.save(saved_params, path)
+        os.makedirs(output_dir, exist_ok=True)
+        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
+        saved_params = {
+            k: v.to("cpu") for k, v in self.model.named_parameters() if v.requires_grad
+        }
+        torch.save(saved_params, os.path.join(output_dir, "adapter_model.bin"))


def main():
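
As context for the simplified `data_collator` above, here is a small worked example of the label-masking scheme it implements: prompt tokens and padding are labelled `-100` (ignored by the loss), and only the target span keeps its token ids. The numbers below are toy values, not real ChatGLM vocabulary ids:

```python
import torch

# Toy feature: a 6-token prompt followed by 3 target tokens. seq_len is the
# prompt length; following the collator above, everything before position
# seq_len - 1 and all padding is masked out of the loss with -100.
feature = {"input_ids": [11, 12, 13, 14, 15, 16, 21, 22, 23], "seq_len": 6}
longest = 12       # length of the longest sequence in this hypothetical batch
pad_token_id = 0   # placeholder; the real collator uses tokenizer.pad_token_id

ids = feature["input_ids"]
ids_l = len(ids)
seq_len = feature["seq_len"]

labels = [-100] * (seq_len - 1) + ids[(seq_len - 1):] + [-100] * (longest - ids_l)
ids = ids + [pad_token_id] * (longest - ids_l)

print(torch.LongTensor(ids))     # padded input_ids, length 12
print(torch.LongTensor(labels))  # -100 over prompt and padding, token ids over the target
```
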
@@ -124,19 +78,23 @@ def main():
device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}

# init model
-    model = ChatGLMForConditionalGeneration.from_pretrained(
-        "THUDM/chatglm-6b", load_in_8bit=True, device_map=device_map
+    model = AutoModel.from_pretrained(
+        "THUDM/chatglm-6b", load_in_8bit=True, trust_remote_code=True, device_map=device_map,revision="fdb7a60"
)
#model = prepare_model_for_int8_training(model)

# It's exactly the following.
model = prepare_model_for_int8_training(model)
#model.gradient_checkpointing_enable()
model.enable_input_require_grads()
#model.enable_input_require_grads()
model.is_parallelizable = True
model.model_parallel = True
-    model.lm_head = CastOutputToFloat(model.lm_head)
+    #model.lm_head = CastOutputToFloat(model.lm_head)
model.config.use_cache = (
False # silence the warnings. Please re-enable for inference!
)

+    #print(model)

# setup peft
peft_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
@@ -149,6 +107,7 @@

# load dataset
dataset = datasets.load_from_disk(finetune_args.dataset_path)
+    #print(f"\n{len(dataset)=}\n")

# start train
trainer = ModifiedTrainer(
@@ -158,7 +117,6 @@ def main():
data_collator=data_collator,
)
trainer.train()

# save model
#save_tunable_parameters(
# model, os.path.join(training_args.output_dir, "chatglm-lora.pt")
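
The model-setup hunk above also trades the hand-rolled int8 preparation steps (the `enable_input_require_grads()` call and the `CastOutputToFloat` wrapper on `lm_head`) for peft's `prepare_model_for_int8_training`, which the in-code comment describes as "exactly the following". A rough sketch of what that helper does internally, based on peft around 0.2.x; the exact behaviour varies between versions and this is not the real implementation:

```python
import torch

def prepare_for_int8_training_sketch(model, output_layer_name="lm_head"):
    """Approximation of peft.prepare_model_for_int8_training, not the real code."""
    # Freeze the base model and keep 1-D params (e.g. LayerNorm weights) in fp32.
    for _name, param in model.named_parameters():
        param.requires_grad = False
        if param.ndim == 1:
            param.data = param.data.to(torch.float32)

    # Make gradient checkpointing usable with frozen inputs.
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()

    # Cast the output head's activations to fp32, like the CastOutputToFloat wrapper above.
    class CastOutputToFloat(torch.nn.Sequential):
        def forward(self, x):
            return super().forward(x).to(torch.float32)

    setattr(model, output_layer_name, CastOutputToFloat(getattr(model, output_layer_name)))
    return model
```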