Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for Falcon LLM #72

Open
wants to merge 53 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
f21223a
bug fix
ydli-ai Nov 24, 2022
cf74ac6
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai Nov 25, 2022
efc6539
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai Dec 1, 2022
9fe4f32
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai Dec 14, 2022
6baa170
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai Jan 4, 2023
ec90e5e
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai Mar 7, 2023
a1047b7
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai Mar 16, 2023
3816b6b
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai Mar 30, 2023
b7c9175
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai Apr 9, 2023
fe6e8b0
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai Apr 14, 2023
8ac0c6f
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai Apr 15, 2023
6000aab
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai May 12, 2023
b483c05
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai May 15, 2023
53fd4fa
Merge branch 'main' of https://github.com/Tencent/TencentPretrain
ydli-ai Jun 7, 2023
12c6d36
update
ydli-ai Jun 8, 2023
75b5bec
update
ydli-ai Jun 8, 2023
db37d08
update
ydli-ai Jun 8, 2023
220b59d
update
ydli-ai Jun 8, 2023
ad41ad1
update
ydli-ai Jun 8, 2023
c23f13c
update
ydli-ai Jun 9, 2023
133a615
update
ydli-ai Jun 9, 2023
91ccd26
update
ydli-ai Jun 9, 2023
f5e1fe8
update
ydli-ai Jun 9, 2023
4f36d0c
update
ydli-ai Jun 9, 2023
f914ff4
update
ydli-ai Jun 9, 2023
51c83c2
update
ydli-ai Jun 9, 2023
0edd2bd
update
ydli-ai Jun 9, 2023
9893cb3
update
ydli-ai Jun 9, 2023
4d443a9
update
ydli-ai Jun 9, 2023
1c5dfdf
update
ydli-ai Jun 9, 2023
2f8fca7
update
ydli-ai Jun 9, 2023
6dcac76
update
ydli-ai Jun 9, 2023
cdd45c6
update
ydli-ai Jun 9, 2023
c564517
update
ydli-ai Jun 10, 2023
417b16a
update
ydli-ai Jun 10, 2023
ae8f5d8
update
ydli-ai Jun 10, 2023
6e9fde2
update
ydli-ai Jun 10, 2023
c9a1cb5
update
ydli-ai Jun 10, 2023
dbb2c36
update
ydli-ai Jun 10, 2023
73df196
update
ydli-ai Jun 10, 2023
0ee4d51
update
ydli-ai Jun 10, 2023
b6a3670
update
ydli-ai Jun 10, 2023
1984cb0
update
ydli-ai Jun 10, 2023
5beb0fb
update
ydli-ai Jun 10, 2023
86981ee
update
ydli-ai Jun 10, 2023
c8ad4e8
update
ydli-ai Jun 10, 2023
ae6b1b0
update
ydli-ai Jun 10, 2023
bb0f06a
update
ydli-ai Jun 10, 2023
c4e7360
update
ydli-ai Jun 10, 2023
97e3cf2
update
ydli-ai Jun 10, 2023
0e6430b
update
ydli-ai Jun 10, 2023
0af91a9
update
ydli-ai Jun 10, 2023
a7811fd
update
ydli-ai Jun 15, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions models/falcon/7b_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"emb_size": 4544,
"feedforward_size": 18176,
"hidden_size": 4544,
"hidden_act": "gelu_fast",
"heads_num": 71,
"layers_num": 32,
"dropout": 0.0,
"data_processor": "lm",
"max_seq_length": 2048,
"embedding": ["word"],
"remove_transformer_bias": true,
"remove_embedding_layernorm": true,
"attention": "flash_attention",
"encoder": "transformer",
"mask": "causal",
"layernorm_positioning": "parallel_attn",
"layernorm": "normal_torch",
"layernorm_eps": 1e-5,
"target": ["lm"]
}
7 changes: 7 additions & 0 deletions models/falcon_special_tokens_map.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"pad_token": "<|endoftext|>",
"unk_token": "<|endoftext|>",
"cls_token": "<|endoftext|>",
"sep_token": "<|endoftext|>",
"mask_token": "<mask>"
}
1 change: 1 addition & 0 deletions preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def main():
help="Probability of truncating sequence."
"The larger value, the higher probability of using short (truncated) sequence.")
parser.add_argument("--full_sentences", action="store_true", help="Full sentences.")
parser.add_argument("--remove_cls_token", action="store_true", help="Preprocess without CLS tokens.")
parser.add_argument("--seed", type=int, default=7, help="Random seed.")

# Masking options.
Expand Down
56 changes: 56 additions & 0 deletions scripts/convert_falcon_from_huggingface_to_tencentpretrain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import argparse
import collections
import torch
import os
import json


# Convert a sharded Hugging Face Falcon checkpoint into a single TencentPretrain
# state dict. The script reads `pytorch_model.bin.index.json` from the input
# directory, renames every weight, and writes one file with `torch.save`.

parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--input_model_path", type=str, default="models/falcon-7b/",
                    help=".")
parser.add_argument("--output_model_path", type=str, default="models/falcon-7b.bin",
                    help=".")
parser.add_argument("--layers_num", type=int, default=32,
                    help=".")

args = parser.parse_args()

# Read the shard index first: it maps every weight name to the shard file that
# stores it, so it tells us exactly which files have to be loaded.
with open(os.path.join(args.input_model_path, "pytorch_model.bin.index.json")) as f:
    model_index = json.load(f)
    weight_map = model_index["weight_map"]

# Load only the shards referenced by the index. Globbing every "*.bin" in the
# directory would also deserialize unrelated files that happen to live there.
# NOTE: torch.load unpickles the files; only convert checkpoints obtained from
# a trusted source.
model_files = sorted(set(weight_map.values()))
input_models = {
    shard_name: torch.load(os.path.join(args.input_model_path, shard_name), map_location="cpu")
    for shard_name in model_files
}

output_model = collections.OrderedDict()


def get_weight_from_name(layer_name):
    """Return the tensor stored under `layer_name` in the shard the index assigns it to."""
    return input_models[weight_map[layer_name]][layer_name]


# (TencentPretrain suffix, Hugging Face suffix) for the weights of one transformer layer.
# The order is the insertion order of the output state dict.
layer_weight_names = [
    ("layer_norm_1.weight", "input_layernorm.weight"),
    ("layer_norm_1.bias", "input_layernorm.bias"),
    ("self_attn.query_key_value.weight", "self_attention.query_key_value.weight"),
    ("self_attn.dense.weight", "self_attention.dense.weight"),
    ("feed_forward.linear_1.weight", "mlp.dense_h_to_4h.weight"),
    ("feed_forward.linear_2.weight", "mlp.dense_4h_to_h.weight"),
]

output_model["embedding.word.embedding.weight"] = get_weight_from_name("transformer.word_embeddings.weight")

for i in range(args.layers_num):
    target_prefix = "encoder.transformer." + str(i) + "."
    source_prefix = "transformer.h." + str(i) + "."
    for target_suffix, source_suffix in layer_weight_names:
        output_model[target_prefix + target_suffix] = get_weight_from_name(source_prefix + source_suffix)

output_model["encoder.layer_norm.weight"] = get_weight_from_name("transformer.ln_f.weight")
output_model["encoder.layer_norm.bias"] = get_weight_from_name("transformer.ln_f.bias")
output_model["target.lm.output_layer.weight"] = get_weight_from_name("lm_head.weight")

torch.save(output_model, args.output_model_path)
4 changes: 4 additions & 0 deletions scripts/generate_lm.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,13 @@ def top_k_top_p_filtering(logits, top_k, top_p):

args = load_hyperparam(args)

args.tokenizer_type = args.tokenizer
args.tokenizer = str2tokenizer[args.tokenizer](args)

model = GenerateLm(args)
model = load_model(model, args.load_model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

model.eval()
Expand All @@ -110,6 +112,8 @@ def top_k_top_p_filtering(logits, top_k, top_p):

f.write(line + "\n")
tokens = [token_id.item() for token_id in src_tensor[0]]
if args.tokenizer_type in ["hfpretrained"]:
generated_sentence = args.tokenizer.decode(tokens)
if args.tokenizer.sp_model is not None:
generated_sentence = args.tokenizer.sp_model.decode(tokens)
else:
Expand Down
13 changes: 4 additions & 9 deletions tencentpretrain/encoders/transformer_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import torch.nn as nn
from tencentpretrain.utils.rope import precompute_freqs_cis
from tencentpretrain.layers.transformer import TransformerLayer
from tencentpretrain.layers.layer_norm import *
from tencentpretrain.layers import *
from tencentpretrain.layers.relative_position_embedding import RelativePositionEmbedding

class TransformerEncoder(nn.Module):
Expand Down Expand Up @@ -36,13 +36,8 @@ def __init__(self, args):
self.transformer = nn.ModuleList(
[TransformerLayer(args) for _ in range(self.layers_num)]
)
if self.layernorm_positioning == "pre":
if args.layernorm == "t5":
self.layer_norm = T5LayerNorm(args.hidden_size)
elif args.layernorm == "rms":
self.layer_norm = RMSNorm(args.hidden_size)
else:
self.layer_norm = LayerNorm(args.hidden_size)

self.layer_norm = str2layernorm[args.layernorm](args.hidden_size, eps=args.layernorm_eps)

if self.relative_position_embedding:
self.relative_pos_emb = RelativePositionEmbedding(bidirectional=True, heads_num=args.heads_num,
Expand Down Expand Up @@ -143,7 +138,7 @@ def custom_forward(*inputs):
has_residual_attention=self.has_residual_attention,
prev_attn=prev_attn, freqs_cis=freqs_cis)

if self.layernorm_positioning == "pre":
if self.layernorm_positioning in ["pre", "parallel_attn"]:
return self.layer_norm(hidden)
else:
return hidden
15 changes: 15 additions & 0 deletions tencentpretrain/layers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from tencentpretrain.layers.layer_norm import *
from tencentpretrain.layers.multi_headed_attn import *
from tencentpretrain.layers.position_ffn import *
import torch.nn as nn

# Lookup tables from a configuration string to the class that implements it,
# e.g. the "layernorm" / "attention" values found in a model config file.

# Layer normalization variants.
str2layernorm = {
    "t5": T5LayerNorm,
    "rms": RMSNorm,
    "normal_torch": nn.LayerNorm,
    "normal": LayerNorm,
}

# Self-attention variants.
str2attention = {
    "multi_head": MultiHeadedAttention,
    "flash_attention": FlashAttention,
}

# Feed-forward variants.
str2feedforward = {
    "gated": GatedFeedForward,
    "dense": PositionwiseFeedForward,
}

# Names re-exported by `from tencentpretrain.layers import *`.
__all__ = [
    "T5LayerNorm",
    "RMSNorm",
    "LayerNorm",
    "MultiHeadedAttention",
    "FlashAttention",
    "GatedFeedForward",
    "PositionwiseFeedForward",
    "str2layernorm",
    "str2attention",
    "str2feedforward",
]

140 changes: 140 additions & 0 deletions tencentpretrain/layers/multi_headed_attn.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
from tencentpretrain.utils.rope import apply_rotary_emb
from tencentpretrain.utils.lora import LoraLinear
from tencentpretrain.utils.rope import RotaryEmbedding

class MultiHeadedAttention(nn.Module):
"""
Expand Down Expand Up @@ -88,3 +90,141 @@ def unshape(x):
output = unshape(torch.matmul(probs, value))
output = self.final_linear(output)
return output, prev_attn_out


class FlashAttention(nn.Module):
    """
    Attention layer used in Falcon.
    https://huggingface.co/tiiuae/falcon-7b/blob/main/modelling_RW.py#L154

    Queries, keys and values come from one fused linear projection. The layer is
    always built in multi-query mode (`self.multi_query = True`): every query head
    has its own projection while a single key head and a single value head are
    shared by all query heads. Rotary position embedding is applied to queries and
    keys before the attention product.
    """

    def __init__(self, hidden_size, heads_num, attention_head_size, dropout, has_bias=True, with_scale=True,
                 lora_params=None):
        """
        Args:
            hidden_size: size of the input and output features.
            heads_num: number of query heads.
            attention_head_size: size of one attention head.
            dropout: probability given to `self.attention_dropout`.
            has_bias: whether the fused projection and the output projection have a bias.
            with_scale: whether scores are divided by sqrt(attention_head_size) in the
                fallback (non-fused) attention path.
            lora_params: accepted for signature compatibility; not used by this layer.
        """
        super(FlashAttention, self).__init__()
        self.heads_num = heads_num
        self.hidden_size = hidden_size
        self.per_head_size = attention_head_size
        self.with_scale = with_scale
        self.inner_hidden_size = heads_num * attention_head_size
        self.multi_query = True

        self.rotary = RotaryEmbedding(self.per_head_size)
        # Layer-wise attention scaling
        self.inv_norm_factor = 1.0 / math.sqrt(self.per_head_size)
        self.beta = self.inv_norm_factor

        # Multi-query: hidden_size features for the queries plus one key head and one value head.
        self.query_key_value = nn.Linear(
            self.hidden_size,
            3 * self.hidden_size if not self.multi_query else (self.hidden_size + 2 * self.per_head_size),
            bias=has_bias
        )

        self.dense = nn.Linear(self.hidden_size, self.hidden_size, bias=has_bias)
        # NOTE(review): this module is created but never applied in forward() - confirm that is intended.
        self.attention_dropout = nn.Dropout(dropout)
        self.num_kv = self.heads_num if not self.multi_query else 1

    def _split_heads(self, fused_qkv: torch.Tensor):
        """
        Split the last dimension of the fused projection into heads. Only views and
        indexing are used on `fused_qkv`; no projection is recomputed.

        Args:
            fused_qkv (`torch.tensor`, *required*):
                [batch_size, seq_length, num_heads * 3 * head_dim] when `multi_query` is False,
                [batch_size, seq_length, (num_heads + 2) * head_dim] when `multi_query` is True.
        Returns:
            query: [batch_size, seq_length, num_heads, head_dim]
            key:   [batch_size, seq_length, num_heads, head_dim] (a single head when `multi_query` is True)
            value: [batch_size, seq_length, num_heads, head_dim] (a single head when `multi_query` is True)
        """
        batch_size, seq_length, _ = fused_qkv.shape
        if not self.multi_query:
            fused_qkv = fused_qkv.view(batch_size, seq_length, self.heads_num, 3, self.per_head_size)
            return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :]

        # The last two "heads" of the fused projection are the shared key and value.
        # Indexing with a list keeps the head dimension (size 1).
        fused_qkv = fused_qkv.view(batch_size, seq_length, self.heads_num + 2, self.per_head_size)
        return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[..., [-1], :]

    def _merge_heads(self, x: torch.Tensor):
        """
        Merge heads together over the last dimension.

        Args:
            x: (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
        Returns:
            torch.tensor: [batch_size, seq_length, num_heads * head_dim]
        """
        # What we want to achieve is:
        # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim
        batch_size_and_num_heads, seq_length, _ = x.shape
        batch_size = batch_size_and_num_heads // self.heads_num

        # First view to decompose the batch size
        # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim
        x = x.view(batch_size, self.heads_num, seq_length, self.per_head_size)

        # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim
        x = x.permute(0, 2, 1, 3)

        # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim
        return x.reshape(batch_size, seq_length, self.heads_num * self.per_head_size)

    def forward(self, key, value, query, mask, position_bias=None, has_residual_attention=False, prev_attn=None,
                freqs_cis=None):
        """
        Args:
            key: [batch_size x seq_length x hidden_size] (not read: q, k and v are all projected from `query`)
            value: [batch_size x seq_length x hidden_size] (not read, see `key`)
            query: [batch_size x seq_length x hidden_size]
            mask: [batch_size x 1 x seq_length x seq_length], added to the scores in the fallback path only
            position_bias: [1 x heads_num x seq_length x seq_length] (not read by this layer)
            has_residual_attention: if True, the fallback path adds `prev_attn` to the scores and
                returns the resulting scores as the second output
            prev_attn: scores of the previous layer, used only together with `has_residual_attention`
            freqs_cis: not read by this layer; rotary embedding comes from `self.rotary`
        Returns:
            output: [batch_size x seq_length x hidden_size]
            prev_attn_out: pre-softmax scores when `has_residual_attention` is set in the
                fallback path, otherwise None
        """

        fused_qkv = self.query_key_value(query)

        # 3 x [batch_size, seq_length, num_heads, head_dim]
        (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
        batch_size, q_length, _, _ = query_layer.shape

        # Fold the head dimension into the batch dimension before calling the rotary embedding.
        query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.heads_num, q_length, self.per_head_size)
        key_layer = key_layer.transpose(1, 2).reshape(batch_size * self.num_kv, q_length, self.per_head_size)
        value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_kv, q_length, self.per_head_size)

        query_layer, key_layer = self.rotary(query_layer, key_layer)

        # Back to [batch_size, heads, seq_length, head_dim]; keys/values keep num_kv heads.
        query_layer_ = query_layer.reshape(batch_size, self.heads_num, -1, self.per_head_size)
        key_layer_ = key_layer.reshape(batch_size, self.num_kv, -1, self.per_head_size)
        value_layer_ = value_layer.reshape(batch_size, self.num_kv, -1, self.per_head_size)

        prev_attn_out = None
        # Detect the fused kernel by feature instead of comparing version strings:
        # pre-release / source builds (e.g. "2.0.0a0+...") can ship the kernel while
        # still ordering below "2.0.0".
        if not hasattr(F, "scaled_dot_product_attention"):
            # Fallback: explicit attention. The shared key/value head is broadcast
            # over the query heads by matmul.
            scores = torch.matmul(query_layer_, key_layer_.transpose(-2, -1))
            if self.with_scale:
                scores = scores / math.sqrt(float(self.per_head_size))
            scores = scores + mask.type_as(scores)
            if has_residual_attention:
                if prev_attn is not None:
                    scores += prev_attn
                prev_attn_out = scores
            probs = nn.Softmax(dim=-1)(scores)
            attn_output = probs @ value_layer_
        else:
            # NOTE(review): this path always applies a causal mask and the kernel's
            # default scaling; `mask`, `with_scale`, `has_residual_attention` and
            # `prev_attn` are not consulted here.
            attn_output = F.scaled_dot_product_attention(
                query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=True
            )

        # [batch_size, heads, seq_length, head_dim] -> [batch_size, seq_length, heads * head_dim]
        x = attn_output.view(batch_size, self.heads_num, q_length, self.per_head_size)
        x = x.permute(0, 2, 1, 3)
        attn_output = x.reshape(batch_size, q_length, self.heads_num * self.per_head_size)

        output_tensor = self.dense(attn_output)

        return output_tensor, prev_attn_out
Loading