-
Notifications
You must be signed in to change notification settings - Fork 83
/
point-post-training-wwm-sop.py
345 lines (273 loc) · 10.6 KB
/
point-post-training-wwm-sop.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
# -*- coding: utf-8 -*-
# @Date : 2020/12/1
# @Author : mingming.xu
# @Email : [email protected]
# @File : post-training-wwm-sop.py
"""
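Each training sample has the form [query, reply1, reply2, ...]. NSP is replaced by SOP (sentence-order prediction), and for SOP only the order of the reply list is shuffled.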
"""
import os
os.environ['TF_KERAS'] = '1' # 必须使用tf.keras
import numpy as np
from tqdm import tqdm
import jieba
import itertools
from toolkit4nlp.utils import DataGenerator, pad_sequences
from toolkit4nlp.models import *
from toolkit4nlp.tokenizers import *
from toolkit4nlp.backend import *
from toolkit4nlp.layers import *
from toolkit4nlp.optimizers import *
# Configuration: dataset location, pretrained NEZHA files, output paths and hyper-parameters
path = '/home/mingming.xu/datasets/NLP/ccf_qa_match/'
# NOTE(review): the module-level `p` does not appear to be read anywhere else in this file — confirm before removing
p = os.path.join(path, 'train', 'train.query.tsv')
config_path = '/home/mingming.xu/pretrain/NLP/nezha_base_wwm/bert_config.json'
checkpoint_path = '/home/mingming.xu/pretrain/NLP/nezha_base_wwm/model.ckpt'
dict_path = '/home/mingming.xu/pretrain/NLP/nezha_base_wwm/vocab.txt'
model_saved_path = './nezha_post_training/wwm-model-add-dict-no-mask-end-sop.ckpt'
new_dict_path = './data/new_dict.txt'
maxlen = 256
batch_size = 16
epochs = 100
learning_rate = 5e-5
# Build the tokenizer from the pretrained vocabulary file
tokenizer = Tokenizer(dict_path)
# A query and its replies together form one training sample.
def load_data(train_test='train', data_dir=None):
    """Load one data split as a list of ``[query, reply1, reply2, ...]`` items.

    Reads ``<data_dir>/<train_test>/<train_test>.query.tsv`` (tab-separated:
    query id, query text) and ``<train_test>.reply.tsv`` (tab-separated:
    query id, reply id, reply text and, when the line has four columns, a
    label). Reply ids and labels are ignored.

    Args:
        train_test: Split name; used as both the sub-directory name and the
            file-name prefix (e.g. ``'train'`` or ``'test'``).
        data_dir: Dataset root directory. ``None`` (the default) falls back
            to the module-level ``path``.

    Returns:
        One list per query, in the order the query ids first appear in the
        query file: the query text followed by its replies in file order.

    Raises:
        KeyError: A reply line references a query id missing from the query file.
        ValueError: A reply line has neither three nor four columns.
        IndexError: A non-blank query line has fewer than two columns.
    """
    if data_dir is None:
        data_dir = path
    split_dir = os.path.join(data_dir, train_test)

    # query id -> [query, reply, reply, ...]; dict order keeps the file order.
    samples = {}

    # The files contain Chinese text, so decode explicitly as UTF-8 instead of
    # relying on the platform's default encoding.
    query_file = os.path.join(split_dir, train_test + '.query.tsv')
    with open(query_file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.split('\t')
            samples[fields[0]] = [fields[1]]

    reply_file = os.path.join(split_dir, train_test + '.reply.tsv')
    with open(reply_file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            fields = line.split('\t')
            if len(fields) == 4:
                # Four columns: the last one is the label.
                q_id, _r_id, reply, _label = fields
            else:
                # Otherwise exactly three columns are expected (no label).
                q_id, _r_id, reply = fields
            samples[q_id].append(reply)

    return list(samples.values())
# Both splits are concatenated into one corpus.
train_data = load_data('train')
test_data = load_data('test')
data = train_data + test_data

# Whole-word masking (wwm): register the custom words with jieba, then
# segment every sentence of every sample into words.
jieba.initialize()
new_words = []
# Decode explicitly as UTF-8: the dictionary contains Chinese words and must
# not depend on the platform's default encoding.
with open(new_dict_path, encoding='utf-8') as f:
    for l in f:
        w = l.strip()
        if not w:
            # Skip blank lines rather than registering an empty "word".
            continue
        new_words.append(w)
        jieba.add_word(w)

# words_data[i][j] is the word list of sentence j (0 = query) of sample i.
words_data = [[jieba.lcut(line) for line in sen] for sen in data]
def shuffle_reply(item):
    """Return a copy of ``item`` with only the replies reordered.

    ``item`` is ``[query, reply1, reply2, ...]``. The query stays first and
    the replies are put into a random order that differs from the input
    order. Orders are drawn uniformly and the unchanged order is rejected,
    so the full list of n! permutations is never built.

    Args:
        item: Sequence whose first element is the query and whose remaining
            elements are the replies. It is not modified.

    Returns:
        A new list ``[query] + reordered_replies``. When no different order
        exists (fewer than two replies, or all replies equal) the replies
        are returned in their original order; callers that derive a label
        from the shuffle should compare the result with the input.

    Raises:
        IndexError: ``item`` is empty.
    """
    query, replies = item[0], list(item[1:])

    # No distinct ordering exists: zero or one reply, or all replies equal.
    if all(reply == replies[0] for reply in replies[1:]):
        return [query] + replies

    # Rejection sampling: the identity order has probability <= 1/2, so this
    # terminates quickly.
    while True:
        order = np.random.permutation(len(replies))
        shuffled = [replies[i] for i in order]
        if shuffled != replies:
            return [query] + shuffled
def can_mask(token_ids):
    """Return False for the tokenizer's start, mask and end token ids, True otherwise."""
    special_ids = (
        tokenizer._token_start_id,
        tokenizer._token_mask_id,
        tokenizer._token_end_id,
    )
    return token_ids not in special_ids
def random_masking(lines):
    """Randomly mask the input word by word (whole-word masking).

    Args:
        lines: Sequence of sentences, each a sequence of words. The first
            sentence gets segment id 0, every later one segment id 1.

    Returns:
        A ``(sources, targets, segments)`` tuple of equally long id lists.
        ``sources`` starts with the tokenizer's start id and each sentence
        is followed by the end id. ``targets`` holds the original token ids
        at selected positions and 0 everywhere else (including the start
        and end positions).

    Each word is selected with probability 0.15. All tokens of a selected
    word are treated alike: 80% replaced by the mask id, 10% left unchanged,
    10% replaced by random ids drawn from ``[5, vocab_size)``.
    """
    start_id = tokenizer._token_start_id
    mask_id = tokenizer._token_mask_id
    end_id = tokenizer._token_end_id

    sources = [start_id]
    targets = [0]
    segments = [0]

    for sent_idx, sent in enumerate(lines):
        sent_source = []
        sent_target = []

        # One uniform draw per word decides the fate of the whole word.
        draws = np.random.random(len(sent))
        for draw, word in zip(draws, sent):
            # Drop the start/end ids that encode() wraps around the word.
            word_ids = tokenizer.encode(word)[0][1:-1]
            n_ids = len(word_ids)

            if draw >= 0.15:
                # Not selected: keep the tokens, nothing to predict.
                sent_source.extend(word_ids)
                sent_target.extend([0] * n_ids)
                continue

            # Selected: the original ids are always the prediction target.
            if draw < 0.15 * 0.8:
                sent_source.extend([mask_id] * n_ids)
            elif draw < 0.15 * 0.9:
                sent_source.extend(word_ids)
            else:
                sent_source.extend(
                    [np.random.choice(tokenizer._vocab_size - 5) + 5 for _ in range(n_ids)]
                )
            sent_target.extend(word_ids)

        # Close the sentence with the end token; it is never a target.
        sent_source.append(end_id)
        sent_target.append(0)

        segment_id = 0 if sent_idx == 0 else 1

        sources.extend(sent_source)
        targets.extend(sent_target)
        segments.extend([segment_id] * len(sent_source))

    return sources, targets, segments
class data_generator(DataGenerator):
    """Batch generator for joint MLM + SOP post-training."""

    def __iter__(self, shuffle=False):
        """Yield ``([token_ids, segment_ids, target_ids, is_masked, sop_label], None)``.

        With probability 0.5 the replies of a sample are reordered and the
        sample gets SOP label 0; otherwise the original order is kept with
        label 1. A sample only gets label 0 when its order really changed:
        samples with fewer than two replies are never sent to
        ``shuffle_reply``, and a result equal to the input keeps label 1.

        Args:
            shuffle: Forwarded to ``self.get_sample``.
        """
        batch_token_ids, batch_segment_ids, batch_target_ids = [], [], []
        batch_is_masked, batch_nsp = [], []

        for is_end, item in self.get_sample(shuffle):
            label = 1
            # item is [query, reply1, ...]; reordering needs at least two replies.
            if np.random.random() < 0.5 and len(item) > 2:
                shuffled = shuffle_reply(item)
                if shuffled != item:
                    label = 0
                    item = shuffled

            source_tokens, target_tokens, segment_ids = random_masking(item)

            # Positions with a non-zero target are the ones the MLM loss covers.
            is_masked = [0 if i == 0 else 1 for i in target_tokens]

            batch_token_ids.append(source_tokens)
            batch_segment_ids.append(segment_ids)
            batch_target_ids.append(target_tokens)
            batch_is_masked.append(is_masked)
            batch_nsp.append([label])

            if is_end or len(batch_token_ids) == self.batch_size:
                batch_token_ids = pad_sequences(batch_token_ids, maxlen=maxlen)
                batch_segment_ids = pad_sequences(batch_segment_ids, maxlen=maxlen)
                batch_target_ids = pad_sequences(batch_target_ids, maxlen=maxlen)
                batch_is_masked = pad_sequences(batch_is_masked, maxlen=maxlen)
                batch_nsp = pad_sequences(batch_nsp)

                # The losses are computed inside the model, so no separate
                # target array is yielded.
                yield [
                    batch_token_ids,
                    batch_segment_ids,
                    batch_target_ids,
                    batch_is_masked,
                    batch_nsp,
                ], None

                batch_token_ids, batch_segment_ids, batch_target_ids = [], [], []
                batch_is_masked, batch_nsp = [], []
# Batch generator over the word-segmented samples, using the configured batch size
train_generator = data_generator(words_data, batch_size)
def build_transformer_model_with_mlm():
    """Build the NEZHA backbone and a training model with MLM and SOP heads.

    The backbone is created with a linear (logit) MLM head and an NSP head;
    the NSP head is trained on the sentence-order labels. Losses and
    accuracies are computed inside the graph by Lambda layers, so the
    training model's outputs are already scalars.

    Returns:
        A ``(bert, train_model, loss)`` tuple: the backbone wrapper, the
        Keras training model whose outputs are ``mlm_loss``, ``mlm_acc``,
        ``nsp_loss`` and ``nsp_acc``, and the dict mapping those output
        names to the loss callables to pass to ``compile``.
    """
    bert = build_transformer_model(
        config_path,
        with_mlm='linear',
        with_nsp=True,
        model='nezha',
        return_keras_model=False,
    )
    # Pair of outputs: (NSP prediction, MLM prediction).
    proba = bert.model.output

    # Auxiliary inputs
    token_ids = Input(shape=(None,), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None,), dtype=K.floatx(), name='is_masked')  # mask flags
    nsp_label = Input(shape=(None,), dtype='int64', name='nsp')  # sentence-order label

    def _masked_mean(values, mask):
        """Average ``values`` over the positions where ``mask`` is set."""
        return K.sum(values * mask) / (K.sum(mask) + K.epsilon())

    def _mlm_loss_fn(inputs):
        """MLM cross-entropy over masked positions; wrapped in a layer below."""
        y_true, (_, y_pred), mask = inputs
        per_token = K.sparse_categorical_crossentropy(
            y_true, y_pred, from_logits=True
        )
        return _masked_mean(per_token, mask)

    def _nsp_loss_fn(inputs):
        """Mean cross-entropy of the sentence-order head; wrapped in a layer below."""
        y_true, (y_pred, _) = inputs
        per_sample = K.sparse_categorical_crossentropy(
            y_true, y_pred
        )
        return K.mean(per_sample)

    def _mlm_acc_fn(inputs):
        """MLM accuracy over masked positions; wrapped in a layer below."""
        y_true, (_, y_pred), mask = inputs
        y_true = K.cast(y_true, K.floatx())
        hits = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        return _masked_mean(hits, mask)

    def _nsp_acc_fn(inputs):
        """Sentence-order accuracy; wrapped in a layer below."""
        y_true, (y_pred, _) = inputs
        y_true = K.cast(y_true, K.floatx())
        hits = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        return K.mean(hits)

    mlm_loss_out = Lambda(_mlm_loss_fn, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc_out = Lambda(_mlm_acc_fn, name='mlm_acc')([token_ids, proba, is_masked])
    nsp_loss_out = Lambda(_nsp_loss_fn, name='nsp_loss')([nsp_label, proba])
    nsp_acc_out = Lambda(_nsp_acc_fn, name='nsp_acc')([nsp_label, proba])

    train_model = Model(
        bert.model.inputs + [token_ids, is_masked, nsp_label],
        [mlm_loss_out, mlm_acc_out, nsp_loss_out, nsp_acc_out],
    )

    def _as_loss(y_true, y_pred):
        # The output already is the loss value.
        return y_pred

    def _as_metric(y_true, y_pred):
        # Report the value but keep it out of the gradient.
        return K.stop_gradient(y_pred)

    loss = {
        'mlm_loss': _as_loss,
        'mlm_acc': _as_metric,
        'nsp_loss': _as_loss,
        'nsp_acc': _as_metric,
    }

    return bert, train_model, loss
# Build the backbone, the training model and the per-output loss mapping.
bert, train_model, loss = build_transformer_model_with_mlm()

# Adam extended with weight decay, gradient accumulation and a
# piecewise-linear learning-rate schedule.
Opt = extend_with_piecewise_linear_lr(
    extend_with_gradient_accumulation(
        extend_with_weight_decay(Adam)
    )
)

# Schedule breakpoints: 1.0 after the first 10% of all steps, 0 at the last
# step (presumably linear warm-up followed by linear decay — confirm against
# the toolkit4nlp optimizer).
total_steps = len(train_generator) * epochs
opt = Opt(
    learning_rate=learning_rate,
    weight_decay_rate=0.01,
    exclude_from_weight_decay=['Norm', 'bias'],
    grad_accum_steps=2,
    lr_schedule={int(total_steps * 0.1): 1.0, total_steps: 0},
)

train_model.compile(optimizer=opt, loss=loss)

# Load the pretrained weights when a checkpoint is configured.
# Note: they must be loaded at this point to avoid errors.
if checkpoint_path is not None:
    bert.load_weights_from_checkpoint(checkpoint_path)

train_model.summary()
class ModelCheckpoint(keras.callbacks.Callback):
    """Save the backbone every 10 epochs and print an MLM sanity check each epoch.

    Also tracks the lowest training loss seen so far in ``self.loss``.
    """

    def __init__(self):
        # Initialise the base Callback so its own attributes are set up.
        super(ModelCheckpoint, self).__init__()
        self.loss = 1e6

    def on_epoch_end(self, epoch, logs=None):
        """Update the best loss, checkpoint every 10th epoch, print a prediction.

        Args:
            epoch: Zero-based epoch index.
            logs: Metrics dict for the epoch; may be ``None`` or lack
                ``'loss'``, in which case the best-loss update is skipped.
        """
        logs = logs or {}
        epoch_loss = logs.get('loss')
        if epoch_loss is not None and epoch_loss < self.loss:
            self.loss = epoch_loss

        # Checkpoint name carries the 1-based epoch number.
        if (epoch + 1) % 10 == 0:
            bert.save_weights_as_checkpoint(model_saved_path + '-{}'.format(epoch + 1))

        # Sanity check: mask positions 9 and 10 of a fixed sentence pair and
        # print what the MLM head predicts for them.
        token_ids, segment_ids = tokenizer.encode(u'看哪个?', '微信您通过一下吧')
        token_ids[9] = token_ids[10] = tokenizer._token_mask_id

        # Output index 1 is the MLM head (index 0 is the NSP/SOP head).
        probs = bert.model.predict([np.array([token_ids]), np.array([segment_ids])])[1]
        print(tokenizer.decode(probs[0, 9:11].argmax(axis=1)))
if __name__ == '__main__':
    # Callbacks: periodic model saving plus a CSV training log.
    callbacks = [
        ModelCheckpoint(),
        keras.callbacks.CSVLogger('training.log'),
    ]

    train_model.fit(
        train_generator.generator(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=callbacks,
    )