pretraining.py
import json
import comet_ml  # imported before transformers so Comet experiment auto-logging can attach
from datasets import Dataset, DatasetDict
from transformers import (
    Trainer, TrainingArguments, EarlyStoppingCallback,
    AutoTokenizer, AutoModelForMaskedLM, set_seed,
    RobertaForCausalLM, DataCollatorForLanguageModeling,
)

# load pretraining parameters from the config file
with open('config/pretraining_config.json') as f:
    params = json.load(f)
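
# For reference, a sketch of the structure config/pretraining_config.json is
# assumed to have, based on the keys read below (values are illustrative only):
#
# {
#     "model_checkpoint": "roberta-base",
#     "tokenizer_name": "roberta-base",
#     "pretraining_method": "mlm",
#     "train_data": "data/train.csv",
#     "dev_data": "data/dev.csv",
#     "max_length": 128,
#     "block_size": 128,
#     "output_dir": "output/",
#     "kg_name": "atomic",
#     "relation_category": ["social", "physical", "event"],
#     "create_dev": 1,
#     "test_size": 0.1,
#     "text_field": "text",
#     "batch_training": 0,
#     "num_train_epochs": 3,
#     "learning_rate": 5e-05,
#     "weight_decay": 0.01,
#     "eval_save_strategy": "steps",
#     "eval_steps": 500,
#     "save_steps": 500,
#     "logging_steps": 100,
#     "batch_size": 16,
#     "gradient_accumulation_steps": 1,
#     "early_stopping_patience": 3
# }
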
# special tokens used when converting KG-to-Text
with open('data/special_tokens.txt', 'r') as in_file:
    special_tokens = [line.strip() for line in in_file.readlines()]

model_name = params['model_checkpoint']
tokenizer_name = params['tokenizer_name']
pretraining_method = params['pretraining_method']
train_data = params['train_data']
dev_data = params['dev_data']
max_length = params['max_length']
block_size = params['block_size']
output_dir = params['output_dir']
kg_name = params['kg_name']
relation_category = params['relation_category']
create_dev = params['create_dev']
test_size = params['test_size']
text_field = params['text_field']
set_seed(42)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True)
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
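# if any of these special tokens are new to the tokenizer, the model's embedding
# matrix must also cover them; get_model() below resizes it accordingly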


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder; the last partial chunk could instead be padded
    # if desired. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size tokens.
    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


def preprocess_example(example):
    # remove newline characters from the text field
    example[text_field] = example[text_field].replace('\n', '')
    return example


def encode(examples):
    # tokenize the text field; truncate and pad to max_length so every example has the
    # same size, while DataCollatorForLanguageModeling handles the MLM masking itself
    return tokenizer(examples[text_field], max_length=max_length, truncation=True, padding='max_length')

dataset = DatasetDict()
dataset['train'] = Dataset.from_csv(train_data)

if create_dev == 1:
    train_eval = dataset["train"].train_test_split(test_size=test_size)
    dataset["train"] = train_eval['train']
    dataset["dev"] = train_eval['test']
else:
    dataset['dev'] = Dataset.from_csv(dev_data)

dataset = dataset.filter(lambda example: example[text_field] != '' and "[MASK]" not in example[text_field])
# relation_category only exists for ATOMIC
if kg_name == "atomic":
    dataset = dataset.filter(lambda example: example['relation_category'] in relation_category)

dataset = dataset.map(preprocess_example)
dataset = dataset.map(encode, batched=True)

if params['batch_training'] == 1:
    # concatenate the tokenized examples and re-split them into block_size chunks
    dataset = dataset.map(
        group_texts,
        batched=True,
        batch_size=params['block_size'],
    )
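
# DataCollatorForLanguageModeling with mlm=True dynamically masks 15% of the input
# tokens in each batch and builds the corresponding labels, so the masking pattern
# varies across epochs instead of being fixed at preprocessing time.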
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=params['num_train_epochs'],
    learning_rate=params['learning_rate'],
    weight_decay=params['weight_decay'],
    evaluation_strategy=params['eval_save_strategy'],
    save_strategy=params['eval_save_strategy'],
    eval_steps=params['eval_steps'],
    save_steps=params['save_steps'],
    logging_steps=params['logging_steps'],
    per_device_train_batch_size=params['batch_size'],
    per_device_eval_batch_size=params['batch_size'],
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    gradient_accumulation_steps=params['gradient_accumulation_steps'],
)


def get_model():
    if pretraining_method == 'mlm':
        model = AutoModelForMaskedLM.from_pretrained(model_name)
    elif pretraining_method == "clm":
        model = RobertaForCausalLM.from_pretrained(model_name)
    else:
        raise ValueError(f"unsupported pretraining_method: {pretraining_method}")
    # resize the embedding matrix in case the added special tokens grew the vocabulary
    model.resize_token_embeddings(len(tokenizer))
    return model


trainer = Trainer(
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["dev"],
    data_collator=data_collator,
    model_init=get_model,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=params['early_stopping_patience'])],
)

trainer.train()
trainer.save_model()
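
# A sketch of how the saved checkpoint could be reloaded later for the MLM setting
# (illustrative only; this script does not call tokenizer.save_pretrained, so the
# tokenizer is rebuilt from tokenizer_name rather than loaded from output_dir):
#
#     from transformers import AutoModelForMaskedLM, AutoTokenizer
#     model = AutoModelForMaskedLM.from_pretrained(output_dir)
#     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True)
#     tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})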