-
Notifications
You must be signed in to change notification settings - Fork 4
/
stanzas-evaluation.py
executable file
·127 lines (107 loc) · 4.09 KB
/
stanzas-evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python
# coding: utf-8
# conda install pytorch>=1.6 cudatoolkit=10.2 -c pytorch
# wandb login XXX
import json
import logging
import os
import re
import sklearn
import sys
import time
from itertools import product
import numpy as np
import pandas as pd
import wandb
#from IPython import get_ipython
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Environment-driven experiment configuration.
truthy_values = ("true", "1", "y", "yes")
TAG = os.environ.get("TAG", "bertsification")
# Format is "<model_type>;<model_name>", split by train_model().
MODELNAME = os.environ.get("MODELNAME", "bert;bert-base-multilingual-cased")
OVERWRITE = os.environ.get("OVERWRITE", "False").lower() in truthy_values

# Log to a timestamped file under models/ (the directory must already exist).
logging.basicConfig(level=logging.INFO, filename=time.strftime("models/{}-%Y-%m-%dT%H%M%S.log".format(TAG)))

# Record our PID so the long-running job can be signalled/killed externally.
with open('pid', 'w') as pid:
    pid.write(str(os.getpid()))

# Fixed log message: the second placeholder receives MODELNAME, but the
# original text mislabelled it as "eval_df".
logging.info("Experiment '{}' (modelname = {}, pid = {})".format(
    TAG, MODELNAME, str(os.getpid()),
))
# Utils
def clean_text(string):
    """Normalize a line of verse for the model.

    Strips curly/angle quote marks, "//" separators, commas, semicolons
    and periods, then collapses every whitespace run into a single space.

    Args:
        string: raw input text.

    Returns:
        The cleaned, stripped string.
    """
    output = string.strip()
    # Sequences removed outright before feeding text to the model.
    replacements = (
        ("“", ''), ("”", ''), ("//", ""), ("«", ''), ("»", ''), (",", ''),
        (";", ''), (".", ''),
    )
    for old, new in replacements:
        output = output.replace(old, new)
    # Any run of whitespace (incl. newlines/tabs) becomes one space.
    # Fixed: the original pattern was r'(?is)\s+'; the IGNORECASE and
    # DOTALL flags have no effect on \s+, so they are dropped.
    output = re.sub(r'\s+', ' ', output)
    return output.strip()
def clean_labels(label):
    """Map missing labels to the sentinel string "unknown".

    Any value whose string form is "None" (including the real None) is
    replaced; every other label is returned unchanged.
    """
    if str(label) == "None":
        return "unknown"
    return label
def prepare_data():
    """Load and split the stanzas dataset.

    NOTE(review): this is an elided/abridged listing — every `...` below is
    an Ellipsis placeholder where the real arguments were stripped, so the
    function fails at runtime as-is. Presumably it reads the stanzas CSV,
    renames columns, cleans text/stanza fields, label-encodes the stanza
    type and makes a stratified 75/25 split — confirm against the full file.
    """
    df = (pd
        .read_csv(...)    # source path/options elided
        .rename(...)      # column-name mapping elided
        .assign(
            text=lambda x: x["text"].apply(...),      # presumably clean_text — TODO confirm
            stanza=lambda x: x["stanza"].apply(...),  # presumably clean_labels — TODO confirm
        )
    )
    label_encoder = LabelEncoder()
    label_encoder.fit(...)   # fit target elided (likely the stanza column)
    df["labels"] = ...       # integer-encoded labels, elided
    # Stratified split preserves class proportions; fixed seed for reproducibility.
    train_df, eval_df = train_test_split(
        df, stratify=..., test_size=0.25, random_state=42
    )
    return train_df, eval_df, label_encoder
def train_model(train_df, num_labels):
    """Train (or skip) a transformer classifier for this experiment.

    NOTE(review): elided listing — the `...` placeholders below stand in
    for the real ClassificationModel constructor and train_model arguments,
    so this fails at runtime as-is; confirm against the full source.

    Exits the whole process (sys.exit) when the output directory already
    exists and OVERWRITE is false.
    """
    # MODELNAME is "<model_type>;<model_name>", e.g. "bert;bert-base-multilingual-cased".
    model_type, model_name = MODELNAME.split(";")
    model_output = 'models/{}-{}-{}'.format(TAG, model_type, model_name.replace("/", "-"))
    # Idempotency guard: an existing output dir means this model was trained.
    if OVERWRITE is False and os.path.exists(model_output):
        logging.info("Skipping training of {}".format(model_name))
        sys.exit(0)
    logging.info("Starting training of {}".format(model_name))
    # One Weights & Biases run per model; reinit allows repeated init in one process.
    run = wandb.init(project=model_output.split("/")[-1], reinit=True)
    model = ...(                              # presumably ClassificationModel — elided
        ..., ..., num_labels=..., args={
            'output_dir': model_output,
            'best_model_dir': '{}/best'.format(model_output),
            'evaluate_during_training': False,
            'manual_seed': 42,
            # 'learning_rate': 2e-5, # For BERT, 5e-5, 3e-5, 2e-5
            # For BERT 16, 32. It could be 128, but with gradient_acc_steps set to 2 is equivalent
            # "large" checkpoints need smaller batches to fit in GPU memory.
            'train_batch_size': 8 if "large" in model_name else 32,
            'eval_batch_size': 8 if "large" in model_name else 32,
            # Doubles train_batch_size, but gradients and weights are calculated once every 2 steps
            'gradient_accumulation_steps': 2 if "large" in model_name else 1,
            'max_seq_length': 64,
            'wandb_project': model_output.split("/")[-1],
            # "adam_epsilon": 3e-5, # 1e-8
            "silent": False,
            "fp16": False,
            "n_gpu": 1,
        })
    # train the model (training-frame arguments elided)
    model.train_model(...)
    return model, run
def eval_model(model, eval_df, run):
    """Evaluate a trained model on the held-out split and log metrics.

    NOTE(review): elided listing — the `...` placeholders hide the real
    eval_model/predict arguments (presumably eval_df and its text column);
    fails at runtime as-is.
    """
    result, *_ = model.eval_model(...)
    logging.info("Results: {}".format(str(result)))
    # Mutates eval_df in place by adding the "predicted" column.
    eval_df["predicted"], *_ = model.predict(...)
    # Plain accuracy: fraction of rows where predicted label matches gold.
    acc = sum(eval_df.labels == eval_df.predicted) / eval_df.labels.size
    logging.info("Accuracy: {}".format(acc))
    wandb.log({"accuracy_es": acc})
    # Close the W&B run started in train_model.
    run.finish()
def main() -> None:
    """Run the full pipeline: data preparation, training, evaluation."""
    logging.info("Starting...")
    # Fixed: the original lines called bare `...()` (i.e. Ellipsis), which
    # raises TypeError at runtime. Wire the pipeline to the functions
    # defined above, whose signatures match these call sites.
    train_df, eval_df, label_encoder = prepare_data()
    model, run = train_model(train_df, len(label_encoder.classes_))
    eval_model(model, eval_df, run)
    logging.info("Done.")


if __name__ == "__main__":
    main()