forked from elsevierlabs-os/nerds
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtest_models.py
170 lines (149 loc) · 5.78 KB
/
test_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import csv
import os
import shutil
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from nerds.models import (
DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER,
ElmoNER, FlairNER, BertNER, TransformerNER,
EnsembleNER
)
from nerds.utils import *
def convert_to_iob_format(input_file, output_file, encoding="iso-8859-1"):
    """Convert a four-column CSV file into a tab-separated IOB file.

    Every non-empty CSV row must hold exactly four columns: sentence id,
    token, POS tag and entity tag. A row with a non-empty sentence id
    starts a new sentence. The output contains one ``token<TAB>tag`` line
    per row, a blank line between sentences, and a blank line at the end.
    The POS column is read but not written.

    Args:
        input_file: path of the CSV file to read. Its first row is treated
            as a header and skipped.
        output_file: path of the IOB file to write (overwritten if it
            already exists).
        encoding: text encoding used for both the input and the output
            file. The default is the encoding this script uses when it
            reads the IOB file back, so non-ASCII tokens round-trip
            unchanged regardless of the platform's default encoding.

    Raises:
        ValueError: if a non-empty row does not have exactly four columns.
    """
    num_written = 0
    # Both files are managed by the same ``with`` so the output handle is
    # closed even when reading or parsing fails. ``newline=""`` on the input
    # is what the csv module documentation asks for.
    with open(input_file, "r", encoding=encoding, newline="") as fin, \
            open(output_file, "w", encoding=encoding) as fout:
        csv_reader = csv.reader(fin, delimiter=",", quotechar='"')
        # skip header; the default keeps an empty input file from raising
        next(csv_reader, None)
        for row in csv_reader:
            if not row:
                # blank line in the CSV: nothing to convert
                continue
            sid, token, _pos, tag = row
            if num_written > 0 and sid:
                # end of sentence marker before the first token of the
                # next sentence
                fout.write("\n")
            fout.write("\t".join([token, tag]) + "\n")
            num_written += 1
        # terminate the last sentence
        fout.write("\n")
# build the IOB version of the GMB dataset, unless a previous run already
# left one in the working directory
iob_file_missing = not os.path.exists("train.iob")
if iob_file_missing:
    convert_to_iob_format("train.csv", "train.iob")

# entity classes passed as labels= to every classification report below
entity_labels = [
    "art", "eve", "geo", "gpe",
    "nat", "org", "per", "tim",
]

# directory the trained models are saved to and loaded from
models_dir_present = os.path.exists("models")
if not models_dir_present:
    os.makedirs("models")
# load the model inputs (data) and their targets (labels) from the IOB file
data, labels = load_data_and_labels("train.iob", encoding="iso-8859-1")

# optional: shuffle the dataset before splitting (slice the shuffled lists
# afterwards if you want to work on a smaller subset)
# data_s, labels_s = shuffle(data, labels, random_state=42)
# data = data_s
# labels = labels_s
print(len(data), len(labels))

# hold out 30% of the examples for testing; fixed seed for repeatability
split = train_test_split(data, labels, test_size=0.3, random_state=42)
xtrain, xtest, ytrain, ytest = split
print(len(xtrain), len(ytrain), len(xtest), len(ytest))
# Dictionary NER: fit, save, load it back from disk, score on the test split
model = DictionaryNER()
model.fit(xtrain, ytrain)
model.save("models/dict_model")
trained_model = model.load("models/dict_model")
ypred = trained_model.predict(xtest)
gold_tags = flatten_list(ytest, strip_prefix=True)
predicted_tags = flatten_list(ypred, strip_prefix=True)
report = classification_report(gold_tags, predicted_tags,
                               labels=entity_labels)
print(report)
# CRF NER: fit, save, load it back from disk, score on the test split
model = CrfNER()
model.fit(xtrain, ytrain)
model.save("models/crf_model")
trained_model = model.load("models/crf_model")
ypred = trained_model.predict(xtest)
gold_tags = flatten_list(ytest, strip_prefix=True)
predicted_tags = flatten_list(ypred, strip_prefix=True)
report = classification_report(gold_tags, predicted_tags,
                               labels=entity_labels)
print(report)
# SpaCy NER: fit, save, load it back from disk, score on the test split
model = SpacyNER()
model.fit(xtrain, ytrain)
model.save("models/spacy_model")
trained_model = model.load("models/spacy_model")
ypred = trained_model.predict(xtest)
gold_tags = flatten_list(ytest, strip_prefix=True)
predicted_tags = flatten_list(ypred, strip_prefix=True)
report = classification_report(gold_tags, predicted_tags,
                               labels=entity_labels)
print(report)
# BiLSTM-CRF NER: fit, save, load it back from disk, score on the test split
model = BiLstmCrfNER()
model.fit(xtrain, ytrain)
model.save("models/bilstm_model")
trained_model = model.load("models/bilstm_model")
ypred = trained_model.predict(xtest)
gold_tags = flatten_list(ytest, strip_prefix=True)
predicted_tags = flatten_list(ypred, strip_prefix=True)
report = classification_report(gold_tags, predicted_tags,
                               labels=entity_labels)
print(report)
# ELMo NER: only exercised when the GloVe vectors file is present in the
# working directory (presumably ElmoNER depends on it -- confirm against
# the nerds documentation)
glove_file_present = os.path.exists("glove.6B.100d.txt")
if glove_file_present:
    model = ElmoNER()
    model.fit(xtrain, ytrain)
    model.save("models/elmo_model")
    trained_model = model.load("models/elmo_model")
    ypred = trained_model.predict(xtest)
    gold_tags = flatten_list(ytest, strip_prefix=True)
    predicted_tags = flatten_list(ypred, strip_prefix=True)
    report = classification_report(gold_tags, predicted_tags,
                                   labels=entity_labels)
    print(report)
# FLAIR NER: constructed with its model directory, then fit, saved, loaded
# back from that same directory and scored on the test split
model = FlairNER("models/flair_model")
model.fit(xtrain, ytrain)
model.save("models/flair_model")
trained_model = model.load("models/flair_model")
ypred = trained_model.predict(xtest)
gold_tags = flatten_list(ytest, strip_prefix=True)
predicted_tags = flatten_list(ypred, strip_prefix=True)
report = classification_report(gold_tags, predicted_tags,
                               labels=entity_labels)
print(report)
# train and test the BERT NER
model = BertNER(padding_tag="X")
model.fit(xtrain, ytrain)
model.save("models/bert_model")
trained_model = model.load("models/bert_model")
ypred = trained_model.predict(xtest)
# The aligned labels/predictions are specific to this model's output, so
# they get their own names. Rebinding ytest here would make every later
# evaluation in this script score against BERT-aligned labels instead of
# the original held-out labels.
bert_ytest, bert_ypred = align_labels_and_predictions(
    ypred, ytest, padding_tag="X")
print(classification_report(flatten_list(bert_ytest, strip_prefix=True),
                            flatten_list(bert_ypred, strip_prefix=True),
                            labels=entity_labels))
# train and test Transformer NER
# (the model directory is given at construction time, so save() and load()
# are called without a path)
model = TransformerNER(
    model_dir="models/transformer_model",
    padding_tag="X")
model.fit(xtrain, ytrain)
model.save()
trained_model = model.load()
ypred = trained_model.predict(xtest)
# The aligned labels/predictions are specific to this model's output, so
# they get their own names. Rebinding ytest here would make the ensemble
# evaluation that follows score against Transformer-aligned labels instead
# of the original held-out labels.
transformer_ytest, transformer_ypred = align_labels_and_predictions(
    ypred, ytest, padding_tag="X")
print(classification_report(
    flatten_list(transformer_ytest, strip_prefix=True),
    flatten_list(transformer_ypred, strip_prefix=True),
    labels=entity_labels))
# Ensemble NER: reload four of the models saved earlier in this script and
# combine them; is_pretrained=True is passed because the estimators are
# loaded from disk rather than fitted here
estimators = []

dict_model = DictionaryNER()
dict_model.load("models/dict_model")
estimators.append(("dict_model", dict_model))

crf_model = CrfNER()
crf_model.load("models/crf_model")
estimators.append(("crf_model", crf_model))

spacy_model = SpacyNER()
spacy_model.load("models/spacy_model")
estimators.append(("spacy_model", spacy_model))

bilstm_model = BiLstmCrfNER()
bilstm_model.load("models/bilstm_model")
estimators.append(("bilstm_model", bilstm_model))

model = EnsembleNER(estimators=estimators, is_pretrained=True)
ypred = model.predict(xtest)
gold_tags = flatten_list(ytest, strip_prefix=True)
predicted_tags = flatten_list(ypred, strip_prefix=True)
report = classification_report(gold_tags, predicted_tags,
                               labels=entity_labels)
print(report)
# clean up the artifacts this script works with
shutil.rmtree("models")
os.remove("train.iob")
# The GloVe file is optional (the ELMo step is skipped when it is absent),
# so only delete it when it is actually there; an unconditional remove
# would end the run with FileNotFoundError.
if os.path.exists("glove.6B.100d.txt"):
    os.remove("glove.6B.100d.txt")