-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
121 lines (97 loc) · 4.98 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from collections import namedtuple
from collections import defaultdict
from random import shuffle
import datetime
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from data_tools import *
from eval import *
############
# https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
############
# download if file not exist
prepare_data()
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')
all_docs = [] # will hold all docs in original order
with open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:
for line_no, line in enumerate(alldata):
tokens = gensim.utils.to_unicode(line).split()
words = tokens[1:]
tags = [line_no] # `tags = [tokens[0]]` would also work at extra memory cost
split = ['train', 'test', 'extra', 'extra'][line_no // 25000] # 25k train, 25k test, 25k extra
sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][
line_no // 12500] # [12.5K pos, 12.5K neg]*2 then unknown
all_docs.append(SentimentDocument(words, tags, split, sentiment))
train_docs = [doc for doc in all_docs if doc.split == 'train']
test_docs = [doc for doc in all_docs if doc.split == 'test']
doc_list = all_docs[:] # for reshuffling per pass
print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))
cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"
simple_models = [
# PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
# PV-DBOW
Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
# PV-DM w/average
Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]
# speed setup by sharing results of 1st model's vocabulary scan
simple_models[0].build_vocab(all_docs) # PV-DM/concat requires one special NULL word so it serves as template
print(simple_models[0])
for model in simple_models[1:]:
model.reset_from(simple_models[0])
print(model)
models_by_name = OrderedDict((str(model), model) for model in simple_models)
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])
best_error = defaultdict(lambda: 1.0) # to selectively-print only best errors achieved
alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes
print("START %s" % datetime.datetime.now())
for epoch in range(passes):
shuffle(doc_list) # shuffling gets best results
for name, train_model in models_by_name.items():
# train
duration = 'na'
train_model.alpha, train_model.min_alpha = alpha, alpha
with elapsed_timer() as elapsed:
train_model.train(doc_list)
duration = '%.1f' % elapsed()
# evaluate
eval_duration = ''
with elapsed_timer() as eval_elapsed:
err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
eval_duration = '%.1f' % eval_elapsed()
best_indicator = ' '
if err <= best_error[name]:
best_error[name] = err
best_indicator = '*'
print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))
if ((epoch + 1) % 5) == 0 or epoch == 0:
eval_duration = ''
with elapsed_timer() as eval_elapsed:
infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs,
infer=True)
eval_duration = '%.1f' % eval_elapsed()
best_indicator = ' '
if infer_err < best_error[name + '_inferred']:
best_error[name + '_inferred'] = infer_err
best_indicator = '*'
print("%s%f : %i passes : %s %ss %ss" % (
best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))
print('completed pass %i at alpha %f' % (epoch + 1, alpha))
alpha -= alpha_delta
print("END %s" % str(datetime.datetime.now()))
# print best error rates achieved
for rate, name in sorted((rate, name) for name, rate in best_error.items()):
print("%f %s" % (rate, name))
# Are inferred vectors close to the pre-calculated ones?
doc_id = np.random.randint(simple_models[0].docvecs.count) # pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)
for model in simple_models:
inferred_docvec = model.infer_vector(all_docs[doc_id].words)
print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))