-
Notifications
You must be signed in to change notification settings - Fork 2
/
train_model.py
26 lines (21 loc) · 957 Bytes
/
train_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from embed_funcs import WordEmbeddings
import sys
import numpy as np
# Batch size requested from each language's sampler (passed as `batch_size`
# to `sample_batches`). Presumably the two halves are combined into one full
# batch downstream -- TODO confirm.
HALF_BATCH_SIZE = 128

# Seeded random state shared by both samplers so runs are reproducible.
# Built directly with numpy: `check_random_state` was never imported in this
# file (NameError), and for an integer seed scikit-learn's helper simply
# returns `np.random.RandomState(seed)`.
rng = np.random.RandomState(0)
def _row_l2_norms(matrix):
    """Return the L2 norm of each row (axis 1) of *matrix*."""
    norm = getattr(matrix, 'norm', None)
    if norm is not None:
        # Keep the original call for types that provide `.norm(L, axis=...)`.
        return norm(2, axis=1)
    # Fallback for array types without a `.norm` method (e.g. numpy arrays).
    return (abs(matrix) ** 2).sum(axis=1) ** 0.5


def cosine_similarity(a_matrix, b_matrix):
    """Return the row-wise cosine similarity of two matrices.

    Row ``i`` of the result is the dot product of row ``i`` of *a_matrix*
    and row ``i`` of *b_matrix*, divided by the product of their L2 norms.

    Args:
        a_matrix: 2-D array-like supporting elementwise ``*`` and
            ``.sum(axis=1)``, and either a ``.norm(L, axis=...)`` method or
            ``abs()`` / ``**``.
        b_matrix: Array-like of the same kind and shape as *a_matrix*.

    Returns:
        One similarity value per row. No zero-norm guard is applied, so a
        row of zeros divides by zero exactly as in the original expression.
    """
    dot_products = (a_matrix * b_matrix).sum(axis=1)
    norm_products = _row_l2_norms(a_matrix) * _row_l2_norms(b_matrix)
    return dot_products / norm_products
def _load_scaled_embeddings(model_path, random_state):
    """Load word2vec embeddings, rescale them and set up batch sampling.

    Args:
        model_path: Path handed to ``WordEmbeddings.load_from_word2vec``.
        random_state: Random state forwarded to ``sample_batches``.

    Returns:
        A ``(embeddings, scaler, batches)`` tuple: the ``WordEmbeddings``
        object whose ``vectors`` were replaced by the scaled values, the
        ``StandardScaler`` used for ``fit_transform``, and whatever
        ``sample_batches`` returns.
    """
    # NOTE(review): `StandardScaler` and `theano` are not imported in the
    # visible part of this file -- confirm they are brought into scope.
    embeddings = WordEmbeddings()
    embeddings.load_from_word2vec(model_path)
    embeddings.downsample_frequent_words()
    scaler = StandardScaler()
    # Cast the scaled vectors to Theano's configured float type.
    scaled = scaler.fit_transform(embeddings.vectors)
    embeddings.vectors = scaled.astype(theano.config.floatX)
    batches = embeddings.sample_batches(
        batch_size=HALF_BATCH_SIZE, random_state=random_state)
    return embeddings, scaler, batches


# Hindi is loaded first, then English, both sharing `rng`; the order is
# kept so the shared random state is consumed in the same sequence.
print('Loading Hindi embeddings...')
we_hi, skn_hi, we_batches_hi = _load_scaled_embeddings(
    './models/wv_hindi', rng)

# `print >> sys.stderr, ...` is Python 2-only syntax (a TypeError at runtime
# on Python 3). Writing to the stream directly behaves the same on both.
# This message still goes to stderr and the Hindi one to stdout, as before.
sys.stderr.write('Loading English embeddings...\n')
we_en, skn_en, we_batches_en = _load_scaled_embeddings(
    './models/wv_english', rng)