genFeatureMatrix.py

#!/usr/bin/python
import datetime
import json
import os

import en  # NodeBox Linguistics (Python 2): provides en.verb.present() and en.noun.singular()
import nltk
import numpy as np

def dateGenerator(numdays):  # generate the N most recent days as strings, e.g. {'20151231', '20151230'}
    base = datetime.datetime.today()
    date_list = [base - datetime.timedelta(days=x) for x in range(numdays)]
    return set(d.strftime("%Y%m%d") for d in date_list)
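
# Sketch of the behavior: dateGenerator(3) run on 2016-01-01 would return
# {'20160101', '20151231', '20151230'}.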

def unify_word(word):  # went -> go, apples -> apple, BIG -> big
    try:
        word = en.verb.present(word)  # unify tense
    except Exception:
        pass
    try:
        word = en.noun.singular(word)  # unify number
    except Exception:
        pass
    return word.lower()

def readGlove(we_file, w2i_file, concat=True):
    npz = np.load(we_file)
    W1 = npz['arr_0']  # input-side GloVe weights, shape (V, D)
    W2 = npz['arr_1']  # output-side GloVe weights, shape (D, V)
    with open(w2i_file) as f:
        word2idx = json.load(f)
    V = len(word2idx)
    if concat:
        We = np.hstack([W1, W2.T])  # concatenate into a (V, 2D) embedding
        print("We.shape: {}".format(We.shape))
        assert V == We.shape[0]
    else:
        We = (W1 + W2.T) / 2  # or average the two weight matrices instead
    return We
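
# Usage sketch (paths taken from main() below; 'stock' is only an illustrative
# key and may not be in the vocabulary):
#   We = readGlove('./input/glove_model_50.npz', './input/word2idx.json')
#   with open('./input/word2idx.json') as f: word2idx = json.load(f)
#   vec = We[word2idx['stock']]  # a 100-d row if the weights are 50-d, as the file name suggests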

def padding(sentencesVec, keepNum):
    # Left-pad with zero vectors (or truncate to the last keepNum words) so every
    # article yields exactly keepNum word vectors, then flatten into one feature row.
    shape = sentencesVec.shape[0]   # embedding dimension
    ownLen = sentencesVec.shape[1]  # number of word vectors in this article
    if ownLen < keepNum:
        return np.hstack((np.zeros([shape, keepNum - ownLen]), sentencesVec)).flatten()
    else:
        return sentencesVec[:, -keepNum:].flatten()
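
# Shape check (illustrative, assuming a 50-d embedding and keepNum=3):
#   padding(np.ones([50, 2]), 3).shape  # -> (150,), zero-padded on the left
#   padding(np.ones([50, 5]), 3).shape  # -> (150,), keeps only the last 3 word vectors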

def gen_FeatureMatrix(wordEmbedding, word2idx, priceDt, max_words=60, mtype="test"):
    # step 2, build the feature matrix for the training/testing data
    loc = './input/'
    input_files = [f for f in os.listdir(loc) if f.startswith('news_reuters.csv')]
    dp = {}  # only consider one news item per company per day
    cnt = 0
    testDates = dateGenerator(100)  # hold out the most recent 100 days for testing
    shape = wordEmbedding.shape[1]
    features = np.zeros([0, max_words * shape])
    labels = []
    for fname in input_files:
        for line in open(loc + fname):
            line = line.strip().split(',')
            if len(line) != 5: continue
            ticker, name, day, headline, body = line
            if ticker not in priceDt: continue  # skip if no corresponding company found
            if day not in priceDt[ticker]: continue  # skip if no corresponding date found
            if (ticker, day) in dp: continue  # keep only the first news item per company per day
            dp[(ticker, day)] = True
            cnt += 1
            if cnt % 1000 == 0: print(cnt)  # progress report
            if mtype == "test" and day not in testDates: continue
            if mtype == "train" and day in testDates: continue
            # 2.1 tokenize the sentence and unify the format of the words
            tokens = nltk.word_tokenize(headline) + nltk.word_tokenize(body)
            tokens = [unify_word(t) for t in tokens]
            #tokens = [t for t in tokens if t not in stopWords]  # optional stop-word filter
            #tokens = [t for t in tokens if t in topWords]       # optional vocabulary filter
            # 2.2 stack the embedding of every in-vocabulary token into a (dim, numWords) matrix
            sentencesVec = np.zeros([shape, 0])
            for t in tokens:
                if t not in word2idx: continue
                sentencesVec = np.hstack((sentencesVec, wordEmbedding[word2idx[t]].reshape(-1, 1)))
            features = np.vstack((features, padding(sentencesVec, max_words)))
            labels.append(round(priceDt[ticker][day], 6))
    labels = np.array(labels).reshape(-1, 1)
    featureMatrix = np.concatenate((features, labels), axis=1)
    fileName = './input/featureMatrix_' + mtype + '.csv'
    np.savetxt(fileName, featureMatrix, fmt="%s")
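
# Output layout: each row of ./input/featureMatrix_<mtype>.csv is one news item,
# i.e. max_words * dim flattened embedding values with the rounded price label
# from stockPrices.json appended as the last column.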

def build(wordEmbedding, w2i_file, max_words=60):
    with open('./input/stockPrices.json') as data_file:
        priceDt = json.load(data_file)
    with open(w2i_file) as data_file:
        word2idx = json.load(data_file)
    gen_FeatureMatrix(wordEmbedding, word2idx, priceDt, max_words, "train")
    gen_FeatureMatrix(wordEmbedding, word2idx, priceDt, max_words, "test")
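
# Assumed schema for ./input/stockPrices.json, inferred from the lookups above:
# {ticker: {'YYYYMMDD': price_label, ...}, ...}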

def main(we, w2i_file):
    wordEmbedding = readGlove(we, w2i_file)
    build(wordEmbedding, w2i_file, 30)  # keep 30 words per article (the functions default to 60)


if __name__ == "__main__":
    we = './input/glove_model_50.npz'
    w2i_file = "./input/word2idx.json"
    main(we, w2i_file)