# preprocessing.py
# Convert the raw tweet pickles into fixed-length sequences of vocabulary
# indices (padded/truncated to 20 tokens) and pickle the results.
import cPickle as p

import numpy as np
import word2vec
# Load the raw (tokenized tweet, label) splits.
trainData = p.load(open("data/train.p", 'rb'))
devData = p.load(open("data/dev.p", 'rb'))
testData = p.load(open("data/test.p", 'rb'))

# Load the pretrained word2vec embeddings; their vocabulary defines the
# token -> integer-ID mapping used below.
w2vM = word2vec.load('data/emeddings.bin')
vocab = w2vM.vocab

# Map each vocabulary word to its row index in the embedding matrix.
vocabIndxDict = {}
for i, v in enumerate(vocab):
    vocabIndxDict[v] = i

# Special tokens: unknown-word placeholder and left-padding token.
UNK_ID = vocabIndxDict["unktoken"]
PAD_ID = vocabIndxDict["padtoken"]
print "unkid", UNK_ID
print "padid", PAD_ID
def tweet2id(tweet):
    """Convert a tokenized tweet to a length-20 array of vocabulary IDs.

    Tweets longer than 20 tokens are truncated; shorter ones are
    left-padded with PAD_ID. Out-of-vocabulary tokens map to UNK_ID.
    """
    resTweet = []
    lenTweet = len(tweet)
    for i in range(20):
        if i < lenTweet:
            if tweet[i] in vocabIndxDict:
                resTweet.append(vocabIndxDict[tweet[i]])
            else:
                resTweet.append(UNK_ID)
    # Left-pad to exactly 20 entries.
    resTweet = ([PAD_ID] * (20 - len(resTweet))) + resTweet
    assert len(resTweet) == 20
    return np.array(resTweet, dtype=np.int32)
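
# Illustrative example (hypothetical tokens, assuming both are in the vocab):
#   tweet2id(["good", "morning"]) -> 18 copies of PAD_ID followed by the
#   IDs for "good" and "morning".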
# Apply the ID conversion to every split, preserving labels.
train_data_pre = [(tweet2id(tweet), label) for tweet, label in trainData]
dev_data_pre = [(tweet2id(tweet), label) for tweet, label in devData]
test_data_pre = [(tweet2id(tweet), label) for tweet, label in testData]

# Pickle the preprocessed splits for the training script.
p.dump(train_data_pre, open('data/trainTweets_preprocessed.p', 'wb'))
p.dump(dev_data_pre, open('data/devTweets_preprocessed.p', 'wb'))
p.dump(test_data_pre, open('data/testTweets_preprocessed.p', 'wb'))
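
# Optional sanity check: a minimal sketch of reloading one output pickle,
# assuming the dumps above completed. Verifies the shape and dtype of a
# preprocessed example.
check = p.load(open('data/trainTweets_preprocessed.p', 'rb'))
sampleTweet, sampleLabel = check[0]
assert sampleTweet.shape == (20,) and sampleTweet.dtype == np.int32
print "sample ids", sampleTweet
print "sample label", sampleLabel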