-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
61 lines (41 loc) · 1.32 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas as pd
from gensim.models import Word2Vec
import logging
from sklearn import svm
# Configure logging for gensim model training: INFO level, with each record
# formatted as "<timestamp> : <level> : <message>".
_LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
def buildVectors(train, size=25, path="model/model"):
    """Train a Word2Vec model on the phrases of `train` and save it to disk.

    Args:
        train: Object exposing a `Phrase` attribute that iterates over
            strings (the script passes the DataFrame read from
            data/train.csv).
        size: Dimensionality of the word vectors, forwarded to Word2Vec
            as `size`. Defaults to 25.
        path: File path the trained model is saved to. Defaults to
            "model/model".

    Returns:
        The trained Word2Vec model.
    """
    import os  # local import: only needed to prepare the output directory

    # Every phrase becomes one whitespace-tokenised sentence of the corpus.
    corpus = [phrase.split() for phrase in train.Phrase]

    # Build word vectors; min_count=1 keeps words that occur only once.
    model = Word2Vec(corpus, min_count=1, size=size)

    # Save the model for later use (persistent on disk). Create the target
    # directory first so saving does not fail on a fresh checkout.
    directory = os.path.dirname(path)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    model.save(path)
    return model
def _phrase_vector(w2v, phrase):
    """Return the sum of the word vectors of the tokens in `phrase`.

    Tokens for which `w2v[token]` raises KeyError (words unknown to the
    model) are skipped. Returns None when no token could be looked up,
    e.g. for an empty or whitespace-only phrase.
    """
    vectors = []
    for token in phrase.split():
        try:
            vectors.append(w2v[token])
        except KeyError:
            continue
    if not vectors:
        # sum([]) would be the int 0, which is not a usable feature vector.
        return None
    return sum(vectors)


# Read the training and test data.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

model = buildVectors(train)

# Build one summed word vector per training row. The vectors are keyed by
# the row's index in `train` because the label assignment below is aligned
# by pandas on the index: keying by SentenceId let rows that share an id
# overwrite each other and paired the vectors with labels of unrelated rows.
pvecs = {}
for idx, phrase in zip(train.index, train.Phrase):
    vec = _phrase_vector(model, phrase)
    if vec is not None:
        pvecs[idx] = vec

# Convert the dictionary into a dataframe (one row per phrase).
pvdf = pd.DataFrame.from_dict(pvecs, orient='index')

# Rename columns to feat_1..feat_N, where N is the actual vector width
# instead of a hard-coded 25.
feature_cols = ["feat_" + str(i) for i in range(1, pvdf.shape[1] + 1)]
pvdf.columns = feature_cols

# Add the sentiment label to pvdf (aligned on the shared row index).
pvdf["label"] = train.Sentiment

# Define and train a classifier.
# NOTE(review): this now trains on every phrase row rather than one row per
# SentenceId; SVC training time grows quickly with the number of samples.
clf = svm.SVC()
clf.fit(pvdf[feature_cols], pvdf.label)

# test.Phrase[0]
# 'An intermittently pleasing but mostly routine effort .'
# Create a phrase vector for it, using the same column names the
# classifier was fitted with.
sample_vec = _phrase_vector(model, test.Phrase[0])
if sample_vec is None:
    raise ValueError("no known words in test phrase: %r" % (test.Phrase[0],))
y = pd.DataFrame([sample_vec], columns=feature_cols)

# Make the final prediction and print it; run as a script, the bare
# expression result would otherwise be discarded.
# (The original notes report 2 => neutral for this phrase; re-check after
# retraining.)
prediction = clf.predict(y)
print(prediction)