-
Notifications
You must be signed in to change notification settings - Fork 0
/
KNN.py
101 lines (84 loc) · 2.93 KB
/
KNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pandas as pd
import numpy as np
from dataPreprocessing import load_obj, build_inverted_index, save_obj, processFile, clear_review
from collections import Counter
#============ READ TOKENIZED TRAINING DATASET =================================
# Each line of tokenized_reviews.dat looks like "<sign> tok,tok,..." where the
# first character is the class label ("+" or "-") and the tokens follow from
# column 2 onward, comma-separated.
# Read the file line-by-line instead of pd.read_csv(sep='\n'): the '\n'
# separator was deprecated in pandas 1.4 and removed in pandas 2.0.
# Blank lines are skipped, matching read_csv's skip_blank_lines default.
with open('tokenized_reviews.dat') as fh:
    _lines = [line.rstrip('\n') for line in fh if line.strip()]
# Keep `vals` shaped as a sequence of 1-element rows (n[0] = full line) so the
# downstream code that was written against df.values keeps working unchanged.
vals = [(line,) for line in _lines]
reviews = [n[0][2:].split(',') for n in vals]
classes = [n[0][:1] for n in vals]
#================= INVERTED INDX ALREADY CREATED ==============================
# One-off preprocessing step: the inverted index was built once from the
# training reviews and pickled to disk. Uncomment to regenerate it.
#inverted_index = build_inverted_index(reviews)
#save_obj(inverted_index, "inverted_idx")
#================= LOAD INVERTED IDX FOR COMPUTING TESTSET ====================
# Presumably maps token -> list of (doc_id, weight) pairs over the training
# reviews, judging by how it is indexed below -- confirm in dataPreprocessing.
inverted_idx = load_obj("inverted_idx")
#================ PROCESS AND TOKENIZE TESTSET FOR FUTURE USE =================
# One-off preprocessing step; uncomment to regenerate the tokenized test file.
#querries = processFile("test", "tokenized_test")
# Each line of tokenized_test.dat is a full comma-separated token list (no
# leading class label). Read line-by-line instead of pd.read_csv(sep='\n'):
# the '\n' separator was deprecated in pandas 1.4 and removed in pandas 2.0.
# Blank lines are skipped, matching read_csv's skip_blank_lines default.
with open('tokenized_test.dat') as fh:
    querries = [line.rstrip('\n').split(',') for line in fh if line.strip()]
#================== COMPUTE ACCUMULATOR FOR EACH QUERRY========================
# querry_accumulator[q][d] accumulates the similarity score between test query
# q and training document d, summed from the per-token weights stored in the
# inverted index.
querry_accumulator = np.zeros((len(querries), len(vals)))
n_querries = len(querries)
for review_id, querry in enumerate(querries):
    # print() (not the Python-2 print statement) so the script parses under
    # Python 3 as well.
    print("Processing query " + str(review_id) + "/" + str(n_querries))
    # Per-query normalization factor; assumes every query has >= 1 token
    # (an empty query would divide by zero) -- TODO confirm upstream.
    val = 1 / np.sqrt(len(querry))
    for word in querry:
        # Tokens never seen in the training vocabulary are simply skipped
        # instead of raising KeyError (assumes inverted_idx is a dict of
        # token -> [(doc_id, weight), ...] -- confirm in dataPreprocessing).
        for doc in inverted_idx.get(word, []):
            querry_accumulator[review_id][doc[0]] += val * doc[1]
#======= SELECT K NEAREST NEIGHBORS AND PICK CLASS USING MAJORITY =============
# --- The commented-out block below is a k-selection experiment (sweep k and
# report accuracy), kept for reference. Note it references `querryClasses`
# (ground-truth test labels), which is never defined in this script, so it
# cannot run as-is. ---
#K = 10
#querry_classes = np.zeros(len(querries))
#best = (None, None)
#for k in range(1, K):
# acc = 0.0
# temp_classes = np.zeros(k)
#
# for i in range(0, len(querries)):
# temp = querry_accumulator[i].argsort()[-k:][::-1]
# for cl in range(k):
# if classes[temp[cl]] == "+":
# cla = "+1"
# else:
# cla = "-1"
# temp_classes[cl] = cla
# c = Counter(temp_classes);
# val, blah = c.most_common()[0]
# querry_classes[i] = int(val)
#
# for i in range(len(querry_classes)):
# if querry_classes[i] == querryClasses[i]:
# acc += 1
# print str(acc/len(querry_classes))
# Classify each test query by majority vote among its K highest-scoring
# training reviews.
K = 10
querry_classes = np.zeros(len(querries))
for i in range(len(querries)):
    # Indices of the K training reviews with the largest accumulated scores.
    nearest = querry_accumulator[i].argsort()[-K:][::-1]
    # Map training labels ("+" / "-") to integer votes +1 / -1 directly;
    # the original stored the strings "+1"/"-1" into a float numpy array and
    # relied on implicit str->float coercion.
    votes = [1 if classes[idx] == "+" else -1 for idx in nearest]
    # Majority label; ties resolve arbitrarily via Counter ordering, exactly
    # as in the original most_common()[0] lookup.
    label, _count = Counter(votes).most_common(1)[0]
    querry_classes[i] = label
#============ WRITE OUTPUT TO TEST.DAT AND UPLOAD TO CLP ======================
# One "+1" or "-1" line per test query, in input order. The with-statement
# guarantees the handle is closed even on error (the original also shadowed
# the `file` builtin and leaked the handle on exceptions).
with open("test.dat", "w") as out:
    for clas in querry_classes:
        out.write("+1\n" if clas > 0 else "-1\n")