-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathOutliers.py
190 lines (149 loc) · 5.66 KB
/
Outliers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import numpy as np
from matplotlib.pyplot import (figure, imshow, bar, title, xticks, yticks, cm,
subplot, show)
from scipy.io import loadmat
from __init__ import gausKernelDensity
from sklearn.neighbors import NearestNeighbors
from matplotlib.pyplot import figure, bar, title, plot, show
from clean_data import clean_data, transform_data
#--------------------------IMPORTING DATA--------------------------------
# Load the trending-videos dataset and transform the numeric columns of
# interest (presumably log/scaling transforms — defined in clean_data.py).
data = clean_data('Datasets/**videos.csv')
data = transform_data(data, ['likes', 'dislikes', 'views', 'comment_count', 'trending_time'])
#data = data.head(1000)

# Feature matrix / target split: 'likes' is the target, the rest are features.
attributeNames = ['dislikes', 'views', 'comment_count', 'trending_time']
X = np.array(data[attributeNames])
y = np.array(data['likes'])
N, M = X.shape  # N observations, M attributes
#---------------------------LEAVE ONE OUT---------------------------------
# Gaussian-kernel density outlier scoring; the kernel width is selected by
# leave-one-out cross-validation (maximising the held-out log-likelihood).
print("Looking at Leave One Out...")
# NOTE(fix): the original script first overwrote X with random synthetic
# Gaussian clusters here (leftover course-exercise template code), so all
# subsequent outlier scores in this file were computed on fake data instead
# of the videos dataset. That synthetic-data generation has been removed.

# Estimate the optimal kernel density width, by leave-one-out cross-validation
# over a log2 grid of candidate widths.
widths = 2.0**np.arange(-10, 10)
logP = np.zeros(np.size(widths))
for i, w in enumerate(widths):
    # gausKernelDensity returns per-observation (density, log-density),
    # each estimated with the observation itself left out.
    f, log_f = gausKernelDensity(X, w)
    logP[i] = log_f.sum()
width = widths[logP.argmax()]
print('Optimal estimated width is: {0}'.format(width))

# Estimate density for each observation not including the observation
# itself in the density estimate, using the selected width.
density, log_density = gausKernelDensity(X, width)

# Sort the densities ascending: the lowest-density observations are the
# strongest outlier candidates.
i = (density.argsort(axis=0)).ravel()
density = density[i]

# Display the index of the lowest density data object
print('Lowest density for Leave one out: {0} for data object: {1}'.format(density[0], i[0]))

# Plot density estimate of outlier score (the 20 lowest densities).
figure(1)
bar(range(20), density[:20].reshape(-1,))
title('Density estimate')

# Plot the cross-validated log-likelihood as a function of candidate width.
figure(2)
plot(logP)
title('Optimal width')
show()
# NOTE(review): disabled duplicate of the Gaussian-kernel width search above,
# kept as a module-level string literal (dead code) — candidate for deletion.
'''
### Gausian Kernel density estimator
# cross-validate kernel width by leave-one-out-cross-validation
# (efficient implementation in gausKernelDensity function)
# evaluate for range of kernel widths
widths = X.var(axis=0).max() * (2.0**np.arange(-10,3))
logP = np.zeros(np.size(widths))
for i,w in enumerate(widths):
print('Fold {:2d}, w={:f}'.format(i,w))
density, log_density = gausKernelDensity(X,w)
logP[i] = log_density.sum()
val = logP.max()
ind = logP.argmax()
width=widths[ind]
print('Optimal estimated width is: {0}'.format(width))
# evaluate density for estimated width
density, log_density = gausKernelDensity(X,width)
# Sort the densities
i = (density.argsort(axis=0)).ravel()
density = density[i].reshape(-1,)
# Plot density estimate of outlier score
figure(1)
bar(range(20),density[:20])
title('Gausian Kernal Density estimate')
'''
#------------------------------------KNN--------------------------
### K-neighbors density estimator
print("Looking at KNN...")
K = 40  # number of neighbours used for the density estimate

# Query the K nearest neighbours of every observation.
# NOTE(review): kneighbors on the training set returns each point as its
# own nearest neighbour (distance 0) — assumed intentional here, as in the
# course-exercise version of this estimator.
knn = NearestNeighbors(n_neighbors=K).fit(X)
D, i = knn.kneighbors(X)

# Density estimate: reciprocal of the average distance to the K neighbours.
density = 1./(D.sum(axis=1)/K)

# Rank observations from least to most dense.
order = density.argsort()
density = density[order]
print('Lowest density for KNN: {0} for data object: {1}'.format(density[0], order[0]))

# Bar chart of the 20 lowest KNN densities (most outlying observations).
figure(3)
bar(range(20), density[:20])
title('KNN density: Outlier score')
show()

# Alternative outlier score: distance to the K-th nearest neighbour,
# plotted largest-first.
score = D[:, K-1]
rank = score.argsort()
score = score[rank[::-1]]

figure(4)
bar(range(20), score[:20])
title('{}th neighbor distance: Outlier score'.format(K))
show()
#-----------------------------ARD--------------------------
# Average relative density (ARD): a point's own density divided by the
# mean density of its neighbours; scores well below 1 flag outliers.
print("Looking at KNN ARD...")

# The neighbour-index array from the KNN section above was clobbered while
# sorting scores, so the K nearest neighbours are queried again here.
nn_model = NearestNeighbors(n_neighbors=K).fit(X)
dists, nbr_idx = nn_model.kneighbors(X)
density = 1./(dists.sum(axis=1)/K)

# nbr_idx[:, 1:] skips column 0 (each point is its own nearest neighbour).
# NOTE(review): K-1 neighbour densities are summed but divided by K, exactly
# as in the original script — confirm this normalisation is intended.
avg_rel_density = density/(density[nbr_idx[:, 1:]].sum(axis=1)/K)

# Rank ascending: the smallest average relative densities come first.
i_avg_rel = avg_rel_density.argsort()
avg_rel_density = avg_rel_density[i_avg_rel]
print('Lowest density for KNN ARD: {0} for data object: {1}'.format(avg_rel_density[0], i_avg_rel[0]))

# Bar chart of the 20 lowest average-relative-density scores.
figure(5)
bar(range(20), avg_rel_density[:20])
title('KNN average relative density: Outlier score')
show()
#x = np.linspace(-10, 10, 50)
# NOTE(review): disabled copy of the synthetic-data + leave-one-out width
# search, kept as a module-level string literal (dead code) — candidate for
# deletion.
'''
m = np.array([1, 3, 6]); s = np.array([1, .5, 2])
c_sizes = np.random.multinomial(N, [1./3, 1./3, 1./3])
for c_id, c_size in enumerate(c_sizes):
X[c_sizes.cumsum()[c_id]-c_sizes[c_id]:c_sizes.cumsum()[c_id],:] = np.random.normal(m[c_id], np.sqrt(s[c_id]), (c_size,M))
# Estimate the optimal kernel density width, by leave-one-out cross-validation
widths = 2.0**np.arange(-10,10)
logP = np.zeros(np.size(widths))
for i,w in enumerate(widths):
f, log_f = gausKernelDensity(X, w)
logP[i] = log_f.sum()
val = logP.max()
ind = logP.argmax()
width=widths[ind]
print('Optimal estimated width is: {0}'.format(width))
# Estimate density for each observation not including the observation
# itself in the density estimate
density, log_density = gausKernelDensity(X, width)
# Sort the densities
i = (density.argsort(axis=0)).ravel()
density = density[i]
# Display the index of the lowest density data object
print('Lowest density: {0} for data object: {1}'.format(density[0],i[0]))
# Plot density estimate of outlier score
figure(1)
bar(range(20),density[:20].reshape(-1,))
title('Density estimate')
figure(2)
plot(logP)
title('Optimal width')
show()
'''