# functional_n_gram_one_class_svm_combinations.py
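"""Authorship verification for the disputed Senecan plays: train a one-class
SVM on combined functional n-gram and TF-IDF features of the undisputed
texts, then score the disputed texts, both under every leave-two-out
training combination and on the full training set."""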
from os import listdir
from os.path import isfile, join
import itertools
from pprint import pprint

import numpy as np
from sklearn import svm
from sklearn import preprocessing
from sklearn.decomposition import PCA

from Utilities.Text import Text
from Utilities.FunctionalNGram import FunctionalNGram as FNG
from Utilities.TermFrequencyInverseDocumentFrequency import TermFrequencyInverseDocumentFrequency as TFIDF
from Utilities.combine_features import combine_features as combine


def main():
    path = "./Texts/Seneca/"
    files = [f for f in listdir(path) if isfile(join(path, f))]
    pprint(files)
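
    # The two plays of disputed Senecan authorship; they serve as the
    # out-of-class ("fraud") test set for the one-class SVM.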
    frauds = ["octavia.txt", "hercules_oetaeus.txt"]
    # frauds = ["jfk.txt", "medea.txt"]
    fraud_indices = np.array([files.index(f) for f in frauds])
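
    # Build TF-IDF features over the entire corpus, disputed texts included.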
    tfidf = TFIDF()
    for document in files:
        tfidf.add_text_to_corpus(Text(path + document))
    tfidf_features, word_list = tfidf.calculate_features_for_corpus()

    n_documents = tfidf_features.shape[0]
    legitimate_indices = np.array(list(set(range(n_documents)) - set(fraud_indices)))
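
    # Functional n-gram probability features, stacked column-wise with TF-IDF.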
    fng_features, n_grams = combine([FNG(Text(path + f)).probability_features for f in files])
    features = np.hstack((fng_features, tfidf_features))

    x = {
        "train": features[legitimate_indices, :],
        "test": features[fraud_indices, :]
    }
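
    # Drop the disputed texts from `files` so its indices line up with the
    # rows of x["train"]; pop the larger index first so the smaller stays valid.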
    files.pop(max(fraud_indices))
    files.pop(min(fraud_indices))

    indices = set(range(x["train"].shape[0]))
    nu, kernel, gamma = 0.4, "rbf", 0.1
    clf = svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)
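
    # Leave-two-out over the legitimate texts: train on every size-(n - 2)
    # subset and score the held-out pair alongside the disputed texts.
    # Note features.shape[0] - 4 == len(indices) - 2, since the two disputed
    # texts were already removed from the training pool.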
    for train_index_set in itertools.combinations(indices, features.shape[0] - 4):
        train_index = np.array(train_index_set)
        test_index = np.array(list(indices - set(train_index)))
        train_data = x["train"][train_index, :]
        test_data = np.append(x["test"], x["train"][test_index, :], axis=0)
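
        # Optionally decorrelate the features with a PCA fit on the training
        # fold only, then applied to the test fold.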
        apply_pca = True
        if apply_pca:
            # PCA can keep at most min(n_samples, n_features) components.
            pca = PCA(n_components=min(train_data.shape))
            train_data = pca.fit_transform(train_data)
            test_data = pca.transform(test_data)
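
        # predict() returns +1 for in-class (consistent with the training
        # author) and -1 for outliers; decision_function() gives the signed
        # distance to the separating boundary.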
        clf.fit(train_data)
        y = {
            # "train": clf.predict(train_data),
            "test": clf.predict(test_data)
        }
        metrics = {
            # "train": clf.decision_function(train_data),
            "test": clf.decision_function(test_data)
        }
        results = {
            "files": {
                "train": [files[index] for index in train_index],
                "test": [files[index] for index in test_index]
            },
            "nu": nu,
            "gamma": gamma,
            "y": y,
            "kernel": kernel,
            "metrics": metrics
        }
        pprint(results)
        print()

    print()
    print("Results from all data:")
    clf.fit(x["train"])
    y = {
        # "train": clf.predict(x["train"]),
        "test": clf.predict(x["test"])
    }
    metrics = {
        # "train": clf.decision_function(x["train"]),
        "test": clf.decision_function(x["test"])
    }
    pprint({"nu": nu, "gamma": gamma, "y": y, "kernel": kernel, "metrics": metrics})


if __name__ == "__main__":
    main()