-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclustering n_grams.py
98 lines (71 loc) · 3.96 KB
/
clustering n_grams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from scipy.sparse import hstack
from sklearn.preprocessing import OneHotEncoder
from sklearn import mixture
from sklearn.metrics import mutual_info_score,adjusted_rand_score,adjusted_mutual_info_score
from nltk.cluster import KMeansClusterer
import nltk
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import AgglomerativeClustering
def ngrams_BPI_2015():
    """Run the n-gram clustering experiment on the BPI 2015 event log.

    Reads "BPI_2015.csv", builds one sparse feature matrix from the
    nominal-sequence, numeric-sequence and numeric columns, reduces it
    with truncated SVD (default 2 components, fixed seed), and evaluates
    GMM and k-means clusterings against the ground-truth 'cluster'
    column assuming 5 classes.
    """
    data = pd.read_csv("BPI_2015.csv", sep=',', encoding='latin-1',
                       na_values=['null'], keep_default_na=False)
    # Ground-truth cluster labels used for external validation.
    cluster = data['cluster'].values
    # Nominal list (sequence) attributes, vectorized as word n-grams.
    LAN = ['case_termName_seq', 'case_parts_seq', 'case_requestComplete_seq',
           'case_last_phase_seq', 'case_Responsible_actor_seq',
           'case_caseStatus_seq', 'action_code_seq', 'activityNameEN_seq',
           'concept_name_seq', 'question_seq', 'monitoringResource_seq',
           'org_resource_seq']
    # Nominal attributes (one-hot encoded); none for this configuration.
    AN = []
    # Numeric list (sequence) attributes, also vectorized as n-grams.
    LAV = ['delta', 'delta_p', 'delta_df', 'case_SUMleges_seq']
    # Plain numeric attributes, appended as-is.
    AV = ['duration', 'duration_df', 'duration_p']
    data = computeMatrix(data, LAN, AN, LAV, AV)
    # TruncatedSVD defaults to n_components=2; seed fixed for reproducibility.
    svd = TruncatedSVD(random_state=42)
    data = svd.fit_transform(data)
    clustering(data, cluster, 5)
def _ngram_counts(column):
    """Return a word 1–3-gram count matrix for a column of token strings."""
    vec = CountVectorizer(ngram_range=(1, 3),
                          analyzer="word",
                          tokenizer=None,
                          preprocessor=None,
                          stop_words=None,
                          # match single characters/digits too, not only 2+ chars
                          token_pattern=r"(?u)\b\w+\b")
    return vec.fit_transform(column)


def computeMatrix(data, lan, an, lav, av):
    """Build one sparse feature matrix from heterogeneous log columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Source event log, one row per case.
    lan : list of str
        Nominal sequence columns, vectorized as word n-grams.
    an : list of str
        Nominal columns, one-hot encoded (unknown categories ignored).
    lav : list of str
        Numeric sequence columns, also vectorized as word n-grams.
    av : list of str
        Plain numeric columns, appended as single dense columns.

    Returns
    -------
    scipy.sparse matrix
        All feature groups stacked horizontally (one row per case).
    """
    parts = []
    for col in lan:
        parts.append(_ngram_counts(data[col]))
    for col in av:
        parts.append(data[col].values.reshape(data.shape[0], 1))
    for col in lav:
        parts.append(_ngram_counts(data[col]))
    for col in an:
        enc = OneHotEncoder(handle_unknown='ignore')
        parts.append(enc.fit_transform(data[col].values.reshape(data.shape[0], 1)))
    return hstack(parts)
def _print_scores(true_labels, assigned):
    """Print the three external clustering-quality scores for one run."""
    print('Mutual_info_score =', mutual_info_score(true_labels, assigned))
    print('Adjusted_mutual_info_score =',
          adjusted_mutual_info_score(true_labels, assigned,
                                     average_method='min'))
    # Fixed label typo: was 'Adjusted_rand_scor'.
    print('Adjusted_rand_score =', adjusted_rand_score(true_labels, assigned))


def clustering(data, cluster, n_classes):
    """Cluster `data` with GMM and k-means and print quality scores.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Dense feature matrix (e.g. the SVD-reduced output).
    cluster : numpy.ndarray
        Ground-truth labels; assumed 1-based and shifted to 0-based here.
    n_classes : int
        Number of clusters/components to fit.
    """
    true_labels = cluster - 1  # shift 1-based ground truth to 0-based once
    print('\n------------------GMM\n')
    assigned = mixture.GaussianMixture(n_components=n_classes,
                                       covariance_type='tied').fit_predict(data)
    _print_scores(true_labels, assigned)
    print('\nK_MEANS')
    kclusterer = KMeansClusterer(num_means=n_classes,
                                 distance=nltk.cluster.util.cosine_distance)
    assigned = kclusterer.cluster(data, assign_clusters=True)
    _print_scores(true_labels, assigned)
ngrams_BPI_2015()