from __future__ import print_function
import json
import os
import requests
import gensim
import re
import string
import scipy.sparse as ss
import pandas as pd
import math
import rouge
import nltk
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import helpers
from corextopic import corextopic as ct
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
from itertools import repeat
from pprint import pprint
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
from google.cloud import storage
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from pythonrouge.pythonrouge import Pythonrouge
from math import ceil
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score
from word_cloud.word_cloud_generator import WordCloud
import warnings
warnings.filterwarnings('ignore')
# 1. Provide all info for this method (all_keys and all_dicts come from the return value of retdata.py)
def uniqueFromDocTopicMatrix(num_topics,doc_topic_matrix,start_time,end_time,all_keys,all_dicts):
documents_per_topic = []
for ind in range(num_topics):
documents_per_topic.append([])
key_index = 0
for row in doc_topic_matrix:
is_one_topic = False
topic_count = 0
for topic in row:
if topic == True:
topic_count+=1
#****CHANGE CONDITION TO INCLUDE DISCUSSIONS THAT APPEAR IN MULTIPLE TOPICS*****
if topic_count == 1:
for index,topic in np.ndenumerate(row):
if topic == True and isInTimeRange(all_keys[key_index],all_dicts,start_time,end_time):
documents_per_topic[index[0]].append(all_keys[key_index])
key_index+=1
return documents_per_topic
#documents_per_topic = uniqueFromDocTopicMatrix(8,topic_model.labels,1509494400,1539302400,all_keys,all_dicts)
def isInTimeRange(key,dicts,start_time,end_time):
for item in dicts:
if item['key'] == key:
if item['time'] >= start_time and item['time']<=end_time:
return True
else:
return False
return False #the key was not found in dicts
#This will categorize every discussion, regardless of topic, under the 5 sub-categories below
# 2. filtered_keys = output from uniqueFromDocTopicMatrix; dictionaries = return value of retdata.py
def devideIntoCategories(filterd_keys,dictionaries):
other = []
informative= []
bug = []
inforeq = []
infogive = []
suggestion = []
for filterd_key in filterd_keys:
for dictionary in dictionaries:
if filterd_key == dictionary['key']:
#score each sub-category by its annotated count; the 'other' count is down-weighted by 8
scoredsub = {'other':(int(dictionary['class']['other'])-8),'bug':int(dictionary['class']['bug'])
,'inforeq':int(dictionary['class']['inforeq']),'infogive':int(dictionary['class']['infogive']),
'suggestion':int(dictionary['class']['suggestion'])}
sorted_score = sorted(scoredsub.items(), key=lambda kv: kv[1],reverse=True)
if sorted_score[0][1] > 0:
if sorted_score[0][0] == "other":
other.append(dictionary['key'])
if sorted_score[0][0] == "bug":
bug.append(dictionary['key'])
if sorted_score[0][0] == "inforeq":
inforeq.append(dictionary['key'])
if sorted_score[0][0] == "infogive":
infogive.append(dictionary['key'])
if sorted_score[0][0] == "suggestion":
suggestion.append(dictionary['key'])
else:
other.append(dictionary['key'])
# if int(dictionary['class']['other'])>0:
# other.append(dictionary['key'])
# if int(dictionary['class']['bug'])>0:
# bug.append(dictionary['key'])
# if int(dictionary['class']['inforeq'])>0 or int(dictionary['class']['infogive'])>0 or int(dictionary['class']['suggestion'])>0:
# informative.append(dictionary['key'])
return {'other':other,'inforeq':inforeq,'bug':bug,'infogive':infogive,'suggestion':suggestion}
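#Hypothetical usage sketch (field names follow the retdata.py schema assumed above, not a confirmed example):
#documents_per_topic = uniqueFromDocTopicMatrix(8,topic_model.labels,1509494400,1539302400,all_keys,all_dicts)
#categorised = devideIntoCategories(documents_per_topic[0],all_dicts)
#categorised['bug'] then holds the keys whose highest (down-weighted) class count was 'bug';
#discussions with no positive class count fall under categorised['other'].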
# 3. The output from step 2 is a 5-item dict; pass any one of its lists to the following as ids
#dictionaries = return value of retdata.py
def getTitleSelftextVectors(ids,dictionaries,w2vModel):
id_vector_map =[]
for ide in ids:
for dic in dictionaries:
if ide == dic['key']:
titleandselftext = dic['title'] + ' . ' + dic['selftext']
titleandselftext = re.sub(r'&\S+','',titleandselftext)
titleandselftext = helpers.clean(titleandselftext,1)
vector=helpers.avg_sentence(titleandselftext.split(),w2vModel.wv)
if np.any(vector): #skip all-zero vectors
id_vector_map.append({'key':dic['key'],'vec':vector,'titself':titleandselftext})
return id_vector_map
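#Hypothetical usage sketch, assuming a gensim Word2Vec model trained elsewhere in the pipeline
#(the model path below is an assumption, not a file shipped with this repo):
#w2vModel = Word2Vec.load('word2vec.model')
#bug_vectors = getTitleSelftextVectors(categorised['bug'],all_dicts,w2vModel)
#each entry maps a discussion key to the averaged word vector of its cleaned title + selftext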
# helper for method below
def removeBelowMedian(subdiscussions,topic,topic_model,w2vModel):
topic_sent = ""
topic_prob_tuple = topic_model.get_topics(topic=topic, n_words=25)
for word,prob in topic_prob_tuple:
if prob > 0:
topic_sent += word + " "
topic_sent=topic_sent.strip()
topic_sent_vec = helpers.avg_sentence(topic_sent.split(),w2vModel.wv)
scored_items = []
for item in subdiscussions:
sim = helpers.cosine_sim(topic_sent_vec,item['vec'])
scored_items.append({'key':item['key'],'vec':item['vec'],'sim':sim,'titself':item['titself']})
scored_items.sort(key=lambda item:item['sim'], reverse=False)
is_even = False
if len(scored_items) % 2 == 0:
is_even = True
#median of the similarity scores (average of the two middle values for an even count)
median = 0
if is_even == True:
bottom = int(len(scored_items)/2) - 1
top = bottom + 1
median = (scored_items[bottom]['sim'] + scored_items[top]['sim'])/2
else:
middle = int(len(scored_items)/2)
median = scored_items[middle]['sim']
filtered_items = []
for item in scored_items:
if item['sim'] >= median:
filtered_items.append(item)
filtered_items.sort(key=lambda item:item['sim'], reverse=True)
return filtered_items
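#Hypothetical usage sketch: keep only the discussions whose similarity to the topic's top-25-word
#"sentence" is at or above the median (the topic index and the models are assumptions carried over
#from the surrounding pipeline, not values defined in this file):
#kept = removeBelowMedian(bug_vectors,0,topic_model,w2vModel)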
# 4. The following method gets the discussions for a certain topic under a sub-category such as bug, etc.
# subdiscussions = output from step 3, topic = any topic from the topic list, sentnum = user input
#most-similar-after-clustering approach
'''
The logic for discussion selection is as follows. The discussions are first scored considering only the
title + selftext sentence (hence the assumption in the diagram), using cosine similarity against the
synthetic sentence generated with the W2V + TM approach. Discussions whose scores fall below the median
are then discarded. The remaining discussions are clustered into a number of clusters determined by a
silhouette analysis, and the requested number of discussions is drawn from those clusters. This keeps
the cut-off-score procedure simple, while the clustering reduces redundancy.
'''
def getClusterSim(subdiscussions,topic,sentnum,topic_model,w2wmodel):
if len(subdiscussions) / 2 >= sentnum * 2:
median_removed_list = removeBelowMedian(subdiscussions,topic,topic_model,w2wmodel)
if len(median_removed_list) > sentnum:
clusterlist = getClusters(median_removed_list)
finallist = chooseDiscussions(clusterlist,sentnum)
return finallist
else:
median_removed_list.sort(key=lambda item:item['sim'], reverse=True)
return median_removed_list
elif len(subdiscussions) <= sentnum:
return subdiscussions
else:
clusterlist = getClusters(subdiscussions)
finallist = chooseDiscussions(clusterlist,sentnum)
return finallist
#chosendiscussions = getClusterSim(testing,0,20,topic_model,w2vModel)
# helper for getClusterSim
def getClusters(discussions):
vectorlist = []
ids = []
stringlist = []
clusters =[]
for vector in discussions:
vectorlist.append(vector['vec'])
ids.append(vector['key'])
stringlist.append(vector['titself'])
n_clusters = helpers.determineClusters(vectorlist, len(vectorlist))
kmeans = KMeans(n_clusters=n_clusters)
kmeans = kmeans.fit(vectorlist)
for i in range(n_clusters):
clusters.append(np.where(i == kmeans.labels_)[0])
clusterlist = []
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, vectorlist)
#Get the element closest to each cluster center and score the cluster members by similarity to it
for i in range(n_clusters):
currentcluster = clusters[i]
appendcluster = []
clstidx = closest[i]
clstelement = vectorlist[clstidx]
for element in currentcluster:
sim = helpers.cosine_sim(clstelement,vectorlist[element])
if not math.isnan(sim):
appendcluster.append({'key':ids[int(element)],'sim':sim,'txt':stringlist[int(element)],
'vec':vectorlist[int(element)]})
appendcluster.sort(key=lambda item:item['sim'], reverse=True)
clusterlist.append(appendcluster)
return clusterlist
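#helpers.determineClusters is assumed to pick the cluster count via a silhouette analysis;
#a minimal sketch of that idea (not the actual helper) could look like:
#def pickClusterCount(vectors,max_k):
#    best_k,best_score = 2,-1.0
#    for k in range(2,min(max_k,len(vectors))):
#        labels = KMeans(n_clusters=k).fit_predict(vectors)
#        score = silhouette_score(vectors,labels)
#        if score > best_score:
#            best_k,best_score = k,score
#    return best_k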
def chooseDiscussions(clusters,num):
finaldisc = []
filterdclusters = clusters
if len(filterdclusters) == num:
for cluster in filterdclusters:
finaldisc.append(cluster[0])
elif len(filterdclusters) > num:
alldiscussions = []
for cluster in filterdclusters:
alldiscussions.append(cluster[0])
alldiscussions.sort(key=lambda item:item['sim'], reverse=True)
finaldisc = alldiscussions[:num]
else:
allitems = 0
for cluster in filterdclusters:
allitems+=len(cluster)
#print('all items ' + str(allitems))
if allitems <= num:
for cluster in filterdclusters:
for element in cluster:
finaldisc.append(element)
#now we try to fill num discussions from the clusters; the else branch below takes an equal share per cluster and fills the remainder from the leftovers
#Note the issues that might arise from sorting
else:
items_per_cluster = int(num/len(filterdclusters))
lowest_number = items_per_cluster
for cluster in filterdclusters:
if len(cluster) < lowest_number:
lowest_number = len(cluster)
if lowest_number == items_per_cluster:
#print(lowest_number)
for cluster in filterdclusters:
items_to_append = cluster[:lowest_number]
for item in items_to_append:
finaldisc.append(item)
else:
remaining_discussions = []
for cluster in filterdclusters:
#take an equal share from the front of each cluster, then pool the leftover items
for item in cluster[:lowest_number]:
finaldisc.append(item)
if len(cluster)>lowest_number:
for item in cluster[lowest_number:]:
remaining_discussions.append(item)
remaining_item_count = num - len(finaldisc)
remaining_discussions.sort(key=lambda item:item['sim'], reverse=True)
items_to_append = remaining_discussions[:remaining_item_count]
for item in items_to_append:
finaldisc.append(item)
#print('final len : '+ str(len(finaldisc)))
return finaldisc
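#Hypothetical usage sketch: pick 20 representative discussions from the clusters built above
#(variable names are assumptions carried over from the earlier sketches):
#clusterlist = getClusters(kept)
#top20 = chooseDiscussions(clusterlist,20)
#when there are fewer clusters than requested discussions, the remainder is filled from the
#per-cluster leftovers sorted by similarity, as implemented above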
def get_comment_summary(comments,title,topic_vector,w2wmodel):
comments = helpers.clean(comments,2)
title= helpers.clean(title,1)
title_vector = helpers.avg_sentence(title.strip().split(),w2wmodel.wv)
sentences = comments.split('*')
filteredsentences = []
filteredsentences = [element for element in sentences if len(element) > 0 and element.isspace()==False and element!='']
filteredsentences = [element for element in filteredsentences if len(element) > 10]
scoredsentences = []
summary =''
for sentence in filteredsentences:
vector = helpers.avg_sentence(sentence.strip().split(),w2wmodel.wv)
titlesim = helpers.cosine_sim(title_vector,vector)
scoredsentences.append({'text':sentence,'score':titlesim})
scoredsentences.sort(key=lambda item:item['score'],reverse=True)
#take the 5 highest-scoring sentences from the comments
scoredsentences = scoredsentences[:5]
for sentence in scoredsentences:
#print(sentence)
summary += sentence['text'] + '.'
return summary
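#Hypothetical usage sketch (the field names follow the retdata.py schema assumed throughout this file):
#comment_summary = get_comment_summary(item['comments'],item['title']+'.'+item['selftext'],
#                                      topic_sent_vec,w2vModel)
#returns the 5 comment sentences most similar to the title/selftext vector, joined with '.'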
'''
The summaries are created for each discussion using one sentence from the title,
a number of sentences from the selftext section determined by a silhouette analysis,
and 5 sentences from the comments. Note that the comments are not clustered, since from the
observations made, clustering is not effective for comments. The selftext sections are
clustered, however, as they do in fact contain sentences that are relevant, unlike some
sentences from the comments section.
'''
def create_title_summaries(selected,dicts,topic,topic_model,w2wmodel):
topic_sent = ""
topic_prob_tuple = topic_model.get_topics(topic=topic, n_words=25)
for word,prob in topic_prob_tuple:
if prob > 0:
topic_sent += word + " "
topic_sent=topic_sent.strip()
topic_sent_vec = helpers.avg_sentence(topic_sent.split(),w2wmodel.wv)
summary = []
clusters =[]
for selection in selected:
for item in dicts:
if selection['key'] == item['key']:
#print('--'*40)
concatstring = item['selftext']
concatstring = helpers.clean(concatstring,tpe=2)
sentences = concatstring.split('*')
filteredsentences = []
filteredsentences = [element for element in sentences if len(element) > 0 and element.isspace()==False and
element!='']
#filteredsentences = [element for element in filteredsentences if len(element) > 20]
scoredvectors = []
scoredsentences = []
for sentence in filteredsentences:
vector = helpers.avg_sentence(sentence.strip().split(),w2wmodel.wv)
scoredsentences.append({'sentence':sentence,'vec':vector})
scoredvectors.append(vector)
n_clusters = helpers.determineClusters(scoredvectors,len(scoredvectors))
kmeans = KMeans(n_clusters=n_clusters)
if len(scoredvectors) >= n_clusters:
kmeans.fit(scoredvectors)
for i in range(n_clusters):
clusters.append(np.where(i == kmeans.labels_)[0])
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, scoredvectors)
indexes = []
for i in range(n_clusters):
indexes.append(closest[i])
titlesum = ""
indexes.sort()
for idx in indexes:
titlesum = titlesum +"."+ scoredsentences[idx]['sentence']
titlesentences = helpers.clean(item['title'],tpe=2).split('*')
fintitlesent = ''
fintitlescore = -3
if len(titlesentences) > 1:
for titlesentence in titlesentences:
if titlesentence != '' and titlesentence.isspace() == False:
vector = helpers.avg_sentence(titlesentence.strip().split(),w2wmodel.wv)
score = helpers.cosine_sim(topic_sent_vec,vector)
if score > fintitlescore:
fintitlescore = score
fintitlesent = titlesentence
else:
fintitlesent = titlesentences[0]
titlesum = fintitlesent + '.' + titlesum
summarizedcomments = get_comment_summary(comments=item['comments'],title=item['title']+'.'+
item['selftext'],topic_vector=topic_sent_vec,w2wmodel=w2wmodel)
summary.append({'header':titlesum,'content':summarizedcomments,'key':item['key']})
else:
titlesum = ""
titlesentences = helpers.clean(item['title'],tpe=2).split('*')
fintitlesent = ''
fintitlescore = -3
if len(titlesentences) > 1:
for titlesentence in titlesentences:
if titlesentence != '' and titlesentence.isspace() == False:
vector = helpers.avg_sentence(titlesentence.strip().split(),w2wmodel.wv)
score = helpers.cosine_sim(topic_sent_vec,vector)
if score > fintitlescore:
fintitlescore = score
fintitlesent = titlesentence
else:
fintitlesent = titlesentences[0]
titlesum = fintitlesent + '.' + titlesum
for sent in scoredsentences:
titlesum = titlesum + " . " + sent['sentence']
summarizedcomments = get_comment_summary(comments=item['comments'],title=item['title']+'.'+item['selftext'],topic_vector=topic_sent_vec,w2wmodel=w2wmodel)
summary.append({'header':titlesum,'content':summarizedcomments,'key':item['key']})
return summary
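#End-to-end sketch of the intended pipeline (all model and data names are assumptions,
#not values defined in this file):
#docs_per_topic = uniqueFromDocTopicMatrix(8,topic_model.labels,1509494400,1539302400,all_keys,all_dicts)
#categorised = devideIntoCategories(docs_per_topic[0],all_dicts)
#vectors = getTitleSelftextVectors(categorised['bug'],all_dicts,w2vModel)
#selected = getClusterSim(vectors,0,10,topic_model,w2vModel)
#summaries = create_title_summaries(selected,all_dicts,0,topic_model,w2vModel)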