import string
import re
import itertools
import copy
import igraph
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag  # requires nltk 3.2.1

# punctuation to strip (keeping intra-word dashes)
punct = string.punctuation.replace('-', '')
# English stopword list (requires the NLTK 'stopwords' corpus; see setup sketch below)
stpwds = stopwords.words('english')
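# One-time setup sketch: the stopword list above and pos_tag below depend on
# NLTK data packages. The package names here are the standard NLTK ones, but
# verify them against your NLTK version:
#
#   import nltk
#   nltk.download('stopwords')
#   nltk.download('averaged_perceptron_tagger')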
def clean_text_simple(text, remove_stopwords=True, pos_filtering=True, stemming=True):
    # convert to lower case
    text = text.lower()
    # remove punctuation (preserving intra-word dashes)
    text = ''.join(l for l in text if l not in punct)
    # remove non-letters (keeping the dashes preserved above)
    text = re.sub('[^a-z-]', ' ', text)
    # tokenize (split on whitespace; this also strips leading, trailing
    # and repeated whitespace)
    tokens = text.split()
    if pos_filtering:
        # apply POS-tagging
        tagged_tokens = pos_tag(tokens)
        # retain only nouns and adjectives
        tokens = [token for token, tag in tagged_tokens
                  if tag in ('NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR')]
    if remove_stopwords:
        # remove stopwords
        tokens = [token for token in tokens if token not in stpwds]
    if stemming:
        # apply Porter's stemmer
        stemmer = nltk.stem.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    return tokens
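# Usage sketch for clean_text_simple; the sentence is an illustrative
# assumption, and the exact output depends on the tagger and stemmer:
#
#   tokens = clean_text_simple('The quick brown foxes were jumping over the lazy dogs')
#   # keeps nouns/adjectives, drops stopwords, then stems,
#   # giving something like ['quick', 'brown', 'fox', 'lazi', 'dog']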
def terms_to_graph(terms, w):
    # This function returns a directed, weighted igraph from a list of terms
    # (the tokens from the pre-processed text), e.g., ['quick','brown','fox'].
    # Edges are weighted based on term co-occurrence within a sliding window
    # of fixed size 'w'.
    from_to = {}
    # create initial complete graph (first w terms)
    terms_temp = terms[0:w]
    indexes = list(itertools.combinations(range(w), r=2))
    new_edges = []
    for my_tuple in indexes:
        new_edges.append(tuple([terms_temp[i] for i in my_tuple]))
    for new_edge in new_edges:
        if new_edge in from_to:
            from_to[new_edge] += 1
        else:
            from_to[new_edge] = 1
    # then iterate over the remaining terms
    for i in range(w, len(terms)):
        # term to consider
        considered_term = terms[i]
        # all terms within the sliding window
        terms_temp = terms[(i - w + 1):(i + 1)]
        # edges to try
        candidate_edges = []
        for p in range(w - 1):
            candidate_edges.append((terms_temp[p], considered_term))
        for try_edge in candidate_edges:
            # if not a self-edge
            if try_edge[1] != try_edge[0]:
                # if the edge has already been seen, update its weight
                if try_edge in from_to:
                    from_to[try_edge] += 1
                # if the edge has never been seen, create it with unit weight
                else:
                    from_to[try_edge] = 1
    # create empty graph
    g = igraph.Graph(directed=True)
    # add vertices
    g.add_vertices(sorted(set(terms)))
    # add edges, direction is preserved since the graph is directed
    g.add_edges(list(from_to.keys()))
    # set edge and vertex weights
    g.es['weight'] = list(from_to.values())  # co-occurrence counts within the sliding window
    g.vs['weight'] = g.strength(weights=list(from_to.values()))  # weighted degree
    return g
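# Usage sketch for terms_to_graph; the token list and window size w=4 are
# illustrative. Note that the function assumes len(terms) >= w:
#
#   g = terms_to_graph(['quick', 'brown', 'fox', 'lazi', 'dog'], w=4)
#   print(g.summary())      # directed graph, one vertex per unique term
#   print(g.es['weight'])   # co-occurrence counts within the window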
def unweighted_k_core(g):
    # work on a clone of g to preserve g
    gg = copy.deepcopy(g)
    # initialize the dictionary that will contain the core numbers
    cores_g = dict(zip(gg.vs['name'], [0] * len(gg.vs)))
    i = 0
    # while there are vertices remaining in the graph
    while len(gg.vs) > 0:
        # while there is a vertex with degree less than or equal to i
        while [deg for deg in gg.strength() if deg <= i]:
            index = [ind for ind, deg in enumerate(gg.strength()) if deg <= i][0]
            # assign i as the core number of that vertex
            cores_g[gg.vs[index]['name']] = i
            gg.delete_vertices(index)
        i += 1
    return cores_g
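# Usage sketch for unweighted_k_core, building on the graph g from
# terms_to_graph above. In graph-of-words keyword extraction, keywords are
# often taken to be the vertices of the main core, i.e. those with the
# maximal core number:
#
#   core_numbers = unweighted_k_core(g)
#   max_c = max(core_numbers.values())
#   main_core = [term for term, c in core_numbers.items() if c == max_c]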
def accuracy_metrics(candidate, truth):
    # NB: assumes both 'candidate' and 'truth' are non-empty
    # true positives ('hits') are both in candidate and in truth
    tp = len(set(candidate).intersection(truth))
    # false positives ('false alarms') are in candidate but not in truth
    fp = len([element for element in candidate if element not in truth])
    # false negatives ('misses') are in truth but not in candidate
    fn = len([element for element in truth if element not in candidate])
    # precision
    prec = round(float(tp) / (tp + fp), 5)
    # recall
    rec = round(float(tp) / (tp + fn), 5)
    # F1 score (0 when precision and recall are both 0)
    if prec + rec != 0:
        f1 = round(2 * float(prec * rec) / (prec + rec), 5)
    else:
        f1 = 0
    return (prec, rec, f1)
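# Usage sketch for accuracy_metrics with toy keyword lists (illustrative only):
#
#   prec, rec, f1 = accuracy_metrics(['fox', 'dog', 'cat'], ['fox', 'dog'])
#   # tp=2, fp=1, fn=0 -> (0.66667, 1.0, 0.8)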