This repository has been archived by the owner on Jul 19, 2022. It is now read-only.
forked from alpecli/predlig
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sample.py
336 lines (286 loc) · 12.6 KB
/
sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
import networkx as nx
from networkx.algorithms import bipartite
import random
import numpy as np
import sys
import sklearn
import features
import spvlinkprediction as slp
from wrapper import *
from evolutionary import *
from hypothesis_test import *
class CDConstructor:
"""
A Classification Dataset Constructor.
This class extracts a sample of pairs of vertices from the graph that represents the original data, according to the defined parameters.
It can be used for both future and missing link prediction tasks, represented as 0 or 1 in the 'task' variable, respectively.
"""
def __init__(self, dataset, sample_size, sample_proportions, task = "missing_link_prediction", future_dataset = None, k_fold = 10):
self.dataset = dataset
self.future_dataset = future_dataset
self.sample_size = sample_size
self.positive_examples_size = sample_proportions[0]
self.negative_examples_size = sample_proportions[1]
self.edges = set()
self.positive_examples = set()
self.negative_examples = set()
self.original_graph = None
self.graph_training = None
self.graph_test = None
self.task = task
self.attributes_list = {}
self.ordered_attributes_list = []
self.k_fold = k_fold
self.sample_dataset = None
self.classification_dataset = None
self.folds_indexes = {}
def set_attributes_list(self, attributes_list):
"""
Stores the attributes list defined by the user in the attributes_list attribute
of the class. It also keeps stored the sorted attribute list in another variable,
in order to keep the same order of the attributes in the classification dataset.
"""
self.attributes_list = attributes_list
self.ordered_attributes_list = sorted(self.attributes_list.keys())
def get_future_link_graph(self):
"""
Returns the graph based on the future links dataset. It is used in the
future link prediction task.
"""
Graph = nx.Graph()
for line in open(self.future_dataset).readlines():
line = line.strip()
col = line.split(';')
node1_id = int(col[0])
node2_id = int(col[1])
Graph.add_edge(node1_id, node2_id)
return Graph
def set_test_graph(self, graph_test):
"""
Defines the graph used for testing.
"""
self.graph_test = graph_test
def is_subpath(self, path, first_node, second_node):
"""
Returns if a pair of nodes is part of a path.
"""
return ",".join(map(str, [first_node, second_node])) in ",".join(map(str, path)) or ",".join(map(str, [second_node, first_node])) in ",".join(map(str, path))
def nodes_in_path(self, paths, first_node, second_node):
"""
Verifies if a pair of nodes is part of some set of paths
"""
for path in paths:
if self.is_subpath(path, first_node, second_node) or self.is_subpath(path, second_node, first_node):
return True
return False
def get_pair_of_neighbors_nodes(self, Graph):
"""
Returns a random pair of neighbors from the graph.
"""
return random.sample(Graph.edges(), 1)[0]
def get_total_edges(self):
"""
Returns the total edges in the collected sample.
"""
return self.positive_examples.union(self.negative_examples)
def get_missing_link_positive_pair_of_nodes(self, paths):
"""
Returns a positive class pair of nodes for the missing link prediction task.
"""
while True:
first_node, second_node = self.get_pair_of_neighbors_nodes(self.graph_training)
if self.nodes_in_path(paths, first_node, second_node) or (first_node == second_node) or(first_node, second_node) in self.positive_examples:
continue
self.graph_training.remove_edge(first_node, second_node)
if not nx.has_path(self.graph_training, first_node, second_node):
self.graph_training.add_edge(first_node, second_node)
continue
return (first_node, second_node)
def get_future_link_positive_pair_of_nodes(self, paths):
"""
Returns a positive class pair of nodes for the future link prediction task.
"""
while True:
first_node, second_node = random.sample(self.graph_test.edges(), 1)[0]
if self.graph_training.has_edge(first_node, second_node) or (first_node == second_node) or (first_node, second_node) in self.positive_examples or not self.graph_training.has_node(first_node) or not self.graph_training.has_node(second_node) or not nx.has_path(self.graph_training, first_node, second_node):
continue
return (first_node, second_node)
def get_positive_examples(self):
"""
Extracts the pairs of nodes of the positive class from the graph. Those pairs of nodes have link(s) between them in the missing
link prediction, or have link(s) in the future period of time in the future link prediction.
"""
paths = set()
get_pair_of_nodes = self.get_future_link_positive_pair_of_nodes if self.task == "future_link_prediction" else self.get_missing_link_positive_pair_of_nodes
while len(self.positive_examples) < (self.sample_size * self.positive_examples_size):
edge = get_pair_of_nodes(paths)
if edge not in self.positive_examples:
paths.add(tuple(nx.shortest_path(self.graph_training, edge[0], edge[1])))
self.positive_examples.add(edge)
def get_missing_link_negative_pair_of_nodes(self):
"""
Returns a negative class pair of nodes for the missing link prediction task.
"""
while True:
first_node, second_node = random.sample(self.graph_training.nodes(), 2)
if not self.graph_training.has_edge(first_node, second_node) and not ((first_node, second_node) in self.negative_examples) and not (first_node == second_node) and nx.has_path(self.graph_training, first_node, second_node):
return (first_node, second_node)
def get_future_link_negative_pair_of_nodes(self):
"""
Returns a negative class pair of nodes for the future link prediction task.
"""
while True:
first_node, second_node = self.get_missing_link_negative_pair_of_nodes()
if not self.graph_test.has_edge(first_node, second_node):
return (first_node, second_node)
def get_negative_examples(self):
"""
Extracts the pairs of nodes of the negative class from the graph. Those pairs of nodes have no link between them in the missing
link prediction, or have no link in the future period of time in the future link prediction.
"""
get_pair_of_nodes = self.get_future_link_negative_pair_of_nodes if self.task == "future_link_prediction" else self.get_missing_link_negative_pair_of_nodes
while len(self.negative_examples) < (self.sample_size * self.negative_examples_size):
self.negative_examples.add(get_pair_of_nodes())
def get_sample(self):
if self.sample_dataset == None:
self.sample_dataset = self.extract_sample()
return self.sample_dataset
def set_sample(self, sample_file):
self.sample_dataset = []
for line in open(sample_file).readlines():
self.sample_dataset.append([int(field) for field in line.split(';')])
self.set_graphs()
def save_sample(self, sample_file):
writer = open(sample_file, 'w')
for line in self.sample_dataset:
writer.write("%s\n" %(";".join([str(column) for column in line])))
writer.close()
def set_graphs(self):
self.original_graph = nx.Graph()
dataset = open(self.dataset)
for line in dataset.readlines():
line = line.strip()
col = line.split(';')
iid = "p%s" %(int(col[0]))
self.original_graph.add_node(iid, {'bipartite': 0})
pid = int(col[1])
self.original_graph.add_node(pid, {'bipartite': 1})
self.original_graph.add_edge(iid, pid)
dataset.close()
nodes = set(n for n,d in self.original_graph.nodes(data=True) if d['bipartite'] == 1)
self.graph_training = nx.Graph()
for node in nodes:
for intermediate_node in list(nx.all_neighbors(self.original_graph, node)):
for second_node in list(set(nx.all_neighbors(self.original_graph, intermediate_node)) - set([node])):
self.graph_training.add_edge(node, second_node)
self.graph_test = self.graph_training.copy() if self.task == "missing_link_prediction" else self.get_future_link_graph()
edges = set()
if (self.task == "missing_link_prediction") and (self.sample_dataset != None):
for line in [line for line in self.sample_dataset if line[2] == 1]:
node_1, node_2 = int(line[0]), int(line[1])
try:
self.graph_training.remove_edge(node_1, node_2)
except:
print node_1, node_2
pass
def extract_sample(self):
"""
Extracts the whole sample from the graph representing the original dataset.
"""
self.set_graphs()
self.get_positive_examples()
self.get_negative_examples()
sample_dataset = []
line = 0
fold_half_size = (self.sample_size/self.k_fold)/2
for edge in self.positive_examples:
fold = int(line/fold_half_size)
sample_dataset.append([edge[0], edge[1], 1, fold])
line += 1
self.folds_indexes.setdefault(fold, set())
self.folds_indexes[fold].add(line)
line = 0
for edge in self.negative_examples:
fold = int(line/fold_half_size)
sample_dataset.append([edge[0], edge[1], 0, fold])
self.folds_indexes[fold].add(len(self.positive_examples) + line)
line += 1
return sample_dataset
def get_fold_list(self):
train_test_folds = []
for fold, lines in self.folds_indexes.items():
train_test_folds.append((list(set(range(self.sample_size)) - lines), list(lines)))
return train_test_folds
def normalize_attributes(self):
"""
Normalizes the values of the attributes by extracting the mean of the attribute and dividing the result by the standard deviation.
"""
attributes_number = self.classification_dataset.shape[1] - 2
examples = range(self.classification_dataset.shape[0])
for attribute in range(attributes_number):
std = np.std(self.classification_dataset[[examples],[attribute]])
if std == 0:
self.classification_dataset[[examples],[attribute]] = 0
else:
self.classification_dataset[[examples],[attribute]] = (self.classification_dataset[[examples],[attribute]] - self.classification_dataset[[examples],[attribute]].mean())/np.std(self.classification_dataset[[examples],[attribute]])
def get_classification_dataset(self):
if self.classification_dataset == None:
self.classification_dataset = self.calculate_classification_dataset()
return self.classification_dataset
def set_classification_dataset(self, attributes_file, dataset_file):
attributes_lines = open(attributes_file).readlines()
for line in attributes_lines:
fields = line.strip().split(';')
attribute_name = fields[0]
parameters = {}
if fields[1] != "":
for parameter in fields[1:]:
parameter_name, parameter_value = parameter.split(':')
parameters[parameter_name] = float(parameter_value)
self.attributes_list[attribute_name] = parameters
self.ordered_attributes_list.append(attribute_name)
dataset_lines = open(dataset_file).readlines()
self.classification_dataset = np.zeros((len(dataset_lines), len(attributes_lines) + 2))
counter = 0
self.folds_indexes = {}
for line in open(dataset_file).readlines():
self.classification_dataset[counter] = [float(field) for field in line.split(';')]
fold = self.classification_dataset[counter][-1]
self.folds_indexes.setdefault(fold, set())
self.folds_indexes[fold].add(counter)
counter += 1
def calculate_classification_dataset(self):
"""
Calculates the attributes for each example of the sample, and returns it as a matrix ready for applying the classification
algorithms, in order to perform the link prediction.
"""
attributes_calculator = features.FeatureConstructor(self.graph_training)
if self.attributes_list == {}:
self.ordered_attributes_list = sorted(attributes_calculator.attributes_map.keys())
for attribute in self.ordered_attributes_list:
self.attributes_list[attribute] = {}
classification_dataset = np.zeros((self.sample_size, len(self.attributes_list) + 2))
line_count = 0
for line in self.sample_dataset:
first_node, second_node, pair_class, pair_fold = line
attributes_calculator.set_nodes(first_node, second_node)
column = 0
for function in self.ordered_attributes_list:
parameters = self.attributes_list[function]
classification_dataset[line_count][column] = attributes_calculator.attributes_map[function](**parameters)
column += 1
classification_dataset[line_count][-2] = pair_class
classification_dataset[line_count][-1] = pair_fold
line_count += 1
return classification_dataset
def save_classification_dataset(self, dataset_file):
writer = open(dataset_file + "_attributes", "w")
for attribute in self.ordered_attributes_list:
line = attribute + ";"
line += ";".join(["%s:%s" %(parameter_name, parameter_value) for parameter_name, parameter_value in self.attributes_list[attribute]])
writer.write(line + "\n")
writer.close()
writer = open(dataset_file, 'w')
for line in self.classification_dataset:
writer.write(";".join([str(field) for field in line]) + "\n")
writer.close()