# datasets.py
# Forked from jonhehir/private-spectral-clustering.

import pathlib

import numpy as np
from scipy import sparse


def path_to_file(file):
    # Resolve `file` relative to this module's directory, so data files
    # load correctly regardless of the current working directory.
    return pathlib.Path(__file__).parent.absolute().joinpath(file)


def digit(s):
    """
    Parse `s` as an integer, returning None unless `s` is exactly the
    canonical string form of an int (e.g., "12" passes, "01" does not).
    """
    try:
        n = int(s)
    except ValueError:
        return None
    # Reject strings that don't round-trip exactly (e.g., "01", "+1", " 1")
    if str(n) != s:
        return None
    return n
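

# Illustration (the literals here are just examples, not from any dataset):
# `digit` acts as a strict integer parser, which is what lets read_adj
# below skip comment and header lines.
#
#   digit("12")   -> 12
#   digit("01")   -> None  (leading zero: not the canonical form)
#   digit("node") -> None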


def read_adj(file, n, first_node_index=1):
    """
    Read a whitespace-delimited edge list into an n x n adjacency matrix.
    Lines that aren't exactly two integers (e.g., comments or headers)
    are ignored.

    By default, assumes the first node index is 1. For 0-indexed files,
    set `first_node_index` accordingly.
    """
    A = sparse.lil_matrix((n, n))
    with open(path_to_file(file)) as f:
        for line in f:
            digits = [digit(s) for s in line.strip().split()]
            # only use valid lines: exactly two well-formed integers
            if len(digits) != 2 or any(d is None for d in digits):
                continue
            A[digits[0] - first_node_index, digits[1] - first_node_index] = 1
    return A
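

# Usage sketch (hypothetical file contents, not shipped with the repo):
# given an "edges.txt" containing
#
#     # source target
#     1 2
#     2 3
#
# read_adj("edges.txt", n=3) returns a 3x3 matrix with A[0, 1] = A[1, 2] = 1.
# The header line is skipped because digit("#") is None.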


def symmetrize(A):
    # Mirror every directed edge: wherever A[r, c] is nonzero, set A[c, r] = 1.
    # Mutates A in place (lil_matrix supports efficient element assignment).
    r, c = A.nonzero()
    A[c, r] = 1
    return A
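

# For example, if A holds a single directed edge A[0, 1] = 1, then
# symmetrize(A) also sets A[1, 0] = 1, making the graph undirected.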


def read_labels(file):
    """
    Read a file with one string label per line and
    return a list of label IDs (0-indexed)
    """
    lookup = {}
    labels = []
    with open(path_to_file(file)) as f:
        for line in f:
            line = line.strip()
            # skip blank lines
            if line == "":
                continue
            # assign an ID to this label if it doesn't exist yet
            if line not in lookup:
                lookup[line] = len(lookup)
            # look up the label's ID and append it
            labels.append(lookup[line])
    return labels
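

# For example, a labels file containing the lines "red", "blue", "red"
# yields [0, 1, 0]: IDs are assigned in order of first appearance.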


def exclude_nodes(A, labels, indices):
    """
    In the unlikely scenario that you want to completely remove
    some nodes from the network, use this handy function!
    """
    # This is a pretty inefficient implementation (densify, delete,
    # re-sparsify) and is really only intended for fairly small matrices.
    A = np.delete(A.toarray(), indices, axis=0)
    A = np.delete(A, indices, axis=1)
    labels = labels.copy()
    # Delete in descending order so that earlier deletions don't shift
    # the positions of indices not yet removed.
    for i in sorted(indices, reverse=True):
        del labels[i]
    return sparse.lil_matrix(A), labels
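

# Usage sketch: exclude_nodes(A, labels, [425, 426]) (as in political_blogs
# below) drops rows/columns 425 and 426 along with the matching labels.
# Deleting labels in descending index order is what keeps the second index
# valid after the first deletion.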


# The datasets:

def fb100(school):
    labels = read_labels(f"datasets/fb100/{school}-nodes.txt")
    A = symmetrize(read_adj(f"datasets/fb100/{school}-edges.txt", len(labels)))
    return A, labels


def hansell():
    labels = read_labels("datasets/hansell/nodes.txt")
    A = symmetrize(read_adj("datasets/hansell/edges.txt", 27))
    return A, labels


def house(congress):
    congress = int(congress)
    labels = read_labels(f"datasets/house/nodes-{congress:03}.tsv")
    A = symmetrize(read_adj(f"datasets/house/edges-{congress:03}.tsv", len(labels)))
    return A, labels


def political_blogs():
    labels = read_labels("datasets/political_blogs/blogs-orientation.txt")
    A = symmetrize(read_adj("datasets/political_blogs/blogs.txt", len(labels)))
    return exclude_nodes(A, labels, [425, 426])


def sampson():
    labels = read_labels("datasets/sampson/nodes.txt")
    # Note: unlike the other loaders, this one does not symmetrize;
    # the adjacency matrix is left directed.
    A = read_adj("datasets/sampson/edges.txt", len(labels))
    return A, labels


def senate(congress):
    congress = int(congress)
    labels = read_labels(f"datasets/senate/nodes-{congress:03}.tsv")
    A = symmetrize(read_adj(f"datasets/senate/edges-{congress:03}.tsv", len(labels)))
    return A, labels
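

if __name__ == "__main__":
    # Minimal smoke test: a sketch that assumes the datasets/ directory
    # from the original repository sits alongside this file.
    A, labels = hansell()
    print("hansell:", A.shape, "adjacency,", int(A.sum()), "nonzero entries,",
          len(set(labels)), "distinct labels")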