# load_datasets.py
from scipy.sparse import csr_matrix, hstack, SparseEfficiencyWarning, diags
import numpy as np
import sklearn.preprocessing
import warnings
warnings.simplefilter('ignore', SparseEfficiencyWarning)


def load_dataset(dataset, attr=['attr'], labels='labels'):
    '''Dispatch the loading of a dataset given its (upper-case) name'''
    if dataset in ['CORA-ORIG', 'CITESEER-ORIG', 'PUBMED-ORIG']:
        X, y_target, adj = load_dataset_npz(dataset.lower(), attr, labels)
    elif dataset in ['WIKIVITALS_NEW']:
        X, y_target, adj = load_dataset_npz(dataset.lower(), attr=['attr'], labels='labels_1')
    else:
        raise ValueError(f'Unknown dataset: {dataset}')
    return X, y_target, adj
# # Method to transform the data for Cora, Citeseer, and Pubmed into an .npz file
# # (kept commented out: it requires torch, scipy's coo_matrix, and Kipf's load_data,
# # which are not imported in this module)
# def transform_original_data_to_npz(dataset_str):
#     '''Transform the original data for Cora, Citeseer, and Pubmed into an .npz file
#     The file is created in directory: './datasets/' + dataset_str + '-orig/'
#     Parameters
#     ----------
#     dataset_str: str
#         Name of the dataset (may be 'cora', 'citeseer', or 'pubmed')
#     Returns
#     -------
#     None
#     '''
#     # Using Kipf's method to get the Cora, Citeseer, and Pubmed data (with the canonical split from Planetoid)
#     adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, labels_ = load_data(dataset_str)
#     d_npz = {}
#     # Transform to npz
#     print(f"check sizes {train_mask.shape} {val_mask.shape} {test_mask.shape}")
#     # adj.tocsr()
#     print(len(adj.data))
#     d_npz['adj_data'] = adj.data
#     d_npz['adj_indices'] = adj.indices
#     d_npz['adj_indptr'] = adj.indptr
#     d_npz['adj_shape'] = adj.shape
#     print(type(features))
#     features = features.tocsr()
#     print(type(features))
#     d_npz['attr_data'] = features.data
#     d_npz['attr_indices'] = features.indices
#     d_npz['attr_indptr'] = features.indptr
#     d_npz['attr_shape'] = features.shape
#     # print(f'Length of features: {len(features.data)}')
#     y_train_coo = coo_matrix(y_train)
#     y_val_coo = coo_matrix(y_val)
#     y_test_coo = coo_matrix(y_test)
#     labels = torch.zeros(adj.shape[0], dtype=int)
#     labels_ = coo_matrix(labels_)
#     values = labels_.data
#     indices = np.vstack((labels_.row, labels_.col))
#     i = torch.LongTensor(indices)
#     v = torch.FloatTensor(values)
#     shape = labels_.shape
#     Y = torch.sparse.LongTensor(i, v, torch.Size(shape)).to_dense()
#     labels = torch.argmax(Y, dim=1)
#     print(type(labels_))
#     print(labels_.shape)
#     d_npz['labels'] = labels
#     # Train, val, test sets
#     d_npz['canonical_train_set'] = torch.arange(adj.shape[0])[train_mask]
#     d_npz['canonical_val_set'] = torch.arange(adj.shape[0])[val_mask]
#     d_npz['canonical_test_set'] = torch.arange(adj.shape[0])[test_mask]
#     np.savez_compressed('./datasets/' + dataset_str + '-orig/' + dataset_str + '-orig.npz', **d_npz)
#     return None
# Structure of .npz files:
# dataset = {
#     'adj_data'     -> adjacency: list of ones (defines the edges)
#     'adj_indices'  -> adjacency: list of indices for the adjacency matrix
#     'adj_indptr'   -> adjacency: list of index pointers for the adjacency matrix
#     'adj_shape'    -> adjacency: shape of the adjacency matrix
#
#     'FEATURE_data'     -> features: list of weights of the FEATUREs
#     'FEATURE_indices'  -> features: list of indices for the FEATURE matrix
#     'FEATURE_indptr'   -> features: list of index pointers for the FEATURE matrix
#     'FEATURE_shape'    -> features: shape of the FEATURE matrix
#     Note: a list of features has to be provided (the default feature name is 'attr', generally when there is only one set of features)
#
#     'LABELS' -> labels: LABELS of each node
# }
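# For illustration, here is a minimal sketch (not part of the original pipeline) of
# how a dataset following the structure above could be written; the toy matrices
# and the output path are hypothetical placeholders.
def save_dataset_npz_example(path='./datasets/toy/toy.npz'):
    '''Write a toy dataset in the .npz layout documented above.'''
    adj = csr_matrix(np.array([[0, 1], [1, 0]]))           # 2 nodes, 1 undirected edge
    attr = csr_matrix(np.array([[1.0, 0.0], [0.0, 2.0]]))  # 2 nodes, 2 features
    d_npz = {
        'adj_data': adj.data, 'adj_indices': adj.indices,
        'adj_indptr': adj.indptr, 'adj_shape': adj.shape,
        'attr_data': attr.data, 'attr_indices': attr.indices,
        'attr_indptr': attr.indptr, 'attr_shape': attr.shape,
        'labels': np.array(['a', 'b']),                    # one label per node
    }
    np.savez_compressed(path, **d_npz)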
def load_dataset_npz(dataset_str, attr=['attr'], labels='labels'):
    '''Load a dataset stored in a .npz file
    Parameters
    ----------
    dataset_str: str
        Name of the dataset (may be 'cora-orig', 'citeseer-orig', 'pubmed-orig', or 'wikivitals_new')
    attr: list of str
        List of attribute matrices to concatenate. ['attr'] by default if there is only one type of attributes.
    labels: str
        Name of the labels to use if multiple labelings are available
    Returns
    -------
    X: csr_matrix
        Node features. Shape is number of nodes x number of features
    y_target: numpy array
        Target values (labels). Shape is number of nodes
    adj: csr_matrix
        Raw adjacency matrix. Shape is number of nodes x number of nodes
    '''
    file = './datasets/' + dataset_str + '/' + dataset_str + '.npz'
    dataset = np.load(file, allow_pickle=True)
    # Load the adjacency matrix
    adj = csr_matrix((dataset['adj_data'], dataset['adj_indices'], dataset['adj_indptr']), dataset['adj_shape'])
    # Load the features, concatenating them if several attribute matrices are requested
    X = []
    for a in attr:
        attr_ = csr_matrix((dataset[a + '_data'], dataset[a + '_indices'], dataset[a + '_indptr']), dataset[a + '_shape'])
        X.append(attr_)
    if len(X) == 1:
        X = X[0]
    else:
        X = hstack(X, format='csr')
    # Load the labels, encoded as integer indices into the array of unique values
    unique_values, y_target = np.unique(dataset[labels], return_inverse=True)
    print("----- DATA LOADED -----")
    print(f'#Edges: {len(adj.data)}')
    print(f'#Features: {X.shape[1]}')
    print(f'#Nodes: {X.shape[0]}')
    print(f'#Classes: {len(unique_values)}')
    return X, y_target, adj
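# Example call (assuming the corresponding .npz file exists under ./datasets/):
# >>> X, y_target, adj = load_dataset_npz('cora-orig')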
# -------------------------------
# Adjacency matrix transformation
# -------------------------------
# Assumption: all values in adj are 0 (no edge) or 1 (edge)
def to_symmetric_with_ones(adj, action_on_self_links=None):
    '''Transform a square matrix into a symmetric one (with self-links added or removed)
    Parameters
    ----------
    adj: csr_matrix
        The matrix to transform
    action_on_self_links: str
        'remove_self_links' to fill the diagonal with 0s,
        'add_self_links' to fill the diagonal with 1s
    Returns
    -------
    adj_sym: csr_matrix
        Symmetric matrix
    '''
    # To symmetric: stack each edge (i, j) with its mirror (j, i)
    rows, cols = adj.nonzero()
    rows_sym = np.hstack([rows, cols])
    cols_sym = np.hstack([cols, rows])
    n = len(cols_sym)
    adj_sym = csr_matrix((np.ones(n), (rows_sym, cols_sym)), shape=adj.shape)
    adj_sym.sum_duplicates()  # to merge duplicated pairs of indices
    adj_sym[adj_sym > 0] = 1  # the adjacency matrix has only 1s and 0s
    # Add self-links, remove them, or do nothing
    if action_on_self_links == 'add_self_links':
        adj_sym.setdiag(1)
    elif action_on_self_links == 'remove_self_links':
        adj_sym.setdiag(0)
    print('A_, the symmetric version of A, has been computed')
    return adj_sym
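# Quick illustrative check on a hypothetical toy matrix (not from the original file):
# a single directed edge becomes an undirected one, and self-links fill the diagonal.
# >>> A = csr_matrix(np.array([[0, 1], [0, 0]]))
# >>> to_symmetric_with_ones(A, 'add_self_links').toarray()
# array([[1., 1.],
#        [1., 1.]])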
def calculate_degrees(adj, power=1):
    """Calculate the degree matrix (raised to a given power)
    """
    tmp = np.squeeze(np.asarray(adj.sum(axis=1)))
    degrees = diags(tmp, format='csr')
    # Note: for negative powers, isolated nodes (degree 0) yield inf entries
    return degrees.power(power)
def transform_adjacency(adj, normalization_trick="None", to_symmetric=False, action_on_self_links=None):
    """Transform the adjacency matrix
    If normalization_trick is set to any value in ["normalize_D-1", "normalize_D-0.5"],
    it has priority over the other parameters.
    Otherwise this method transforms (by default) the adjacency matrix into a symmetric
    adjacency matrix with self-links added
    """
    if normalization_trick == "normalize_D-1":
        # INFO: the normalization trick includes the symmetrization of the matrix and the
        # addition of self-links (and overwrites the corresponding parameters)
        A_ = to_symmetric_with_ones(adj, 'add_self_links')
        D_ = calculate_degrees(A_, power=-1)
        print('A~ normalized: A~ = D^(-1)A_')
        return D_.dot(A_)
    elif normalization_trick == "normalize_D-0.5":
        # INFO: the normalization trick includes the symmetrization of the matrix and the
        # addition of self-links (and overwrites the corresponding parameters)
        A_ = to_symmetric_with_ones(adj, 'add_self_links')
        D_ = calculate_degrees(A_, power=-0.5)
        print('A~ normalized: A~ = D^(-1/2)A_D^(-1/2)')
        return D_.dot(A_.dot(D_))
    if to_symmetric:
        print('A_ not normalized')
        return to_symmetric_with_ones(adj, action_on_self_links)
    print('A not normalized')
    return adj
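# Illustrative check of the D^(-1/2) trick on a hypothetical 2-node graph with one
# edge: after symmetrization and self-links every node has degree 2, so each entry
# of the normalized matrix is 1/2 and every row sums to 1.
# >>> A = csr_matrix(np.array([[0, 1], [0, 0]]))
# >>> transform_adjacency(A, 'normalize_D-0.5').toarray()
# array([[0.5, 0.5],
#        [0.5, 0.5]])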
# -------------------------------
# Features pre-processing
# -------------------------------
# Binarization of features
def binarize_features(X, binarize=False):
    X_ = X
    if binarize:
        X_ = X.copy()  # copy so the caller's matrix is not modified in place
        X_[X_ > 0] = 1
        print('Pre-processing: features binarized')
    return X_
def normalize_features(X, normalization='None'):
    if normalization == 'global-L1':
        print('Pre-processing: feature normalization using l1-norm.')
        return sklearn.preprocessing.normalize(X, norm='l1', axis=1)
    elif normalization == 'global-L2':
        print('Pre-processing: feature normalization using l2-norm.')
        return sklearn.preprocessing.normalize(X, norm='l2', axis=1)
    else:
        print('Pre-processing: features not normalized')
        return X
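# Illustrative example on a hypothetical toy matrix: with the l1-norm, each row of
# the output sums to 1.
# >>> X_toy = csr_matrix(np.array([[1.0, 3.0], [2.0, 2.0]]))
# >>> normalize_features(X_toy, 'global-L1').toarray()
# array([[0.25, 0.75],
#        [0.5 , 0.5 ]])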
# Simple smoke test: load a dataset, pre-process the features, normalize the
# adjacency matrix, and report the elapsed time
if __name__ == "__main__":
    from datetime import datetime
    # dataset = 'CORA-ORIG'
    dataset = 'WIKIVITALS_NEW'
    ts0 = datetime.timestamp(datetime.now())
    X, y_target, adj = load_dataset(dataset)
    X = binarize_features(X, True)
    X = normalize_features(X, 'global-L1')
    adj_norm = transform_adjacency(
        adj,
        'normalize_D-0.5',
        to_symmetric=True,
        action_on_self_links='add_self_links'
    )
    ts1 = datetime.timestamp(datetime.now())
    print(f'Elapsed time: {ts1 - ts0:.2f}s')