-
Notifications
You must be signed in to change notification settings - Fork 0
/
simulations.py
416 lines (370 loc) · 15.5 KB
/
simulations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
import numpy as np
from graspologic.simulations import sample_edges, sample_edges_corr, sbm
from graspologic.simulations.simulations_corr import check_dirloop, check_r
from graspologic.utils import symmetrize
from graspologic.models import DCSBMEstimator
def get_lowest_r(p, q):
return max(-np.sqrt(p * q / (1 - p) / (1 - q)), -np.sqrt((1 - p) * (1 - q) / p / q))
def get_highest_r(p, q):
return min(np.sqrt(p * (1 - q) / q / (1 - p)), np.sqrt(q * (1 - p) / p / (1 - q)))
def check_r_er(p, q, r):
low_r = get_lowest_r(p, q)
high_r = get_highest_r(p, q)
if r < low_r:
raise ValueError("r is lower than the lowest r allowed by p and q: {:.2f}".format(low_r))
if r > high_r:
raise ValueError("r is higher than the highest r allowed by p and q: {:.2f}".format(high_r))
def get_lowest_r_sbm(p, q):
r = -1
for i in range(np.array(p).shape[0]):
for j in range(np.array(p).shape[1]):
low_r = get_lowest_r(p[i][j], q[i][j])
if r < low_r:
r = low_r
return r
def get_highest_r_sbm(p, q):
r = 1
for i in range(np.array(p).shape[0]):
for j in range(np.array(p).shape[1]):
high_r = get_highest_r(p[i][j], q[i][j])
if r > high_r:
r = high_r
return r
def check_r_sbm(p, q, r):
low_r = get_lowest_r_sbm(p, q)
high_r = get_highest_r_sbm(p, q)
if r < low_r:
raise ValueError("r is lower than the lowest r allowed by p and q: {:.2f}".format(low_r))
if r > high_r:
raise ValueError("r is higher than the highest r allowed by p and q: {:.2f}".format(high_r))
def sample_edges_corr_diffmarg(P, Q, R, directed=False, loops=False):
"""
Generate a pair of correlated graphs with Bernoulli distribution.
Both G1 and G2 are binary matrices.
Allows for different marginal distributions
Parameters
----------
P: np.ndarray, shape (n_vertices, n_vertices)
Matrix of probabilities (between 0 and 1) for the first random graph.
Q: np.ndarray, shape (n_vertices, n_vertices)
Matrix of probabilities (between 0 and 1) for the second random graph.
R: np.ndarray, shape (n_vertices, n_vertices)
Matrix of correlation (between 0 and 1) between graph pairs.
directed: boolean, optional (default=False)
If False, output adjacency matrix will be symmetric. Otherwise, output adjacency
matrix will be asymmetric.
loops: boolean, optional (default=False)
If False, no edges will be sampled in the diagonal. Otherwise, edges
are sampled in the diagonal.
Returns
-------
G1: ndarray (n_vertices, n_vertices)
Adjacency matrix the same size as P representing a random graph.
G2: ndarray (n_vertices, n_vertices)
Adjacency matrix the same size as P representing a random graph.
"""
# test input
# check P
if type(P) is not np.ndarray:
raise TypeError("P must be numpy.ndarray")
if len(P.shape) != 2:
raise ValueError("P must have dimension 2 (n_vertices, n_vertices)")
if P.shape[0] != P.shape[1]:
raise ValueError("P must be a square matrix")
# check Q
if type(Q) is not np.ndarray:
raise TypeError("Q must be numpy.ndarray")
if len(Q.shape) != 2:
raise ValueError("Q must have dimension 2 (n_vertices, n_vertices)")
if Q.shape[0] != P.shape[0] or Q.shape[1] != P.shape[1]:
raise ValueError("Q must have the same shape as P")
# check R
if type(R) is not np.ndarray:
raise TypeError("R must be numpy.ndarray")
if len(R.shape) != 2:
raise ValueError("R must have dimension 2 (n_vertices, n_vertices)")
if R.shape[0] != P.shape[0] or R.shape[1] != P.shape[1]:
raise ValueError("R must have the same shape as P")
# check directed and loops
check_dirloop(directed, loops)
G1 = sample_edges(P, directed=directed, loops=loops)
P2 = G1.copy()
P2 = np.where(P2 == 1,
Q + R * np.sqrt((1 - P) * Q * (1 - Q) / P),
Q - R * np.sqrt(P * Q * (1 - Q) / (1 - P))
)
G2 = sample_edges(P2, directed=directed, loops=loops)
return G1, G2
def er_corr_diffmarg(n, p, q, r, directed=False, loops=False):
"""
Generate a pair of correlated graphs with specified edge probability
Both G1 and G2 are binary matrices.
Parameters
----------
n: int
Number of vertices
p: float
Probability of an edge existing between two vertices in the first graph, between 0 and 1.
q: float
Probability of an edge existing between two vertices in the second graph, between 0 and 1.
r: float
The value of the correlation between the same vertices in two graphs.
directed: boolean, optional (default=False)
If False, output adjacency matrix will be symmetric. Otherwise, output adjacency
matrix will be asymmetric.
loops: boolean, optional (default=False)
If False, no edges will be sampled in the diagonal. Otherwise, edges
are sampled in the diagonal.
Returns
-------
G1: ndarray (n_vertices, n_vertices)
Adjacency matrix the same size as P representing a random graph.
G2: ndarray (n_vertices, n_vertices)
Adjacency matrix the same size as P representing a random graph.
"""
# test input
# check n
if not np.issubdtype(type(n), np.integer):
raise TypeError("n is not of type int.")
elif n <= 0:
msg = "n must be > 0."
raise ValueError(msg)
# check p
if not np.issubdtype(type(p), np.floating):
raise TypeError("p is not of type float.")
elif p < 0 or p > 1:
msg = "p must between 0 and 1."
raise ValueError(msg)
# check q
if not np.issubdtype(type(q), np.floating):
raise TypeError("q is not of type float.")
elif q < 0 or q > 1:
msg = "q must between 0 and 1."
raise ValueError(msg)
# check r
check_r(r)
# check the relation between r, p and q
check_r_er(p, q, r)
# check directed and loops
check_dirloop(directed, loops)
P = p * np.ones((n, n))
Q = q * np.ones((n, n))
R = r * np.ones((n, n))
G1, G2 = sample_edges_corr_diffmarg(P, Q, R, directed=directed, loops=loops)
return G1, G2
def sbm_corr_diffmarg(n, p, q, r, directed=False, loops=False):
"""
Generate a pair of correlated graphs with specified edge probability
Both G1 and G2 are binary matrices.
Parameters
----------
n: list of int, shape (n_communities)
Number of vertices in each community. Communities are assigned n[0], n[1], ...
p: array-like, shape (n_communities, n_communities)
Probability of an edge between each of the communities in the first graph, where p[i, j] indicates
the probability of a connection between edges in communities [i, j].
0 < p[i, j] < 1 for all i, j.
q: array-like, shape (n_communities, n_communities)
same as p, but for the second graph
r: float
Probability of the correlation between the same vertices in two graphs.
directed: boolean, optional (default=False)
If False, output adjacency matrix will be symmetric. Otherwise, output adjacency
matrix will be asymmetric.
loops: boolean, optional (default=False)
If False, no edges will be sampled in the diagonal. Otherwise, edges
are sampled in the diagonal.
Returns
-------
G1: ndarray (n_vertices, n_vertices)
Adjacency matrix the same size as P representing a random graph.
G2: ndarray (n_vertices, n_vertices)
Adjacency matrix the same size as P representing a random graph.
"""
# test input
# Check n
if not isinstance(n, (list, np.ndarray)):
msg = "n must be a list or np.array, not {}.".format(type(n))
raise TypeError(msg)
else:
n = np.array(n)
if not np.issubdtype(n.dtype, np.integer):
msg = "There are non-integer elements in n"
raise ValueError(msg)
# Check p
if not isinstance(p, (list, np.ndarray)):
msg = "p must be a list or np.array, not {}.".format(type(p))
raise TypeError(msg)
else:
p = np.array(p)
if not np.issubdtype(p.dtype, np.number):
msg = "There are non-numeric elements in p"
raise ValueError(msg)
elif p.shape != (n.size, n.size):
msg = "p is must have shape len(n) x len(n), not {}".format(p.shape)
raise ValueError(msg)
elif np.any(p < 0) or np.any(p > 1):
msg = "Values in p must be in between 0 and 1."
raise ValueError(msg)
# Check q
if not isinstance(q, (list, np.ndarray)):
msg = "q must be a list or np.array, not {}.".format(type(q))
raise TypeError(msg)
else:
q = np.array(q)
if not np.issubdtype(q.dtype, np.number):
msg = "There are non-numeric elements in q"
raise ValueError(msg)
elif q.shape != (n.size, n.size):
msg = "q is must have shape len(n) x len(n), not {}".format(q.shape)
raise ValueError(msg)
elif np.any(q < 0) or np.any(q > 1):
msg = "Values in q must be in between 0 and 1."
raise ValueError(msg)
# check r
check_r(r)
# check the relation between r, p and q
check_r_sbm(p, q, r)
# check directed and loops
check_dirloop(directed, loops)
P = np.zeros((np.sum(n), np.sum(n)))
Q = np.zeros((np.sum(n), np.sum(n)))
block_indices = np.insert(np.cumsum(np.array(n)), 0, 0)
for i in range(np.array(p).shape[0]): # for each row
for j in range(np.array(p).shape[1]): # for each column
P[
block_indices[i] : block_indices[i + 1],
block_indices[j] : block_indices[j + 1],
] = p[i][j]
Q[
block_indices[i] : block_indices[i + 1],
block_indices[j] : block_indices[j + 1],
] = q[i][j]
R = r * np.ones((np.sum(n), np.sum(n)))
G1, G2 = sample_edges_corr_diffmarg(P, Q, R, directed=directed, loops=loops)
return G1, G2
def sample_edges_corr_weighted(shape, mu1, mu2, Sigma):
"""
Generate a pair of correlated matrices with the bivariate normal distribution.
Both G1 and G2 are non-binary matrices.
Every pair of entries is distributed as a bivariate normal, with mean = [mu1, mu2]
and covariance matrix Sigma
The correlation between G1 and G2 is Sigma12 / sqrt(Sigma11 * Sigma22)
Parameters
----------
shape: tuple
The shape of the output matrices: shape[0] denotes the number of rows, shape[1] the columns
mu1: float
The mean of the edge weights of G1 (analogous the marginal probability p in correlated Bernoulli graph)
mu2: float
The mean of the edge weights of G2 (analogous the marginal probability q in correlated Bernoulli graph)
Sigma: list or ndarray (2, 2)
The covariance matrix encoding the variances of the edge weights of G1, G2
and the covariance beteween them
Returns
-------
G1: ndarray (shape)
G2: ndarray (shape)
"""
if not isinstance(shape, tuple) or len(shape) != 2:
raise ValueError("shape must be a tuple of length 2")
if not np.issubdtype(type(mu1), np.floating) and not np.issubdtype(type(mu1), np.integer):
raise ValueError("mu1 is not of type int or float")
if not np.issubdtype(type(mu2), np.floating) and not np.issubdtype(type(mu2), np.integer):
raise ValueError("mu2 is not of type int or float")
if not isinstance(Sigma, (list, np.ndarray)):
raise ValueError("Sigma must be list or np.ndarray")
if np.array(Sigma).shape != (2, 2):
raise ValueError("Sigma must have shape (2, 2)")
sample = np.random.multivariate_normal([mu1, mu2], Sigma, size=shape)
G1 = sample[..., 0]
G2 = sample[..., 1]
return G1, G2
def er_corr_weighted(n, mu1, mu2, Sigma, directed=False, loops=False):
"""
Generate a pair of correlated graphs with the bivariate normal distribution.
Both G1 and G2 are non-binary matrices.
Every pair of edges is distributed as a bivariate normal, with mean = [mu1, mu2]
and covariance matrix Sigma
The correlation between G1 and G2 is Sigma12 / sqrt(Sigma11 * Sigma22)
Parameters
----------
n: int
Number of vertices
mu1: float
The mean of the edge weights of G1 (analogous the marginal probability p in correlated Bernoulli graph)
mu2: float
The mean of the edge weights of G2 (analogous the marginal probability q in correlated Bernoulli graph)
Sigma: list or ndarray (2, 2)
The covariance matrix encoding the variances of the edge weights of G1, G2
and the covariance beteween them
Returns
-------
G1: ndarray (n_vertices, n_vertices)
Adjacency matrix representing a random graph.
G2: ndarray (n_vertices, n_vertices)
Adjacency matrix representing a random graph.
"""
G1, G2 = sample_edges_corr_weighted((n, n), mu1, mu2, Sigma)
if not directed:
G1 = symmetrize(G1, method="triu")
G2 = symmetrize(G2, method="triu")
if not loops:
G1 = G1 - np.diag(np.diag(G1))
G2 = G2 - np.diag(np.diag(G2))
return G1, G2
def sbm_corr_weighted(n, mu1, mu2, Sigma, directed=False, loops=False):
"""
Parameters
----------
n: list of int, shape (n_communities)
Number of vertices in each community. Communities are assigned n[0], n[1], ...
mu1: array-like, shape (n_communities, n_communities)
Mean of the edge weight between each of the communities in the first graph, where mu1[i, j] indicates
the mean of the edge weights of edges in communities [i, j].
mu2: array-like, shape (n_communities, n_communities)
same as mu1, but for the second graph
Sigma: list or ndarray (2, 2)
The covariance matrix encoding the variances of the edge weights of G1, G2
and the covariance beteween them.
Right now we are forcing the entire graph to have the same variance and covariance
"""
n = np.array(n)
G1 = np.zeros((np.sum(n), np.sum(n)))
G2 = np.zeros((np.sum(n), np.sum(n)))
block_indices = np.insert(np.cumsum(np.array(n)), 0, 0)
for i in range(n.size): # for each row
for j in range(n.size): # for each column
g1, g2 = sample_edges_corr_weighted((n[i], n[j]), mu1[i][j], mu2[i][j], Sigma)
G1[
block_indices[i] : block_indices[i + 1],
block_indices[j] : block_indices[j + 1],
] = g1
G2[
block_indices[i] : block_indices[i + 1],
block_indices[j] : block_indices[j + 1],
] = g2
if not directed:
G1 = symmetrize(G1, method="triu")
G2 = symmetrize(G2, method="triu")
if not loops:
G1 = G1 - np.diag(np.diag(G1))
G2 = G2 - np.diag(np.diag(G2))
return G1, G2
def dcsbm_corr(n, p, r, theta, epsilon1=1e-3, epsilon2=1e-3, directed=False, loops=False):
'''
Sample a pair of DC-SBM with the same marginal probabilities
'''
Z = np.repeat(np.arange(0, np.array(n).size), n)
R = r * np.ones((np.sum(n), np.sum(n)))
# sample a DC-SBM w/ block prob p
G = sbm(n, p, dc=theta)
# fit DC-SBM to G1 to estimate P
G_dcsbm = DCSBMEstimator(directed=False).fit(G, y=Z)
p_mat = G_dcsbm.p_mat_
# P could be out of range
p_mat[p_mat < epsilon1] = epsilon1
p_mat[p_mat > 1 - epsilon2] = 1 - epsilon2
# sample correlated graphs based on P
G1, G2 = sample_edges_corr(p_mat, R, directed, loops)
return G1, G2