Merge pull request #389 from cosanlab/fix
Fix
ejolly authored Apr 16, 2021
2 parents 3467117 + 7a07a33 commit 4966277
Showing 10 changed files with 393 additions and 201 deletions.
42 changes: 23 additions & 19 deletions examples/01_DataOperations/plot_adjacency.py
@@ -25,8 +25,10 @@
m1 = block_diag(np.ones((4, 4)), np.zeros((4, 4)), np.zeros((4, 4)))
m2 = block_diag(np.zeros((4, 4)), np.ones((4, 4)), np.zeros((4, 4)))
m3 = block_diag(np.zeros((4, 4)), np.zeros((4, 4)), np.ones((4, 4)))
-noisy = (m1*1+m2*2+m3*3) + np.random.randn(12, 12)*.1
-dat = Adjacency(noisy, matrix_type='similarity', labels=['C1']*4 + ['C2']*4 + ['C3']*4)
+noisy = (m1 * 1 + m2 * 2 + m3 * 3) + np.random.randn(12, 12) * 0.1
+dat = Adjacency(
+    noisy, matrix_type="similarity", labels=["C1"] * 4 + ["C2"] * 4 + ["C3"] * 4
+)

#########################################################################
# Basic information about the object can be viewed by simply calling it.
@@ -44,37 +46,39 @@
dat.plot()

#########################################################################
-# The mean within a a grouping label can be calculated using the `.within_cluster_mean()` method. You must specify a group variable to group the data. Here we use the labels.
+# The mean within a grouping label can be calculated using the `.cluster_summary()` method. You must specify a group variable to group the data. Here we use the labels.

-print(dat.within_cluster_mean(clusters=dat.labels))
+print(dat.cluster_summary(clusters=dat.labels, summary="within", metric="mean"))
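
#########################################################################
# A minimal companion sketch: the same method can also summarize values
# *between* clusters via the `summary="between"` flag introduced in this
# commit; `metric` can be "mean", "median", or None to return raw values.

print(dat.cluster_summary(clusters=dat.labels, summary="between", metric="mean"))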

#########################################################################
# Regression
# ----------
#
# Adjacency objects can currently accommodate two different types of regression. Sometimes we might want to decompose an Adjacency matrix as a linear combination of other Adjacency matrices. Other times we might want to perform a regression at each pixel in a stack of Adjacency matrices. Here we provide an example of each method. We use the same data we generated above, but attempt to decompose it by each block of data. We create the design matrix by simply concatenating the matrices we used to create the data object. The regress method returns a dictionary containing all of the relevant information from the regression. Here we show that the model recovers the average weight in each block.

-X = Adjacency([m1, m2, m3], matrix_type='similarity')
+X = Adjacency([m1, m2, m3], matrix_type="similarity")
stats = dat.regress(X)
-print(stats['beta'])
+print(stats["beta"])

#########################################################################
# In addition to decomposing a single adjacency matrix, we can also estimate a separate model at each pixel across a stack of Adjacency matrices. This is equivalent to a univariate regression in imaging analyses. Remember that, just like in imaging, these tests are non-independent and may require correcting for multiple comparisons. Here we create some data that varies over matrices and identify pixels that follow a particular on-off-on pattern. We plot the t-values that exceed 2.

from nltools.data import Design_Matrix
import matplotlib.pyplot as plt

-data = Adjacency([m1 + np.random.randn(12,12)*.5 for x in range(5)] +
-                 [np.zeros((12, 12)) + np.random.randn(12, 12)*.5 for x in range(5)] +
-                 [m1 + np.random.randn(12, 12)*.5 for x in range(5)])
+data = Adjacency(
+    [m1 + np.random.randn(12, 12) * 0.5 for x in range(5)]
+    + [np.zeros((12, 12)) + np.random.randn(12, 12) * 0.5 for x in range(5)]
+    + [m1 + np.random.randn(12, 12) * 0.5 for x in range(5)]
+)

-X = Design_Matrix([1]*5 + [0]*5 + [1]*5)
+X = Design_Matrix([1] * 5 + [0] * 5 + [1] * 5)
f = X.plot()
-f.set_title('Model', fontsize=18)
+f.set_title("Model", fontsize=18)

stats = data.regress(X)
-t = stats['t'].plot(vmin=2)
-plt.title('Significant Pixels',fontsize=18)
+t = stats["t"].plot(vmin=2)
+plt.title("Significant Pixels", fontsize=18)

#########################################################################
# Similarity/Distance
@@ -88,22 +92,22 @@
#########################################################################
# We can also calculate the distance between multiple matrices contained within a single Adjacency object. Any distance metric from `scikit-learn <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html>`_ is available by specifying the `metric` flag. This outputs an Adjacency matrix. In the example below we see that several matrices are more similar to each other (i.e., when the signal is on). Remember that the nodes here now represent each matrix from the original distance matrix.

-dist = data.distance(metric='correlation')
+dist = data.distance(metric="correlation")
dist.plot()

#########################################################################
# Similarity matrices can be converted to and from Distance matrices using `.similarity_to_distance()` and `.distance_to_similarity()`.

-dist.distance_to_similarity().plot()
+dist.distance_to_similarity(metric="correlation").plot()
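
#########################################################################
# A minimal sketch of the Euclidean variant added in this commit, which
# maps distances through an exponential kernel scaled by `beta`.

dist.distance_to_similarity(metric="euclidean", beta=1).plot()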

#########################################################################
# Multidimensional Scaling
# ------------------------
#
# We can perform additional analyses on distance matrices such as multidimensional scaling. Here we provide an example to create a 3D multidimensional scaling plot of our data to see if the on and off matrices might naturally group together.

-dist = data.distance(metric='correlation')
-dist.labels = ['On']*5 + ['Off']*5 + ['On']*5
+dist = data.distance(metric="correlation")
+dist.labels = ["On"] * 5 + ["Off"] * 5 + ["On"] * 5
dist.plot_mds(n_components=3)

#########################################################################
@@ -114,9 +118,9 @@

import networkx as nx

-dat = Adjacency(m1+m2+m3, matrix_type='similarity')
+dat = Adjacency(m1 + m2 + m3, matrix_type="similarity")
g = dat.to_graph()

-print('Degree of each node: %s' % g.degree())
+print("Degree of each node: %s" % g.degree())

nx.draw_circular(g)
172 changes: 121 additions & 51 deletions nltools/data/adjacency.py
@@ -25,6 +25,7 @@
    summarize_bootstrap,
    matrix_permutation,
    fisher_r_to_z,
+    fisher_z_to_r,
    _calc_pvalue,
    _bootstrap_isc,
)
Expand All @@ -36,7 +37,6 @@
    concatenate,
    _bootstrap_apply_func,
    _df_meta_to_arr,
-    isiterable,
)
from .design_matrix import Design_Matrix
from joblib import Parallel, delayed
@@ -114,29 +114,40 @@ def __init__(self, data=None, Y=None, matrix_type=None, labels=[], **kwargs):
            self.issymmetric = symmetric_all[0]
            self.matrix_type = matrix_type_all[0]
            self.is_single_matrix = False
-        elif (isinstance(data, str) or isinstance(data, Path)) and (
-            (".h5" in data) or (".hdf5" in data)
-        ):
-            f = dd.io.load(data)
-            self.data = f["data"]
-            self.Y = pd.DataFrame(
-                f["Y"],
-                columns=[
-                    e.decode("utf-8") if isinstance(e, bytes) else e
-                    for e in f["Y_columns"]
-                ],
-                index=[
-                    e.decode("utf-8") if isinstance(e, bytes) else e
-                    for e in f["Y_index"]
-                ],
-            )
-            self.matrix_type = f["matrix_type"]
-            self.is_single_matrix = f["is_single_matrix"]
-            self.issymmetric = f["issymmetric"]
-            self.labels = [
-                e.decode("utf-8") if isinstance(e, bytes) else e for e in f["labels"]
-            ]
-            return
+        elif isinstance(data, str) or isinstance(data, Path):
+            to_load = str(data)
+            # Data is a string or path and h5
+            if (".h5" in to_load) or (".hdf5" in to_load):
+                f = dd.io.load(data)
+                self.data = f["data"]
+                self.Y = pd.DataFrame(
+                    f["Y"],
+                    columns=[
+                        e.decode("utf-8") if isinstance(e, bytes) else e
+                        for e in f["Y_columns"]
+                    ],
+                    index=[
+                        e.decode("utf-8") if isinstance(e, bytes) else e
+                        for e in f["Y_index"]
+                    ],
+                )
+                self.matrix_type = f["matrix_type"]
+                self.is_single_matrix = f["is_single_matrix"]
+                self.issymmetric = f["issymmetric"]
+                self.labels = [
+                    e.decode("utf-8") if isinstance(e, bytes) else e
+                    for e in f["labels"]
+                ]
+                return
+            # Data is a string or path but not h5
+            else:
+                (
+                    self.data,
+                    self.issymmetric,
+                    self.matrix_type,
+                    self.is_single_matrix,
+                ) = self._import_single_data(data, matrix_type=matrix_type)
+        # Data is not a string or path
        else:
            (
                self.data,
@@ -511,6 +522,30 @@ def mean(self, axis=0):
            elif axis == 1:
                return np.nanmean(self.data, axis=axis)

+    def sum(self, axis=0):
+        """Calculate sum of Adjacency
+
+        Args:
+            axis: (int) calculate sum over features (0) or data (1).
+                  For data it will be on the upper triangle.
+
+        Returns:
+            sum: float if single matrix, Adjacency if axis=0, np.array if
+                 axis=1 and multiple matrices
+
+        """
+
+        if self.is_single_matrix:
+            return np.nansum(self.data)
+        else:
+            if axis == 0:
+                return Adjacency(
+                    data=np.nansum(self.data, axis=axis),
+                    matrix_type=self.matrix_type + "_flat",
+                )
+            elif axis == 1:
+                return np.nansum(self.data, axis=axis)
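
A minimal usage sketch for the new method, assuming `dat` is an Adjacency holding multiple matrices:

dat.sum(axis=0)  # Adjacency of element-wise sums across matrices
dat.sum(axis=1)  # np.array with one sum per matrix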

    def std(self, axis=0):
        """Calculate standard deviation of Adjacency
@@ -752,6 +787,13 @@ def r_to_z(self):
        out.data = fisher_r_to_z(out.data)
        return out

+    def z_to_r(self):
+        """Convert z score back into r value for each element of data object"""
+
+        out = self.copy()
+        out.data = fisher_z_to_r(out.data)
+        return out
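
A quick round-trip sketch pairing the new method with the existing `r_to_z`:

z = dat.r_to_z()  # Fisher r-to-z transform
r = z.z_to_r()    # the new inverse; recovers the original correlations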

    def threshold(self, upper=None, lower=None, binarize=False):
        """Threshold Adjacency instance. Provide upper and lower values or
        percentages to perform two-sided thresholding. Binarize will return
@@ -1067,7 +1109,7 @@ def isc(
                exclude_self_corr=exclude_self_corr,
                random_state=random_state,
            )
-            for i in range(n_bootstraps)
+            for _ in range(n_bootstraps)
        )

        stats["p"] = _calc_pvalue(all_bootstraps - stats["isc"], stats["isc"], tail)
@@ -1185,57 +1227,85 @@ def plot_mds(
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

-    def distance_to_similarity(self, beta=1):
-        """Convert distance matrix to similarity matrix
+    def distance_to_similarity(self, metric="correlation", beta=1):
+        """Convert distance matrix to similarity matrix.
+
+        Note: currently only implemented for correlation and euclidean.

        Args:
-            beta: (float) parameter to scale exponential function (default: 1)
+            metric: (str) can only be correlation or euclidean
+            beta: (float) parameter to scale exponential function (default: 1) for euclidean

        Returns:
            out: (Adjacency) Adjacency object

        """
        if self.matrix_type == "distance":
-            return Adjacency(
-                np.exp(-beta * self.squareform() / self.squareform().std()),
-                labels=self.labels,
-                matrix_type="similarity",
-            )
+            if metric == "correlation":
+                return Adjacency(1 - self.squareform(), matrix_type="similarity")
+            elif metric == "euclidean":
+                return Adjacency(
+                    np.exp(-beta * self.squareform() / self.squareform().std()),
+                    labels=self.labels,
+                    matrix_type="similarity",
+                )
+            else:
+                raise ValueError('metric can only be ["correlation","euclidean"]')
        else:
            raise ValueError("Matrix is not a distance matrix.")

    def similarity_to_distance(self):
        """Convert similarity matrix to distance matrix"""
        if self.matrix_type == "similarity":
            return Adjacency(
                1 - self.squareform(), labels=self.labels, matrix_type="distance"
            )
        else:
            raise ValueError("Matrix is not a similarity matrix.")
-    def within_cluster_mean(self, clusters=None):
-        """This function calculates mean within cluster labels
+    def cluster_summary(self, clusters=None, metric="mean", summary="within"):
+        """This function provides summaries of clusters within Adjacency matrices.
+
+        It can compute mean/median of within and between cluster values. Requires a
+        list of cluster ids indicating the row/column of each cluster.
        Args:
            clusters: (list) list of cluster labels
+            metric: (str) method to summarize: mean or median. If None, return all r values
+            summary: (str) summarize within cluster or between clusters
        Returns:
            dict: (dict) within or between cluster summaries
        """
+        if metric not in ["mean", "median", None]:
+            raise ValueError("metric must be ['mean','median', None]")

        distance = pd.DataFrame(self.squareform())
        clusters = np.array(clusters)

        if len(clusters) != distance.shape[0]:
            raise ValueError("Cluster labels must be same length as distance matrix")

-        out = pd.DataFrame(columns=["Mean", "Label"], index=None)
+        out = {}
        for i in list(set(clusters)):
-            out[i] = np.mean(
-                distance.loc[clusters == i, clusters == i].values[
-                    np.triu_indices(sum(clusters == i), k=1)
-                ]
-            )
+            if summary == "within":
+                if metric == "mean":
+                    out[i] = np.mean(
+                        distance.loc[clusters == i, clusters == i].values[
+                            np.triu_indices(sum(clusters == i), k=1)
+                        ]
+                    )
+                elif metric == "median":
+                    out[i] = np.median(
+                        distance.loc[clusters == i, clusters == i].values[
+                            np.triu_indices(sum(clusters == i), k=1)
+                        ]
+                    )
+                elif metric is None:
+                    out[i] = distance.loc[clusters == i, clusters == i].values[
+                        np.triu_indices(sum(clusters == i), k=1)
+                    ]
+            elif summary == "between":
+                if metric == "mean":
+                    out[i] = distance.loc[clusters == i, clusters != i].mean().mean()
+                elif metric == "median":
+                    out[i] = (
+                        distance.loc[clusters == i, clusters != i].median().median()
+                    )
+                elif metric is None:
+                    out[i] = distance.loc[clusters == i, clusters != i]
        return out

    def regress(self, X, mode="ols", **kwargs):
@@ -1281,11 +1351,11 @@ def regress(self, X, mode="ols", **kwargs):
    def social_relations_model(self, summarize_results=True, nan_replace=True):
        """Estimate the social relations model from a matrix for a round-robin design.

-        X_{ij} = m + \alpha_i + \beta_j + g_{ij} + \episolon_{ijl}
+        X_{ij} = m + \alpha_i + \beta_j + g_{ij} + \epsilon_{ijl}

        where X_{ij} is the score for person i rating person j, m is the group mean,
        \alpha_i is person i's actor effect, \beta_j is person j's partner effect, g_{ij}
-        is the relationship effect and \episolon_{ijl} is the error in measure l for actor i and partner j.
+        is the relationship effect and \epsilon_{ijl} is the error in measure l for actor i and partner j.

        This model is primarily concerned with partitioning the variance of the various effects.
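
A minimal call sketch, assuming `dat` is a square round-robin Adjacency; per the docstring above, the model partitions variance into actor, partner, and relationship components:

results = dat.social_relations_model()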
@@ -1551,7 +1621,7 @@ def fix_missing(data):
            return (X, coord)

        if nan_replace:
-            data, coord = replace_missing(self)
+            data, _ = replace_missing(self)
        else:
            data = self.copy()
