Merge pull request #389 from cosanlab/fix
Fix
ejolly authored Apr 16, 2021
2 parents 3467117 + 7a07a33 commit 4966277
Showing 10 changed files with 393 additions and 201 deletions.
42 changes: 23 additions & 19 deletions examples/01_DataOperations/plot_adjacency.py
@@ -25,8 +25,10 @@
m1 = block_diag(np.ones((4, 4)), np.zeros((4, 4)), np.zeros((4, 4)))
m2 = block_diag(np.zeros((4, 4)), np.ones((4, 4)), np.zeros((4, 4)))
m3 = block_diag(np.zeros((4, 4)), np.zeros((4, 4)), np.ones((4, 4)))
-noisy = (m1*1+m2*2+m3*3) + np.random.randn(12, 12)*.1
-dat = Adjacency(noisy, matrix_type='similarity', labels=['C1']*4 + ['C2']*4 + ['C3']*4)
+noisy = (m1 * 1 + m2 * 2 + m3 * 3) + np.random.randn(12, 12) * 0.1
+dat = Adjacency(
+    noisy, matrix_type="similarity", labels=["C1"] * 4 + ["C2"] * 4 + ["C3"] * 4
+)

#########################################################################
# Basic information about the object can be viewed by simply calling it.
@@ -44,37 +46,39 @@
dat.plot()

#########################################################################
-# The mean within a a grouping label can be calculated using the `.within_cluster_mean()` method. You must specify a group variable to group the data. Here we use the labels.
+# The mean within a grouping label can be calculated using the `.cluster_summary()` method. You must specify a group variable to group the data. Here we use the labels.

-print(dat.within_cluster_mean(clusters=dat.labels))
+print(dat.cluster_summary(clusters=dat.labels, summary="within", metric="mean"))
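
#########################################################################
# A minimal companion sketch: the same method can also summarize values
# *between* clusters via the `summary="between"` flag introduced in this
# commit; `metric` can be "mean", "median", or None to return raw values.

print(dat.cluster_summary(clusters=dat.labels, summary="between", metric="mean"))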

#########################################################################
# Regression
# ----------
#
# Adjacency objects can currently accommodate two different types of regression. Sometimes we might want to decompose an Adjacency matrix as a linear combination of other Adjacency matrices. Other times we might want to perform a regression at each pixel in a stack of Adjacency matrices. Here we provide an example of each method. We use the same data we generated above, but attempt to decompose it by each block of data. We create the design matrix by simply concatenating the matrices we used to create the data object. The regress method returns a dictionary containing all of the relevant information from the regression. Here we show that the model recovers the average weight in each block.

-X = Adjacency([m1, m2, m3], matrix_type='similarity')
+X = Adjacency([m1, m2, m3], matrix_type="similarity")
stats = dat.regress(X)
-print(stats['beta'])
+print(stats["beta"])

#########################################################################
# In addition to decomposing a single adjacency matrix, we can also estimate a separate model at each pixel across a stack of Adjacency matrices. This is equivalent to a univariate regression in imaging analyses. Remember that, just like in imaging, these tests are non-independent and may require correcting for multiple comparisons. Here we create some data that varies over matrices and identify pixels that follow a particular on-off-on pattern. We plot the t-values that exceed 2.

from nltools.data import Design_Matrix
import matplotlib.pyplot as plt

-data = Adjacency([m1 + np.random.randn(12,12)*.5 for x in range(5)] +
-                 [np.zeros((12, 12)) + np.random.randn(12, 12)*.5 for x in range(5)] +
-                 [m1 + np.random.randn(12, 12)*.5 for x in range(5)])
+data = Adjacency(
+    [m1 + np.random.randn(12, 12) * 0.5 for x in range(5)]
+    + [np.zeros((12, 12)) + np.random.randn(12, 12) * 0.5 for x in range(5)]
+    + [m1 + np.random.randn(12, 12) * 0.5 for x in range(5)]
+)

-X = Design_Matrix([1]*5 + [0]*5 + [1]*5)
+X = Design_Matrix([1] * 5 + [0] * 5 + [1] * 5)
f = X.plot()
-f.set_title('Model', fontsize=18)
+f.set_title("Model", fontsize=18)

stats = data.regress(X)
-t = stats['t'].plot(vmin=2)
-plt.title('Significant Pixels',fontsize=18)
+t = stats["t"].plot(vmin=2)
+plt.title("Significant Pixels", fontsize=18)

#########################################################################
# Similarity/Distance
@@ -88,22 +92,22 @@
#########################################################################
# We can also calculate the distance between multiple matrices contained within a single Adjacency object. Any distance metric from `scikit-learn <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html>`_ is available by specifying the `metric` flag. This outputs an Adjacency matrix. In the example below we see that several matrices are more similar to each other (i.e., when the signal is on). Remember that the nodes here now represent each matrix from the original distance matrix.

-dist = data.distance(metric='correlation')
+dist = data.distance(metric="correlation")
dist.plot()

#########################################################################
# Similarity matrices can be converted to and from Distance matrices using `.similarity_to_distance()` and `.distance_to_similarity()`.

-dist.distance_to_similarity().plot()
+dist.distance_to_similarity(metric="correlation").plot()
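
#########################################################################
# A minimal sketch of the Euclidean variant added in this commit, which
# maps distances through an exponential kernel scaled by `beta`.

dist.distance_to_similarity(metric="euclidean", beta=1).plot()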

#########################################################################
# Multidimensional Scaling
# ------------------------
#
# We can perform additional analyses on distance matrices such as multidimensional scaling. Here we provide an example to create a 3D multidimensional scaling plot of our data to see if the on and off matrices might naturally group together.

-dist = data.distance(metric='correlation')
-dist.labels = ['On']*5 + ['Off']*5 + ['On']*5
+dist = data.distance(metric="correlation")
+dist.labels = ["On"] * 5 + ["Off"] * 5 + ["On"] * 5
dist.plot_mds(n_components=3)

#########################################################################
@@ -114,9 +118,9 @@

import networkx as nx

-dat = Adjacency(m1+m2+m3, matrix_type='similarity')
+dat = Adjacency(m1 + m2 + m3, matrix_type="similarity")
g = dat.to_graph()

-print('Degree of each node: %s' % g.degree())
+print("Degree of each node: %s" % g.degree())

nx.draw_circular(g)
172 changes: 121 additions & 51 deletions nltools/data/adjacency.py
@@ -25,6 +25,7 @@
    summarize_bootstrap,
    matrix_permutation,
    fisher_r_to_z,
+    fisher_z_to_r,
    _calc_pvalue,
    _bootstrap_isc,
)
Expand All @@ -36,7 +37,6 @@
    concatenate,
    _bootstrap_apply_func,
    _df_meta_to_arr,
-    isiterable,
)
from .design_matrix import Design_Matrix
from joblib import Parallel, delayed
@@ -114,29 +114,40 @@ def __init__(self, data=None, Y=None, matrix_type=None, labels=[], **kwargs):
            self.issymmetric = symmetric_all[0]
            self.matrix_type = matrix_type_all[0]
            self.is_single_matrix = False
-        elif (isinstance(data, str) or isinstance(data, Path)) and (
-            (".h5" in data) or (".hdf5" in data)
-        ):
-            f = dd.io.load(data)
-            self.data = f["data"]
-            self.Y = pd.DataFrame(
-                f["Y"],
-                columns=[
-                    e.decode("utf-8") if isinstance(e, bytes) else e
-                    for e in f["Y_columns"]
-                ],
-                index=[
-                    e.decode("utf-8") if isinstance(e, bytes) else e
-                    for e in f["Y_index"]
-                ],
-            )
-            self.matrix_type = f["matrix_type"]
-            self.is_single_matrix = f["is_single_matrix"]
-            self.issymmetric = f["issymmetric"]
-            self.labels = [
-                e.decode("utf-8") if isinstance(e, bytes) else e for e in f["labels"]
-            ]
-            return
+        elif isinstance(data, str) or isinstance(data, Path):
+            to_load = str(data)
+            # Data is a string or path and h5
+            if (".h5" in to_load) or (".hdf5" in to_load):
+                f = dd.io.load(data)
+                self.data = f["data"]
+                self.Y = pd.DataFrame(
+                    f["Y"],
+                    columns=[
+                        e.decode("utf-8") if isinstance(e, bytes) else e
+                        for e in f["Y_columns"]
+                    ],
+                    index=[
+                        e.decode("utf-8") if isinstance(e, bytes) else e
+                        for e in f["Y_index"]
+                    ],
+                )
+                self.matrix_type = f["matrix_type"]
+                self.is_single_matrix = f["is_single_matrix"]
+                self.issymmetric = f["issymmetric"]
+                self.labels = [
+                    e.decode("utf-8") if isinstance(e, bytes) else e
+                    for e in f["labels"]
+                ]
+                return
+            # Data is a string or path but not h5
+            else:
+                (
+                    self.data,
+                    self.issymmetric,
+                    self.matrix_type,
+                    self.is_single_matrix,
+                ) = self._import_single_data(data, matrix_type=matrix_type)
+        # Data is not a string or path
        else:
            (
                self.data,
@@ -511,6 +522,30 @@ def mean(self, axis=0):
            elif axis == 1:
                return np.nanmean(self.data, axis=axis)

+    def sum(self, axis=0):
+        """Calculate sum of Adjacency
+
+        Args:
+            axis: (int) calculate sum over features (0) or data (1).
+                  For data it will be on the upper triangle.
+
+        Returns:
+            sum: float if single matrix, Adjacency if axis=0, np.array if
+                 axis=1 and multiple matrices
+
+        """
+
+        if self.is_single_matrix:
+            return np.nansum(self.data)
+        else:
+            if axis == 0:
+                return Adjacency(
+                    data=np.nansum(self.data, axis=axis),
+                    matrix_type=self.matrix_type + "_flat",
+                )
+            elif axis == 1:
+                return np.nansum(self.data, axis=axis)
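
A minimal usage sketch for the new method, assuming `dat` is an Adjacency holding multiple matrices:

dat.sum(axis=0)  # Adjacency of element-wise sums across matrices
dat.sum(axis=1)  # np.array with one sum per matrix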

    def std(self, axis=0):
        """Calculate standard deviation of Adjacency
@@ -752,6 +787,13 @@ def r_to_z(self):
        out.data = fisher_r_to_z(out.data)
        return out

+    def z_to_r(self):
+        """Convert z score back into r value for each element of data object"""
+
+        out = self.copy()
+        out.data = fisher_z_to_r(out.data)
+        return out
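
A quick round-trip sketch pairing the new method with the existing `r_to_z`:

z = dat.r_to_z()  # Fisher r-to-z transform
r = z.z_to_r()    # the new inverse; recovers the original correlations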

    def threshold(self, upper=None, lower=None, binarize=False):
        """Threshold Adjacency instance. Provide upper and lower values or
        percentages to perform two-sided thresholding. Binarize will return
@@ -1067,7 +1109,7 @@ def isc(
                exclude_self_corr=exclude_self_corr,
                random_state=random_state,
            )
-            for i in range(n_bootstraps)
+            for _ in range(n_bootstraps)
        )

        stats["p"] = _calc_pvalue(all_bootstraps - stats["isc"], stats["isc"], tail)
@@ -1185,57 +1227,85 @@ def plot_mds(
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

-    def distance_to_similarity(self, beta=1):
-        """Convert distance matrix to similarity matrix
+    def distance_to_similarity(self, metric="correlation", beta=1):
+        """Convert distance matrix to similarity matrix.
+
+        Note: currently only implemented for correlation and euclidean.

        Args:
-            beta: (float) parameter to scale exponential function (default: 1)
+            metric: (str) can only be correlation or euclidean
+            beta: (float) parameter to scale exponential function (default: 1) for euclidean

        Returns:
            out: (Adjacency) Adjacency object

        """
        if self.matrix_type == "distance":
-            return Adjacency(
-                np.exp(-beta * self.squareform() / self.squareform().std()),
-                labels=self.labels,
-                matrix_type="similarity",
-            )
+            if metric == "correlation":
+                return Adjacency(1 - self.squareform(), matrix_type="similarity")
+            elif metric == "euclidean":
+                return Adjacency(
+                    np.exp(-beta * self.squareform() / self.squareform().std()),
+                    labels=self.labels,
+                    matrix_type="similarity",
+                )
+            else:
+                raise ValueError('metric can only be ["correlation","euclidean"]')
        else:
            raise ValueError("Matrix is not a distance matrix.")

    def similarity_to_distance(self):
        """Convert similarity matrix to distance matrix"""
        if self.matrix_type == "similarity":
            return Adjacency(
                1 - self.squareform(), labels=self.labels, matrix_type="distance"
            )
        else:
            raise ValueError("Matrix is not a similarity matrix.")
-    def within_cluster_mean(self, clusters=None):
-        """This function calculates mean within cluster labels
+    def cluster_summary(self, clusters=None, metric="mean", summary="within"):
+        """This function provides summaries of clusters within Adjacency matrices.
+
+        It can compute mean/median of within and between cluster values. Requires a
+        list of cluster ids indicating the row/column of each cluster.
        Args:
            clusters: (list) list of cluster labels
+            metric: (str) method to summarize: mean or median. If None, return all r values
+            summary: (str) summarize within cluster or between clusters
        Returns:
            dict: (dict) within or between cluster summaries
        """
+        if metric not in ["mean", "median", None]:
+            raise ValueError("metric must be ['mean','median', None]")

        distance = pd.DataFrame(self.squareform())
        clusters = np.array(clusters)

        if len(clusters) != distance.shape[0]:
            raise ValueError("Cluster labels must be same length as distance matrix")

-        out = pd.DataFrame(columns=["Mean", "Label"], index=None)
+        out = {}
        for i in list(set(clusters)):
-            out[i] = np.mean(
-                distance.loc[clusters == i, clusters == i].values[
-                    np.triu_indices(sum(clusters == i), k=1)
-                ]
-            )
+            if summary == "within":
+                if metric == "mean":
+                    out[i] = np.mean(
+                        distance.loc[clusters == i, clusters == i].values[
+                            np.triu_indices(sum(clusters == i), k=1)
+                        ]
+                    )
+                elif metric == "median":
+                    out[i] = np.median(
+                        distance.loc[clusters == i, clusters == i].values[
+                            np.triu_indices(sum(clusters == i), k=1)
+                        ]
+                    )
+                elif metric is None:
+                    out[i] = distance.loc[clusters == i, clusters == i].values[
+                        np.triu_indices(sum(clusters == i), k=1)
+                    ]
+            elif summary == "between":
+                if metric == "mean":
+                    out[i] = distance.loc[clusters == i, clusters != i].mean().mean()
+                elif metric == "median":
+                    out[i] = (
+                        distance.loc[clusters == i, clusters != i].median().median()
+                    )
+                elif metric is None:
+                    out[i] = distance.loc[clusters == i, clusters != i]
        return out

    def regress(self, X, mode="ols", **kwargs):
@@ -1281,11 +1351,11 @@ def regress(self, X, mode="ols", **kwargs):
    def social_relations_model(self, summarize_results=True, nan_replace=True):
        """Estimate the social relations model from a matrix for a round-robin design.

-        X_{ij} = m + \alpha_i + \beta_j + g_{ij} + \episolon_{ijl}
+        X_{ij} = m + \alpha_i + \beta_j + g_{ij} + \epsilon_{ijl}

        where X_{ij} is the score for person i rating person j, m is the group mean,
        \alpha_i is person i's actor effect, \beta_j is person j's partner effect, g_{ij}
-        is the relationship effect and \episolon_{ijl} is the error in measure l for actor i and partner j.
+        is the relationship effect and \epsilon_{ijl} is the error in measure l for actor i and partner j.

        This model is primarily concerned with partitioning the variance of the various effects.
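
A minimal call sketch, assuming `dat` is a square round-robin Adjacency; per the docstring above, the model partitions variance into actor, partner, and relationship components:

results = dat.social_relations_model()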
@@ -1551,7 +1621,7 @@ def fix_missing(data):
            return (X, coord)

        if nan_replace:
-            data, coord = replace_missing(self)
+            data, _ = replace_missing(self)
        else:
            data = self.copy()
