From f8e3f05e42c4e4e792ac99a6871440f8834ddd44 Mon Sep 17 00:00:00 2001 From: Luke Chang Date: Sun, 12 Nov 2017 17:20:03 -0500 Subject: [PATCH 1/2] Refactored data module Former-commit-id: 47992913cdcc9b281ba08dc4ca83eb0748aaf3b2 --- nltools/data/__init__.py | 13 + nltools/data/adjacency.py | 602 +++++++++++++ nltools/{data.py => data/brain_data.py} | 1065 +---------------------- nltools/data/design_matrix.py | 449 ++++++++++ nltools/utils.py | 40 +- 5 files changed, 1130 insertions(+), 1039 deletions(-) create mode 100644 nltools/data/__init__.py create mode 100644 nltools/data/adjacency.py rename nltools/{data.py => data/brain_data.py} (57%) create mode 100644 nltools/data/design_matrix.py diff --git a/nltools/data/__init__.py b/nltools/data/__init__.py new file mode 100644 index 00000000..61e090a0 --- /dev/null +++ b/nltools/data/__init__.py @@ -0,0 +1,13 @@ +""" +nltools data types. +""" + +from .brain_data import Brain_Data, Groupby +from .adjacency import Adjacency +from .design_matrix import Design_Matrix, Design_Matrix_Series + +__all__ = ['Brain_Data', + 'Adjacency', + 'Groupby', + 'Design_Matrix', + 'Design_Matrix_Series'] diff --git a/nltools/data/adjacency.py b/nltools/data/adjacency.py new file mode 100644 index 00000000..d9df1df7 --- /dev/null +++ b/nltools/data/adjacency.py @@ -0,0 +1,602 @@ +from __future__ import division + +''' +This data class is for working with similarity/dissimilarity matrices +''' + +__author__ = ["Luke Chang"] +__license__ = "MIT" + +import os +import pandas as pd +import numpy as np +import six +from copy import deepcopy +from sklearn.metrics.pairwise import pairwise_distances +from scipy.spatial.distance import squareform +import seaborn as sns +import matplotlib.pyplot as plt +from nltools.stats import (correlation_permutation, + one_sample_permutation, + two_sample_permutation, + summarize_bootstrap) +from nltools.plotting import (plot_stacked_adjacency, + plot_silhouette) +from nltools.utils import (all_same, + attempt_to_import, + concatenate, + _bootstrap_apply_func) +from joblib import Parallel, delayed + +# Optional dependencies +nx = attempt_to_import('networkx', 'nx') + +class Adjacency(object): + + ''' + Adjacency is a class to represent Adjacency matrices as a vector rather + than a 2-dimensional matrix. This makes it easier to perform data + manipulation and analyses. + + Args: + data: pandas data instance or list of files + matrix_type: (str) type of matrix. 
Possible values include: + ['distance','similarity','directed','distance_flat', + 'similarity_flat','directed_flat'] + Y: Pandas DataFrame of training labels + **kwargs: Additional keyword arguments + + ''' + + def __init__(self, data=None, Y=None, matrix_type=None, **kwargs): + if matrix_type is not None: + if matrix_type.lower() not in ['distance','similarity','directed', + 'distance_flat','similarity_flat', + 'directed_flat']: + raise ValueError("matrix_type must be [None,'distance', " + "'similarity','directed','distance_flat', " + "'similarity_flat','directed_flat']") + + if data is None: + self.data = np.array([]) + self.matrix_type = 'empty' + self.is_single_matrix = np.nan + self.issymmetric = np.nan + elif isinstance(data, list): + if isinstance(data[0], Adjacency): + tmp = concatenate(data) + for item in ['data', 'matrix_type', 'Y','issymmetric']: + setattr(self, item, getattr(tmp,item)) + else: + d_all = []; symmetric_all = []; matrix_type_all = [] + for d in data: + data_tmp, issymmetric_tmp, matrix_type_tmp, _ = self._import_single_data(d, matrix_type=matrix_type) + d_all.append(data_tmp) + symmetric_all.append(issymmetric_tmp) + matrix_type_all.append(matrix_type_tmp) + if not all_same(symmetric_all): + raise ValueError('Not all matrices are of the same ' + 'symmetric type.') + if not all_same(matrix_type_all): + raise ValueError('Not all matrices are of the same matrix ' + 'type.') + self.data = np.array(d_all) + self.issymmetric = symmetric_all[0] + self.matrix_type = matrix_type_all[0] + self.is_single_matrix = False + else: + self.data, self.issymmetric, self.matrix_type, self.is_single_matrix = self._import_single_data(data, matrix_type=matrix_type) + + if Y is not None: + if isinstance(Y, six.string_types): + if os.path.isfile(Y): + Y = pd.read_csv(Y, header=None, index_col=None) + if isinstance(Y, pd.DataFrame): + if self.data.shape[0] != len(Y): + raise ValueError("Y does not match the correct size of " + "data") + self.Y = Y + else: + raise ValueError("Make sure Y is a pandas data frame.") + else: + self.Y = pd.DataFrame() + + def __repr__(self): + return ("%s.%s(shape=%s, square_shape=%s, Y=%s, is_symmetric=%s," + "matrix_type=%s)") % ( + self.__class__.__module__, + self.__class__.__name__, + self.shape(), + self.square_shape(), + len(self.Y), + self.issymmetric, + self.matrix_type) + + def __getitem__(self,index): + new = self.copy() + if isinstance(index, int): + new.data = np.array(self.data[index, :]).flatten() + new.is_single_matrix = True + else: + new.data = np.array(self.data[index, :]) + if not self.Y.empty: + new.Y = self.Y.iloc[index] + return new + + def __len__(self): + if self.is_single_matrix: + return 1 + else: + return self.data.shape[0] + + def __iter__(self): + for x in range(len(self)): + yield self[x] + + def __add__(self, y): + new = deepcopy(self) + if isinstance(y, (int, float)): + new.data = new.data + y + if isinstance(y, Adjacency): + if self.shape() != y.shape(): + raise ValueError('Both Adjacency() instances need to be the ' + 'same shape.') + new.data = new.data + y.data + return new + + def __sub__(self, y): + new = deepcopy(self) + if isinstance(y, (int, float)): + new.data = new.data - y + if isinstance(y, Adjacency): + if self.shape() != y.shape(): + raise ValueError('Both Adjacency() instances need to be the ' + 'same shape.') + new.data = new.data - y.data + return new + + def __mul__(self, y): + new = deepcopy(self) + if isinstance(y, (int, float)): + new.data = new.data * y + if isinstance(y, Adjacency): + if self.shape() != 
y.shape(): + raise ValueError('Both Adjacency() instances need to be the ' + 'same shape.') + new.data = np.multiply(new.data, y.data) + return new + + def _import_single_data(self, data, matrix_type=None): + ''' Helper function to import single data matrix.''' + + if isinstance(data, six.string_types): + if os.path.isfile(data): + data = pd.read_csv(data) + else: + raise ValueError('Make sure you have specified a valid file ' + 'path.') + + def test_is_single_matrix(data): + if len(data.shape) == 1: + return True + else: + return False + + if matrix_type is not None: + if matrix_type.lower() == 'distance_flat': + matrix_type = 'distance' + data = np.array(data) + issymmetric = True + is_single_matrix = test_is_single_matrix(data) + elif matrix_type.lower() == 'similarity_flat': + matrix_type = 'similarity' + data = np.array(data) + issymmetric = True + is_single_matrix = test_is_single_matrix(data) + elif matrix_type.lower() == 'directed_flat': + matrix_type = 'directed' + data = np.array(data).flatten() + issymmetric = False + is_single_matrix = test_is_single_matrix(data) + elif matrix_type.lower() in ['distance', 'similarity', 'directed']: + if data.shape[0] != data.shape[1]: + raise ValueError('Data matrix must be square') + data = np.array(data) + matrix_type = matrix_type.lower() + if matrix_type in ['distance', 'similarity']: + issymmetric = True + data = data[np.triu_indices(data.shape[0], k=1)] + else: + issymmetric = False + if isinstance(data, pd.DataFrame): + data = data.values.flatten() + elif isinstance(data, np.ndarray): + data = data.flatten() + is_single_matrix = True + else: + if len(data.shape) == 1: # Single Vector + try: + data = squareform(data) + except ValueError: + print('Data is not flattened upper triangle from ' + 'similarity/distance matrix or flattened directed ' + 'matrix.') + is_single_matrix = True + elif data.shape[0] == data.shape[1]: # Square Matrix + is_single_matrix = True + else: # Rectangular Matrix + data_all = deepcopy(data) + try: + data = squareform(data_all[0, :]) + except ValueError: + print('Data is not flattened upper triangle from multiple ' + 'similarity/distance matrices or flattened directed ' + 'matrices.') + is_single_matrix = False + + # Test if matrix is symmetrical + if np.all(data[np.triu_indices(data.shape[0], k=1)] == data.T[np.triu_indices(data.shape[0], k=1)]): + issymmetric = True + else: + issymmetric = False + + # Determine matrix type + if issymmetric: + if np.sum(np.diag(data)) == 0: + matrix_type = 'distance' + elif np.sum(np.diag(data)) == data.shape[0]: + matrix_type = 'similarity' + data = data[np.triu_indices(data.shape[0], k=1)] + else: + matrix_type = 'directed' + data = data.flatten() + + if not is_single_matrix: + data = data_all + + return (data, issymmetric, matrix_type, is_single_matrix) + + def isempty(self): + '''Check if Adjacency object is empty''' + return bool(self.matrix_type is 'empty') + + def squareform(self): + '''Convert adjacency back to squareform''' + if self.issymmetric: + if self.is_single_matrix: + return squareform(self.data) + else: + return [squareform(x.data) for x in self] + else: + if self.is_single_matrix: + return self.data.reshape(int(np.sqrt(self.data.shape[0])), + int(np.sqrt(self.data.shape[0]))) + else: + return [x.data.reshape(int(np.sqrt(x.data.shape[0])), + int(np.sqrt(x.data.shape[0]))) for x in self] + + def plot(self, limit=3, **kwargs): + ''' Create Heatmap of Adjacency Matrix''' + if self.is_single_matrix: + return sns.heatmap(self.squareform(), square=True, **kwargs) + 
else: + f, a = plt.subplots(limit) + for i in range(limit): + sns.heatmap(self[i].squareform(), square=True, ax=a[i], + **kwargs) + return f + + def mean(self, axis=0): + ''' Calculate mean of Adjacency + + Args: + axis: calculate mean over features (0) or data (1). + For data it will be on upper triangle. + + Returns: + mean: float if single, adjacency if axis=0, np.array if axis=1 + and multiple + + ''' + + if self.is_single_matrix: + return np.mean(self.data) + else: + if axis == 0: + return Adjacency(data=np.mean(self.data, axis=axis), + matrix_type=self.matrix_type + '_flat') + elif axis == 1: + return np.mean(self.data, axis=axis) + + def std(self, axis=0): + ''' Calculate standard deviation of Adjacency + + Args: + axis: calculate std over features (0) or data (1). + For data it will be on upper triangle. + + Returns: + std: float if single, adjacency if axis=0, np.array if axis=1 and + multiple + + ''' + + if self.is_single_matrix: + return np.std(self.data) + else: + if axis == 0: + return Adjacency(data=np.std(self.data, axis=axis), + matrix_type=self.matrix_type + '_flat') + elif axis == 1: + return np.std(self.data, axis=axis) + + def shape(self): + ''' Calculate shape of data. ''' + return self.data.shape + + def square_shape(self): + ''' Calculate shape of squareform data. ''' + if self.matrix_type is 'empty': + return np.array([]) + else: + if self.is_single_matrix: + return self.squareform().shape + else: + return self[0].squareform().shape + + def copy(self): + ''' Create a copy of Adjacency object.''' + return deepcopy(self) + + def append(self, data): + ''' Append data to Adjacency instance + + Args: + data: Adjacency instance to append + + Returns: + out: new appended Adjacency instance + + ''' + + if not isinstance(data, Adjacency): + raise ValueError('Make sure data is a Adjacency instance.') + + if self.isempty(): + out = data.copy() + else: + out = self.copy() + if self.square_shape() != data.square_shape(): + raise ValueError('Data is not the same shape as Adjacency ' + 'instance.') + + out.data = np.vstack([self.data, data.data]) + out.is_single_matrix = False + if out.Y.size: + out.Y = self.Y.append(data.Y) + + return out + + def write(self, file_name, method='long'): + ''' Write out Adjacency object to csv file. + + Args: + file_name (str): name of file name to write + method (str): method to write out data ['long','square'] + + ''' + if method not in ['long', 'square']: + raise ValueError('Make sure method is ["long","square"].') + if self.is_single_matrix: + if method is 'long': + out = pd.DataFrame(self.data).to_csv(file_name, index=None) + elif method is 'square': + out = pd.DataFrame(self.squareform()).to_csv(file_name, + index=None) + else: + if method is 'long': + out = pd.DataFrame(self.data).to_csv(file_name, index=None) + elif method is 'square': + raise NotImplementedError('Need to decide how we should write ' + 'out multiple matrices. As separate ' + 'files?') + + def similarity(self, data, plot=False, **kwargs): + ''' Calculate similarity between two Adjacency matrices. 
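+            For example (an illustrative sketch; `m1` and `m2` are assumed
+            to be Adjacency objects of matching shape):
+                stats = m1.similarity(m2)
+            Keyword arguments are passed through to
+            nltools.stats.correlation_permutation.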
+ Default is to use spearman correlation and permutation test.'''
+        if not isinstance(data, Adjacency):
+            data2 = Adjacency(data)
+        else:
+            data2 = data.copy()
+        if self.is_single_matrix:
+            if plot:
+                plot_stacked_adjacency(self, data)
+            return correlation_permutation(self.data, data2.data, **kwargs)
+        else:
+            if plot:
+                _, a = plt.subplots(len(self))
+                for i in a:
+                    plot_stacked_adjacency(self, data, ax=i)
+            return [correlation_permutation(x.data, data2.data, **kwargs) for x in self]
+
+    def distance(self, method='correlation', **kwargs):
+        ''' Calculate distance between matrices within an Adjacency() instance.
+
+        Args:
+            method: type of distance metric (can use any scikit-learn or
+                    scipy metric)
+
+        Returns:
+            dist: Outputs a 2D distance matrix.
+
+        '''
+        return Adjacency(pairwise_distances(self.data, metric=method, **kwargs),
+                         matrix_type='distance')
+
+    def threshold(self, thresh=0, binarize=False):
+        '''Threshold Adjacency instance
+
+        Args:
+            thresh: cutoff to threshold matrix (float). If given as a string
+                    ending in '%' (e.g. '50%'), will threshold at that
+                    percentile of the data.
+            binarize (bool): if binarize=True then binarize output
+        Returns:
+            Adjacency: thresholded Adjacency instance
+
+        '''
+
+        b = self.copy()
+        if isinstance(thresh, six.string_types):
+            if thresh[-1] == '%':
+                thresh = np.percentile(b.data, float(thresh[:-1]))
+        if binarize:
+            b.data = b.data > thresh
+        else:
+            b.data[b.data < thresh] = 0
+        return b
+
+    def to_graph(self):
+        ''' Convert Adjacency into networkx graph. Only works on
+            single matrices for now.'''
+
+        if self.is_single_matrix:
+            if self.matrix_type == 'directed':
+                return nx.DiGraph(self.squareform())
+            else:
+                return nx.Graph(self.squareform())
+        else:
+            raise NotImplementedError('This function currently only works on '
+                                      'single matrices.')
+
+    def ttest(self, **kwargs):
+        ''' Calculate ttest across samples. 
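+            Example (an illustrative sketch; `dat` is assumed to be a
+            multi-matrix Adjacency): mn, p = dat.ttest() returns Adjacency
+            objects holding the permuted mean and p-value for each edge.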
'''
+        if self.is_single_matrix:
+            raise ValueError('t-test cannot be run on single matrices.')
+        m = []; p = []
+        for i in range(self.data.shape[1]):
+            stats = one_sample_permutation(self.data[:, i], **kwargs)
+            m.append(stats['mean'])
+            p.append(stats['p'])
+        mn = Adjacency(np.array(m))
+        pval = Adjacency(np.array(p))
+        return (mn, pval)
+
+    def plot_label_distance(self, labels, ax=None):
+        ''' Create a violin plot indicating within and between label distance
+
+        Args:
+            labels (np.array): numpy array of labels to plot
+
+        Returns:
+            violin plot handles
+
+        '''
+
+        if not self.is_single_matrix:
+            raise ValueError('This function only works on single adjacency '
+                             'matrices.')
+
+        distance = pd.DataFrame(self.squareform())
+
+        if len(labels) != distance.shape[0]:
+            raise ValueError('Labels must be same length as distance matrix')
+
+        out = pd.DataFrame(columns=['Distance', 'Group', 'Type'], index=None)
+        for i in np.unique(labels):
+            tmp_w = pd.DataFrame(columns=out.columns, index=None)
+            tmp_w['Distance'] = distance.loc[labels == i, labels == i].values[np.triu_indices(sum(labels == i), k=1)]
+            tmp_w['Type'] = 'Within'
+            tmp_w['Group'] = i
+            tmp_b = pd.DataFrame(columns=out.columns, index=None)
+            # Between-group distances: rows in group i vs. columns outside it
+            tmp_b['Distance'] = distance.loc[labels == i, labels != i].values.flatten()
+            tmp_b['Type'] = 'Between'
+            tmp_b['Group'] = i
+            out = out.append(tmp_w).append(tmp_b)
+        f = sns.violinplot(x="Group", y="Distance", hue="Type", data=out,
+                           split=True, inner='quartile',
+                           palette={"Within": "lightskyblue", "Between": "red"},
+                           ax=ax)
+        f.set_ylabel('Average Distance')
+        f.set_title('Average Group Distance')
+        return f
+
+    def stats_label_distance(self, labels, n_permute=5000, n_jobs=-1):
+        ''' Calculate permutation tests on within and between label distance. 
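+            Example (an illustrative sketch; `labels` is assumed to have one
+            entry per node): stats = dat.stats_label_distance(labels, n_permute=1000)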
+
+        Args:
+            labels (np.array): numpy array of labels to plot
+            n_permute (int): number of permutations to run (default=5000)
+
+        Returns:
+            dict: dictionary of within and between group differences
+                  and p-values
+
+        '''
+
+        if not self.is_single_matrix:
+            raise ValueError('This function only works on single adjacency '
+                             'matrices.')
+
+        distance = pd.DataFrame(self.squareform())
+
+        if len(labels) != distance.shape[0]:
+            raise ValueError('Labels must be same length as distance matrix')
+
+        out = pd.DataFrame(columns=['Distance', 'Group', 'Type'], index=None)
+        for i in np.unique(labels):
+            tmp_w = pd.DataFrame(columns=out.columns, index=None)
+            tmp_w['Distance'] = distance.loc[labels == i, labels == i].values[np.triu_indices(sum(labels == i), k=1)]
+            tmp_w['Type'] = 'Within'
+            tmp_w['Group'] = i
+            tmp_b = pd.DataFrame(columns=out.columns, index=None)
+            tmp_b['Distance'] = distance.loc[labels == i, labels != i].values.flatten()
+            tmp_b['Type'] = 'Between'
+            tmp_b['Group'] = i
+            out = out.append(tmp_w).append(tmp_b)
+        stats = dict()
+        for i in np.unique(labels):
+            # Within vs. between group test
+            tmp1 = out.loc[(out['Group'] == i) & (out['Type'] == 'Within'), 'Distance']
+            tmp2 = out.loc[(out['Group'] == i) & (out['Type'] == 'Between'), 'Distance']
+            stats[str(i)] = two_sample_permutation(tmp1, tmp2,
+                                                   n_permute=n_permute,
+                                                   n_jobs=n_jobs)
+        return stats
+
+    def plot_silhouette(self, labels, ax=None, permutation_test=True,
+                        n_permute=5000, **kwargs):
+        '''Create a silhouette plot'''
+        distance = pd.DataFrame(self.squareform())
+
+        if len(labels) != distance.shape[0]:
+            raise ValueError('Labels must be same length as distance matrix')
+
+        (f, outAll) = plot_silhouette(distance, labels, ax=ax,
+                                      permutation_test=permutation_test,
+                                      n_permute=n_permute, **kwargs)
+        return (f, outAll)
+
+    def bootstrap(self, function, n_samples=5000, save_weights=False,
+                  n_jobs=-1, *args, **kwargs):
+        '''Bootstrap an Adjacency method.
+
+        Example Usage:
+            b = dat.bootstrap('mean', n_samples=5000)
+            b = dat.bootstrap('mean', n_samples=5000, save_weights=True)
+
+        Args:
+            function: (str) method to apply to data for each bootstrap
+            n_samples: (int) number of samples to bootstrap with replacement
+            save_weights: (bool) Save each bootstrap iteration
+                          (useful for aggregating many bootstraps on a cluster)
+            n_jobs: (int) The number of CPUs to use to do the computation. 
+ -1 means all CPUs.Returns: + output: summarized studentized bootstrap output + + ''' + + bootstrapped = Parallel(n_jobs=n_jobs)( + delayed(_bootstrap_apply_func)(self, + function, *args, **kwargs) for i in range(n_samples)) + bootstrapped = Adjacency(bootstrapped) + return summarize_bootstrap(bootstrapped, save_weights=save_weights) diff --git a/nltools/data.py b/nltools/data/brain_data.py similarity index 57% rename from nltools/data.py rename to nltools/data/brain_data.py index f6706360..631fcae4 100644 --- a/nltools/data.py +++ b/nltools/data/brain_data.py @@ -11,23 +11,43 @@ # Notes: # Need to figure out how to speed up loading and resampling of data -__all__ = ['Brain_Data', - 'Adjacency', - 'Groupby', - 'Design_Matrix', - 'Design_Matrix_Series'] __author__ = ["Luke Chang"] __license__ = "MIT" -import os import pickle # import cPickle +from nilearn.signal import clean +from scipy.stats import ttest_1samp, t, norm +from scipy.signal import detrend +from scipy.spatial.distance import squareform +import os +import shutil import nibabel as nib +import seaborn as sns +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import warnings +import tempfile +from copy import deepcopy +import six +from sklearn.metrics.pairwise import pairwise_distances +from pynv import Client +from joblib import Parallel, delayed +from nltools.mask import expand_mask, collapse_mask +from nltools.analysis import Roc +from nilearn.input_data import NiftiMasker +from nilearn.image import resample_img +from nilearn.masking import intersect_masks +from nilearn.regions import connected_regions, connected_label_regions +from nilearn.plotting.img_plotting import plot_epi, plot_roi, plot_stat_map from nltools.utils import (get_resource_path, set_algorithm, get_anatomical, make_cosine_basis, glover_hrf, - attempt_to_import) + attempt_to_import, + concatenate, + _bootstrap_apply_func) from nltools.cross_validation import set_cv from nltools.plotting import (dist_from_hyperplane_plot, scatterplot, @@ -47,34 +67,10 @@ zscore, transform_pairwise, summarize_bootstrap) -from nltools.mask import expand_mask, collapse_mask -from nltools.analysis import Roc -from nilearn.input_data import NiftiMasker -from nilearn.image import resample_img -from nilearn.masking import intersect_masks -from nilearn.regions import connected_regions, connected_label_regions -from nilearn.plotting.img_plotting import plot_epi, plot_roi, plot_stat_map -from nilearn.signal import clean -from copy import deepcopy -import pandas as pd -from pandas import DataFrame, Series -import numpy as np -from scipy.stats import ttest_1samp, t, norm -from scipy.signal import detrend -from scipy.spatial.distance import squareform -import six -from sklearn.metrics.pairwise import pairwise_distances from nltools.pbs_job import PBS_Job -import warnings -import shutil -import tempfile -import seaborn as sns -from pynv import Client -import matplotlib.pyplot as plt -from joblib import Parallel, delayed +from .adjacency import Adjacency # Optional dependencies -nx = attempt_to_import('networkx', 'nx') mne_stats = attempt_to_import('mne.stats',name='mne_stats', fromlist= ['spatio_temporal_cluster_1samp_test', 'ttest_1samp_no_p']) @@ -1301,575 +1297,6 @@ def bootstrap(self, function, n_samples=5000, save_weights=False, bootstrapped = Brain_Data(bootstrapped) return summarize_bootstrap(bootstrapped, save_weights=save_weights) -class Adjacency(object): - - ''' - Adjacency is a class to represent Adjacency matrices as a vector rather - than a 2-dimensional 
matrix. This makes it easier to perform data - manipulation and analyses. - - Args: - data: pandas data instance or list of files - matrix_type: (str) type of matrix. Possible values include: - ['distance','similarity','directed','distance_flat', - 'similarity_flat','directed_flat'] - Y: Pandas DataFrame of training labels - **kwargs: Additional keyword arguments - - ''' - - def __init__(self, data=None, Y=None, matrix_type=None, **kwargs): - if matrix_type is not None: - if matrix_type.lower() not in ['distance','similarity','directed','distance_flat','similarity_flat','directed_flat']: - raise ValueError("matrix_type must be [None,'distance', " - "'similarity','directed','distance_flat', " - "'similarity_flat','directed_flat']") - - if data is None: - self.data = np.array([]) - self.matrix_type = 'empty' - self.is_single_matrix = np.nan - self.issymmetric = np.nan - elif isinstance(data, list): - if isinstance(data[0], Adjacency): - tmp = concatenate(data) - for item in ['data', 'matrix_type', 'Y','issymmetric']: - setattr(self, item, getattr(tmp,item)) - else: - d_all = []; symmetric_all = []; matrix_type_all = [] - for d in data: - data_tmp, issymmetric_tmp, matrix_type_tmp, _ = self._import_single_data(d,matrix_type=matrix_type) - d_all.append(data_tmp) - symmetric_all.append(issymmetric_tmp) - matrix_type_all.append(matrix_type_tmp) - if not all_same(symmetric_all): - raise ValueError('Not all matrices are of the same symmetric ' - 'type.') - if not all_same(matrix_type_all): - raise ValueError('Not all matrices are of the same matrix ' - 'type.') - self.data = np.array(d_all) - self.issymmetric = symmetric_all[0] - self.matrix_type = matrix_type_all[0] - self.is_single_matrix = False - else: - self.data, self.issymmetric, self.matrix_type, self.is_single_matrix = self._import_single_data(data, matrix_type=matrix_type) - - if Y is not None: - if isinstance(Y, six.string_types): - if os.path.isfile(Y): - Y = pd.read_csv(Y, header=None, index_col=None) - if isinstance(Y, pd.DataFrame): - if self.data.shape[0] != len(Y): - raise ValueError("Y does not match the correct size of " - "data") - self.Y = Y - else: - raise ValueError("Make sure Y is a pandas data frame.") - else: - self.Y = pd.DataFrame() - - def __repr__(self): - return ("%s.%s(shape=%s, square_shape=%s, Y=%s, is_symmetric=%s," - "matrix_type=%s)") % ( - self.__class__.__module__, - self.__class__.__name__, - self.shape(), - self.square_shape(), - len(self.Y), - self.issymmetric, - self.matrix_type) - - def __getitem__(self,index): - new = self.copy() - if isinstance(index, int): - new.data = np.array(self.data[index, :]).flatten() - new.is_single_matrix = True - else: - new.data = np.array(self.data[index, :]) - if not self.Y.empty: - new.Y = self.Y.iloc[index] - return new - - def __len__(self): - if self.is_single_matrix: - return 1 - else: - return self.data.shape[0] - - def __iter__(self): - for x in range(len(self)): - yield self[x] - - def __add__(self, y): - new = deepcopy(self) - if isinstance(y, (int, float)): - new.data = new.data + y - if isinstance(y, Adjacency): - if self.shape() != y.shape(): - raise ValueError('Both Adjacency() instances need to be the ' - 'same shape.') - new.data = new.data + y.data - return new - - def __sub__(self, y): - new = deepcopy(self) - if isinstance(y, (int, float)): - new.data = new.data - y - if isinstance(y, Adjacency): - if self.shape() != y.shape(): - raise ValueError('Both Adjacency() instances need to be the ' - 'same shape.') - new.data = new.data - y.data - return new - - 
def __mul__(self, y): - new = deepcopy(self) - if isinstance(y, (int, float)): - new.data = new.data * y - if isinstance(y, Adjacency): - if self.shape() != y.shape(): - raise ValueError('Both Adjacency() instances need to be the ' - 'same shape.') - new.data = np.multiply(new.data, y.data) - return new - - def _import_single_data(self, data, matrix_type=None): - ''' Helper function to import single data matrix.''' - - if isinstance(data, six.string_types): - if os.path.isfile(data): - data = pd.read_csv(data) - else: - raise ValueError('Make sure you have specified a valid file ' - 'path.') - - def test_is_single_matrix(data): - if len(data.shape) == 1: - return True - else: - return False - - if matrix_type is not None: - if matrix_type.lower() == 'distance_flat': - matrix_type = 'distance' - data = np.array(data) - issymmetric = True - is_single_matrix = test_is_single_matrix(data) - elif matrix_type.lower() == 'similarity_flat': - matrix_type = 'similarity' - data = np.array(data) - issymmetric = True - is_single_matrix = test_is_single_matrix(data) - elif matrix_type.lower() == 'directed_flat': - matrix_type = 'directed' - data = np.array(data).flatten() - issymmetric = False - is_single_matrix = test_is_single_matrix(data) - elif matrix_type.lower() in ['distance', 'similarity', 'directed']: - if data.shape[0] != data.shape[1]: - raise ValueError('Data matrix must be square') - data = np.array(data) - matrix_type = matrix_type.lower() - if matrix_type in ['distance', 'similarity']: - issymmetric = True - data = data[np.triu_indices(data.shape[0], k=1)] - else: - issymmetric = False - if isinstance(data, pd.DataFrame): - data = data.values.flatten() - elif isinstance(data, np.ndarray): - data = data.flatten() - is_single_matrix = True - else: - if len(data.shape) == 1: # Single Vector - try: - data = squareform(data) - except ValueError: - print('Data is not flattened upper triangle from ' - 'similarity/distance matrix or flattened directed ' - 'matrix.') - is_single_matrix = True - elif data.shape[0] == data.shape[1]: # Square Matrix - is_single_matrix = True - else: # Rectangular Matrix - data_all = deepcopy(data) - try: - data = squareform(data_all[0, :]) - except ValueError: - print('Data is not flattened upper triangle from multiple ' - 'similarity/distance matrices or flattened directed ' - 'matrices.') - is_single_matrix = False - - # Test if matrix is symmetrical - if np.all(data[np.triu_indices(data.shape[0], k=1)] == data.T[np.triu_indices(data.shape[0], k=1)]): - issymmetric = True - else: - issymmetric = False - - # Determine matrix type - if issymmetric: - if np.sum(np.diag(data)) == 0: - matrix_type = 'distance' - elif np.sum(np.diag(data)) == data.shape[0]: - matrix_type = 'similarity' - data = data[np.triu_indices(data.shape[0], k=1)] - else: - matrix_type = 'directed' - data = data.flatten() - - if not is_single_matrix: - data = data_all - - return (data, issymmetric, matrix_type, is_single_matrix) - - def isempty(self): - '''Check if Adjacency object is empty''' - return bool(self.matrix_type is 'empty') - - def squareform(self): - '''Convert adjacency back to squareform''' - if self.issymmetric: - if self.is_single_matrix: - return squareform(self.data) - else: - return [squareform(x.data) for x in self] - else: - if self.is_single_matrix: - return self.data.reshape(int(np.sqrt(self.data.shape[0])), - int(np.sqrt(self.data.shape[0]))) - else: - return [x.data.reshape(int(np.sqrt(x.data.shape[0])), - int(np.sqrt(x.data.shape[0]))) for x in self] - - def plot(self, 
limit=3, **kwargs): - ''' Create Heatmap of Adjacency Matrix''' - if self.is_single_matrix: - return sns.heatmap(self.squareform(), square=True, **kwargs) - else: - f, a = plt.subplots(limit) - for i in range(limit): - sns.heatmap(self[i].squareform(), square=True, ax=a[i], - **kwargs) - return f - - def mean(self, axis=0): - ''' Calculate mean of Adjacency - - Args: - axis: calculate mean over features (0) or data (1). - For data it will be on upper triangle. - - Returns: - mean: float if single, adjacency if axis=0, np.array if axis=1 - and multiple - - ''' - - if self.is_single_matrix: - return np.mean(self.data) - else: - if axis == 0: - return Adjacency(data=np.mean(self.data, axis=axis), - matrix_type=self.matrix_type + '_flat') - elif axis == 1: - return np.mean(self.data, axis=axis) - - def std(self, axis=0): - ''' Calculate standard deviation of Adjacency - - Args: - axis: calculate std over features (0) or data (1). - For data it will be on upper triangle. - - Returns: - std: float if single, adjacency if axis=0, np.array if axis=1 and - multiple - - ''' - - if self.is_single_matrix: - return np.std(self.data) - else: - if axis == 0: - return Adjacency(data=np.std(self.data, axis=axis), - matrix_type=self.matrix_type + '_flat') - elif axis == 1: - return np.std(self.data, axis=axis) - - def shape(self): - ''' Calculate shape of data. ''' - return self.data.shape - - def square_shape(self): - ''' Calculate shape of squareform data. ''' - if self.matrix_type is 'empty': - return np.array([]) - else: - if self.is_single_matrix: - return self.squareform().shape - else: - return self[0].squareform().shape - - def copy(self): - ''' Create a copy of Adjacency object.''' - return deepcopy(self) - - def append(self, data): - ''' Append data to Adjacency instance - - Args: - data: Adjacency instance to append - - Returns: - out: new appended Adjacency instance - - ''' - - if not isinstance(data, Adjacency): - raise ValueError('Make sure data is a Adjacency instance.') - - if self.isempty(): - out = data.copy() - else: - out = self.copy() - if self.square_shape() != data.square_shape(): - raise ValueError('Data is not the same shape as Adjacency ' - 'instance.') - - out.data = np.vstack([self.data, data.data]) - out.is_single_matrix = False - if out.Y.size: - out.Y = self.Y.append(data.Y) - - return out - - def write(self, file_name, method='long'): - ''' Write out Adjacency object to csv file. - - Args: - file_name (str): name of file name to write - method (str): method to write out data ['long','square'] - - ''' - if method not in ['long', 'square']: - raise ValueError('Make sure method is ["long","square"].') - if self.is_single_matrix: - if method is 'long': - out = pd.DataFrame(self.data).to_csv(file_name, index=None) - elif method is 'square': - out = pd.DataFrame(self.squareform()).to_csv(file_name, - index=None) - else: - if method is 'long': - out = pd.DataFrame(self.data).to_csv(file_name, index=None) - elif method is 'square': - raise NotImplementedError('Need to decide how we should write ' - 'out multiple matrices. As separate ' - 'files?') - - def similarity(self, data, plot=False, **kwargs): - ''' Calculate similarity between two Adjacency matrices. 
- Default is to use spearman correlation and permutation test.''' - if not isinstance(data, Adjacency): - data2 = Adjacency(data) - else: - data2 = data.copy() - if self.is_single_matrix: - if plot: - plot_stacked_adjacency(self, data) - return correlation_permutation(self.data, data2.data, **kwargs) - else: - if plot: - _, a = plt.subplots(len(self)) - for i in a: - plot_stacked_adjacency(self, data, ax=i) - return [correlation_permutation(x.data, data2.data, **kwargs) for x in self] - - def distance(self, method='correlation', **kwargs): - ''' Calculate distance between images within an Adjacency() instance. - - Args: - method: type of distance metric (can use any scikit learn or - sciypy metric) - - Returns: - dist: Outputs a 2D distance matrix. - - ''' - return Adjacency(pairwise_distances(self.data, metric=method, **kwargs), - matrix_type='distance') - - def threshold(self, thresh=0, binarize=False): - '''Threshold Adjacency instance - - Args: - thresh: cutoff to threshold image (float). if 'threshold'=50%, - will calculate percentile. - binarize (bool): if 'binarize'=True then binarize output - Returns: - Brain_Data: thresholded Brain_Data instance - - ''' - - b = self.copy() - if isinstance(thresh, six.string_types): - if thresh[-1] is '%': - thresh = np.percentile(b.data, float(thresh[:-1])) - if binarize: - b.data = b.data > thresh - else: - b.data[b.data < thresh] = 0 - return b - - def to_graph(self): - ''' Convert Adjacency into networkx graph. only works on - single_matrix for now.''' - - if self.is_single_matrix: - if self.matrix_type == 'directed': - return nx.DiGraph(self.squareform()) - else: - return nx.Graph(self.squareform()) - else: - raise NotImplementedError('This function currently only works on ' - 'single matrices.') - - def ttest(self, **kwargs): - ''' Calculate ttest across samples. 
''' - if self.is_single_matrix: - raise ValueError('t-test cannot be run on single matrices.') - m = []; p = [] - for i in range(self.data.shape[1]): - stats = one_sample_permutation(self.data[:, i], **kwargs) - m.append(stats['mean']) - p.append(stats['p']) - mn = Adjacency(np.array(m)) - pval = Adjacency(np.array(p)) - return (mn, pval) - - def plot_label_distance(self, labels, ax=None): - ''' Create a violin plot indicating within and between label distance - - Args: - labels (np.array): numpy array of labels to plot - - Returns: - violin plot handles - - ''' - - if not self.is_single_matrix: - raise ValueError('This function only works on single adjacency ' - 'matrices.') - - distance = pd.DataFrame(self.squareform()) - - if len(labels) != distance.shape[0]: - raise ValueError('Labels must be same length as distance matrix') - - within = []; between = [] - out = pd.DataFrame(columns=['Distance', 'Group', 'Type'], index=None) - for i in np.unique(labels): - tmp_w = pd.DataFrame(columns=out.columns, index=None) - tmp_w['Distance'] = distance.loc[labels == i, labels == i].values[np.triu_indices(sum(labels == i), k=1)] - tmp_w['Type'] = 'Within' - tmp_w['Group'] = i - tmp_b = pd.DataFrame(columns=out.columns, index=None) - tmp_b['Distance'] = distance.loc[labels != i, labels != i].values[np.triu_indices(sum(labels == i), k=1)] - tmp_b['Type'] = 'Between' - tmp_b['Group'] = i - out = out.append(tmp_w).append(tmp_b) - f = sns.violinplot(x="Group", y="Distance", hue="Type", data=out, split=True, inner='quartile', - palette={"Within": "lightskyblue", "Between": "red"}, ax=ax) - f.set_ylabel('Average Distance') - f.set_title('Average Group Distance') - return f - - def stats_label_distance(self, labels, n_permute=5000, n_jobs=-1): - ''' Calculate permutation tests on within and between label distance. 
- - Args: - labels (np.array): numpy array of labels to plot - n_permute (int): number of permutations to run (default=5000) - - Returns: - dict: dictionary of within and between group differences - and p-values - - ''' - - if not self.is_single_matrix: - raise ValueError('This function only works on single adjacency ' - 'matrices.') - - distance = pd.DataFrame(self.squareform()) - - if len(labels) != distance.shape[0]: - raise ValueError('Labels must be same length as distance matrix') - - within = []; between = [] - out = pd.DataFrame(columns=['Distance', 'Group', 'Type'], index=None) - for i in np.unique(labels): - tmp_w = pd.DataFrame(columns=out.columns, index=None) - tmp_w['Distance'] = distance.loc[labels == i, labels == i].values[np.triu_indices(sum(labels == i), k=1)] - tmp_w['Type'] = 'Within' - tmp_w['Group'] = i - tmp_b = pd.DataFrame(columns=out.columns, index=None) - tmp_b['Distance'] = distance.loc[labels == i, labels != i].values.flatten() - tmp_b['Type'] = 'Between' - tmp_b['Group'] = i - out = out.append(tmp_w).append(tmp_b) - stats = dict() - for i in np.unique(labels): - # Within group test - tmp1 = out.loc[(out['Group'] == i) & (out['Type'] == 'Within'), 'Distance'] - tmp2 = out.loc[(out['Group'] == i) & (out['Type'] == 'Between'), 'Distance'] - stats[str(i)] = two_sample_permutation(tmp1, tmp2, - n_permute=n_permute, n_jobs=n_jobs) - return stats - - def plot_silhouette(self, labels, ax=None, permutation_test=True, - n_permute=5000, **kwargs): - '''Create a silhouette plot''' - distance = pd.DataFrame(self.squareform()) - - if len(labels) != distance.shape[0]: - raise ValueError('Labels must be same length as distance matrix') - - (f,outAll) = plot_silhouette(distance, labels, ax=None, - permutation_test=True, - n_permute=5000, **kwargs) - return (f,outAll) - - def bootstrap(self, function, n_samples=5000, save_weights=False, - n_jobs=-1, *args, **kwargs): - '''Bootstrap an Adjacency method. - - Example Useage: - b = dat.bootstrap('mean', n_samples=5000) - b = dat.bootstrap('predict', n_samples=5000, algorithm='ridge') - b = dat.bootstrap('predict', n_samples=5000, save_weights=True) - - Args: - function: (str) method to apply to data for each bootstrap - n_samples: (int) number of samples to bootstrap with replacement - save_weights: (bool) Save each bootstrap iteration - (useful for aggregating many bootstraps on a cluster) - n_jobs: (int) The number of CPUs to use to do the computation. - -1 means all CPUs.Returns: - output: summarized studentized bootstrap output - - ''' - - bootstrapped = Parallel(n_jobs=n_jobs)( - delayed(_bootstrap_apply_func)(self, - function, *args, **kwargs) for i in range(n_samples)) - bootstrapped = Adjacency(bootstrapped) - return summarize_bootstrap(bootstrapped, save_weights=save_weights) - - class Groupby(object): def __init__(self, data, mask): @@ -1942,439 +1369,3 @@ def combine(self, value_dict): raise ValueError('No method for aggregation implented for %s ' 'yet.' % type(value_dict[i])) return out.sum() - - -class Design_Matrix_Series(Series): - - """ - This is a sub-class of pandas series. While not having additional methods - of it's own required to retain normal slicing functionality for the - Design_Matrix class, i.e. how slicing is typically handled in pandas. - All methods should be called on Design_Matrix below. 
- """ - - @property - def _constructor(self): - return Design_Matrix_Series - - @property - def _constructor_expanddim(self): - return Design_Matrix - - -class Design_Matrix(DataFrame): - - """Design_Matrix is a class to represent design matrices with convenience - functionality for convolution, upsampling and downsampling. It plays - nicely with Brain_Data and can be used to build an experimental design - to pass to Brain_Data's X attribute. It is essentially an enhanced - pandas df, with extra attributes and methods. Methods always return a - new design matrix instance. - - Args: - convolved (list, optional): on what columns convolution has been performed; defaults to None - hasIntercept (bool, optional): whether the design matrix has an - intercept column; defaults to False - sampling_rate (float, optional): sampling rate of each row in seconds (e.g. TR in neuroimaging); defaults to None - - """ - - _metadata = ['sampling_rate', 'convolved', 'hasIntercept'] - - def __init__(self, *args, **kwargs): - - sampling_rate = kwargs.pop('sampling_rate',None) - convolved = kwargs.pop('convolved', None) - hasIntercept = kwargs.pop('hasIntercept', False) - self.sampling_rate = sampling_rate - self.convolved = convolved - self.hasIntercept = hasIntercept - - super(Design_Matrix, self).__init__(*args, **kwargs) - - @property - def _constructor(self): - return Design_Matrix - - @property - def _constructor_sliced(self): - return Design_Matrix_Series - - - def info(self): - """Print class meta data. - - """ - return '%s.%s(sampling_rate=%s, shape=%s, convolved=%s, hasIntercept=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.sampling_rate, - self.shape, - self.convolved, - self.hasIntercept - ) - - def append(self, df, axis, **kwargs): - """Method for concatenating another design matrix row or column-wise. - Can "uniquify" certain columns when appending row-wise, and by - default will attempt to do that with the intercept. - - Args: - axis (int): 0 for row-wise (vert-cat), 1 for column-wise (horz-cat) - separate (bool,optional): whether try and uniquify columns; - defaults to True; only applies - when axis==0 - addIntercept (bool,optional): whether to add intercepts to matrices - before appending; defaults to False; - only applies when axis==0 - uniqueCols (list,optional): what additional columns to try to keep - separated by uniquifying; defaults to - intercept only; only applies when - axis==0 - - """ - if axis == 1: - return self.horzcat(df) - elif axis == 0: - return self.vertcat(df, **kwargs) - else: - raise ValueError("Axis must be 0 (row) or 1 (column)") - - - def horzcat(self, df): - """Used by .append(). Append another design matrix, column-wise - (horz cat). Always returns a new design_matrix. - - """ - if self.shape[0] != df.shape[0]: - raise ValueError("Can't append differently sized design matrices! " - "Mat 1 has %s rows and Mat 2 has %s rows." - % (self.shape[0]), df.shape[0]) - out = pd.concat([self, df], axis=1) - out.sampling_rate = self.sampling_rate - out.convolved = self.convolved - out.hasIntercept = self.hasIntercept - return out - - - def vertcat(self, df, separate=True, addIntercept=False, uniqueCols=[]): - """Used by .append(). Append another design matrix row-wise (vert cat). - Always returns a new design matrix. - - """ - - outdf = df.copy() - assert self.hasIntercept == outdf.hasIntercept, ("Intercepts are " - "ambigious. 
Both design matrices should match in whether " - "they do or don't have intercepts.") - - if addIntercept: - self['intercept'] = 1 - self.hasIntercept = True - outdf['intercept'] = 1 - outdf.hasIntercept = True - if separate: - if self.hasIntercept: - uniqueCols += ['intercept'] - idx_1 = [] - idx_2 = [] - for col in uniqueCols: - # To match substrings within column names, we loop over each - # element and search for the uniqueCol as a substring within it; - # first we do to check if the uniqueCol actually occurs in both - # design matrices, if so then we change it's name before - # concatenating - joint = set(self.columns) & set(outdf.columns) - shared = [elem for elem in joint if col in elem] - if shared: - idx_1 += [i for i, elem in enumerate(self.columns) if col in elem] - idx_2 += [i for i, elem in enumerate(outdf.columns) if col in elem] - aRename = {self.columns[elem]: '0_'+self.columns[elem] for i, elem in enumerate(idx_1)} - bRename = {outdf.columns[elem]: '1_'+outdf.columns[elem] for i, elem in enumerate(idx_2)} - if aRename: - out = self.rename(columns=aRename) - outdf = outdf.rename(columns=bRename) - colOrder = [] - #retain original column order as closely as possible - for colA,colB in zip(out.columns, outdf.columns): - colOrder.append(colA) - if colA != colB: - colOrder.append(colB) - out = super(Design_Matrix, out).append(outdf, ignore_index=True) - # out = out.append(outdf,separate=False,axis=0,ignore_index=True).fillna(0) - out = out[colOrder] - else: - raise ValueError("Separate concatentation impossible. None of " - "the requested unique columns were found in " - "both design_matrices.") - else: - out = super(Design_Matrix, self).append(outdf, ignore_index=True) - # out = self.append(df,separate=False,axis=0,ignore_index=True).fillna(0) - - out.convolved = self.convolved - out.sampling_rate = self.sampling_rate - out.hasIntercept = self.hasIntercept - - return out - - def vif(self): - """Compute variance inflation factor amongst columns of design matrix, - ignoring the intercept. Much faster that statsmodels and more - reliable too. Uses the same method as Matlab and R (diagonal - elements of the inverted correlation matrix). - - Returns: - vifs (list): list with length == number of columns - intercept - - """ - assert self.shape[1] > 1, "Can't compute vif with only 1 column!" - if self.hasIntercept: - idx = [i for i, elem in enumerate(self.columns) if 'intercept' not in elem] - out = self[self.columns[np.r_[idx]]] - else: - out = self[self.columns] - - if any(out.corr() < 0): - warnings.warn("Correlation matrix has negative values!") - - return np.diag(np.linalg.inv(out.corr()), 0) - - def heatmap(self, figsize=(8, 6), **kwargs): - """Visualize Design Matrix spm style. Use .plot() for typical pandas - plotting functionality. Can pass optional keyword args to seaborn - heatmap. - - """ - fig, ax = plt.subplots(1, figsize=figsize) - ax = sns.heatmap(self, cmap='gray', cbar=False, ax=ax, **kwargs) - for _, spine in ax.spines.items(): - spine.set_visible(True) - for i, label in enumerate(ax.get_yticklabels()): - if i == 0 or i == self.shape[0]-1: - label.set_visible(True) - else: - label.set_visible(False) - ax.axhline(linewidth=4, color="k") - ax.axvline(linewidth=4, color="k") - ax.axhline(y=self.shape[0], color='k', linewidth=4) - ax.axvline(x=self.shape[1], color='k', linewidth=4) - plt.yticks(rotation=0) - - def convolve(self, conv_func='hrf', colNames=None): - """Perform convolution using an arbitrary function. 
- - Args: - conv_func (ndarray or string): either a 1d numpy array containing output of a function that you want to convolve; a samples by kernel 2d array of several kernels to convolve; or th string 'hrf' which defaults to a glover HRF function at the Design_matrix's sampling_freq - colNames (list): what columns to perform convolution on; defaults - to all skipping intercept, and columns containing 'poly' or 'cosine' - - """ - assert self.sampling_rate is not None, "Design_matrix has no sampling_rate set!" - - if colNames is None: - colNames = [col for col in self.columns if 'intercept' not in col and 'poly' not in col and 'cosine' not in col] - nonConvolved = [col for col in self.columns if col not in colNames] - - if isinstance(conv_func,six.string_types): - assert conv_func == 'hrf',"Did you mean 'hrf'? 'hrf' can generate a kernel for you, otherwise custom kernels should be passed in as 1d or 2d arrays." - - assert self.sampling_rate is not None, "Design_matrix sampling rate not set. Can't figure out how to generate HRF!" - conv_func = glover_hrf(self.sampling_rate, oversampling=1) - - else: - assert type(conv_func) == np.ndarray, 'Must provide a function for convolution!' - - if len(conv_func.shape) > 1: - assert conv_func.shape[0] > conv_func.shape[1], '2d conv_func must be formatted as, samples X kernels!' - conv_mats = [] - for i in range(conv_func.shape[1]): - c = self[colNames].apply(lambda x: np.convolve(x, conv_func[:,i])[:self.shape[0]]) - c.columns = [str(col)+'_c'+str(i) for col in c.columns] - conv_mats.append(c) - out = pd.concat(conv_mats+ [self[nonConvolved]], axis=1) - else: - c = self[colNames].apply(lambda x: np.convolve(x, conv_func)[:self.shape[0]]) - c.columns = [str(col)+'_c0' for col in c.columns] - out = pd.concat([c,self[nonConvolved]], axis=1) - - out.convolved = colNames - out.sampling_rate = self.sampling_rate - out.hasIntercept = self.hasIntercept - return out - - def downsample(self, target,**kwargs): - """Downsample columns of design matrix. Relies on - nltools.stats.downsample, but ensures that returned object is a - design matrix. - - Args: - target(float): downsampling target, typically in samples not - seconds - kwargs: additional inputs to nltools.stats.downsample - - """ - df = downsample(self, sampling_freq=self.sampling_rate, target=target, **kwargs) - - # convert df to a design matrix - newMat = Design_Matrix(df, sampling_rate=target, - convolved=self.convolved, - hasIntercept=self.hasIntercept) - return newMat - - def upsample(self, target,**kwargs): - """Upsample columns of design matrix. Relies on - nltools.stats.upsample, but ensures that returned object is a - design matrix. - - Args: - target(float): downsampling target, typically in samples not - seconds - kwargs: additional inputs to nltools.stats.downsample - - """ - df = upsample(self, sampling_freq=self.sampling_rate, target=target, **kwargs) - - # convert df to a design matrix - newMat = Design_Matrix(df, sampling_rate=target, - convolved=self.convolved, - hasIntercept=self.hasIntercept) - return newMat - - def zscore(self, colNames=[]): - """Z-score specific columns of design matrix. Relies on - nltools.stats.downsample, but ensures that returned object is a - design matrix. 
- - Args: - colNames (list): columns to z-score; defaults to all columns - - """ - colOrder = self.columns - if not list(colNames): - colNames = self.columns - nonZ = [col for col in self.columns if col not in colNames] - df = zscore(self[colNames]) - df = pd.concat([df, self[nonZ]], axis=1) - df = df[colOrder] - newMat = Design_Matrix(df, sampling_rate=self.sampling_rate, - convolved=self.convolved, - hasIntercept=self.hasIntercept) - - return newMat - - def addpoly(self, order=0, include_lower=True): - """Add nth order polynomial terms as columns to design matrix. - - Args: - order (int): what order terms to add; 0 = constant/intercept - (default), 1 = linear, 2 = quadratic, etc - include_lower: (bool) whether to add lower order terms if order > 0 - - """ - - #This method is kind of ugly - polyDict = {} - if include_lower: - if order > 0: - for i in range(0, order+1): - if i == 0: - if self.hasIntercept: - warnings.warn("Design Matrix already has " - "intercept...skipping") - else: - polyDict['intercept'] = np.repeat(1, self.shape[0]) - else: - polyDict['poly_' + str(i)] = (range(self.shape[0]) - np.mean(range(self.shape[0]))) ** i - else: - if self.hasIntercept: - raise ValueError("Design Matrix already has intercept!") - else: - polyDict['intercept'] = np.repeat(1, self.shape[0]) - else: - if order == 0: - if self.hasIntercept: - raise ValueError("Design Matrix already has intercept!") - else: - polyDict['intercept'] = np.repeat(1, self.shape[0]) - else: - polyDict['poly_'+str(order)] = (range(self.shape[0]) - np.mean(range(self.shape[0])))**order - - toAdd = Design_Matrix(polyDict) - out = self.append(toAdd, axis=1) - if 'intercept' in polyDict.keys() or self.hasIntercept: - out.hasIntercept = True - return out - - def add_dct_basis(self,duration=180): - """Adds cosine basis functions to Design_Matrix columns, - based on spm-style discrete cosine transform for use in - high-pass filtering. - - Args: - duration (int): length of filter in seconds - - """ - assert self.sampling_rate is not None, "Design_Matrix has no sampling_rate set!" - basis_mat = make_cosine_basis(self.shape[0],self.sampling_rate,duration) - - basis_frame = Design_Matrix(basis_mat, - sampling_rate=self.sampling_rate, - hasIntercept=False, convolved=False) - - basis_frame.columns = ['cosine_'+str(i+1) for i in range(basis_frame.shape[1])] - - out = self.append(basis_frame,axis=1) - - return out - -def concatenate(data): - '''Concatenate a list of Brain_Data() or Adjacency() objects''' - - if not isinstance(data, list): - raise ValueError('Make sure you are passing a list of objects.') - - if all([type(x) for x in data]): - if not isinstance(data[0], (Brain_Data, Adjacency)): - raise ValueError('Make sure you are passing a list of Brain_Data or Adjacency objects.') - - out = data[0].__class__() - for i in data: - out = out.append(i) - else: - raise ValueError('Make sure all objects in the list are the same type.') - return out - -def all_same(items): - return np.all(x == items[0] for x in items) - -def _vif(X, y): - """ - DEPRECATED - Helper function to compute variance inflation factor. Unclear whether - there are errors with this method relative to stats.models. Seems like - stats.models is sometimes inconsistent with R. R always uses the - diagonals of the inverted covariance matrix which is what's implemented - instead of this. 
-
-    Args:
-        X: (Dataframe) explanatory variables
-        y: (Dataframe/Series) outcome variable
-
-    """
-
-    b,resid, _, _ = np.linalg.lstsq(X, y)
-    SStot = y.var(ddof=0)*len(y)
-    if SStot == 0:
-        SStot = .0001 # to prevent divide by 0 errors
-    r2 = 1.0 - (resid/SStot)
-    if r2 == 1:
-        r2 = 0.9999 # to prevent divide by 0 errors
-    return (1.0/(1.0-r2))[0]
-
-def _bootstrap_apply_func(data, function, *args, **kwargs):
-    '''Bootstrap helper function. Sample with replacement and apply function'''
-    data_row_id = range(data.shape()[0])
-    new_dat = data[np.random.choice(data_row_id,
-                                    size=len(data_row_id),
-                                    replace=True)]
-    return getattr(new_dat, function)( *args, **kwargs)
diff --git a/nltools/data/design_matrix.py b/nltools/data/design_matrix.py
new file mode 100644
index 00000000..96c3b8a9
--- /dev/null
+++ b/nltools/data/design_matrix.py
@@ -0,0 +1,449 @@
+from __future__ import division
+
+'''
+NeuroLearn Design Matrix
+========================
+
+Classes to represent design matrices
+
+'''
+
+__all__ = ['Design_Matrix',
+           'Design_Matrix_Series']
+__author__ = ["Luke Chang"]
+__license__ = "MIT"
+
+from pandas import DataFrame, Series
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+import warnings
+import six
+from nltools.utils import (make_cosine_basis,
+                           glover_hrf)
+from nltools.stats import (pearson,
+                           fdr,
+                           threshold,
+                           fisher_r_to_z,
+                           correlation_permutation,
+                           one_sample_permutation,
+                           two_sample_permutation,
+                           downsample,
+                           upsample,
+                           zscore,
+                           )
+
+class Design_Matrix_Series(Series):
+
+    """
+    This is a sub-class of pandas Series. While it has no additional methods
+    of its own, it is required to retain normal slicing functionality for the
+    Design_Matrix class (i.e. how slicing is typically handled in pandas).
+    All methods should be called on Design_Matrix below.
+    """
+
+    @property
+    def _constructor(self):
+        return Design_Matrix_Series
+
+    @property
+    def _constructor_expanddim(self):
+        return Design_Matrix
+
+
+class Design_Matrix(DataFrame):
+
+    """Design_Matrix is a class to represent design matrices with convenience
+    functionality for convolution, upsampling and downsampling. It plays
+    nicely with Brain_Data and can be used to build an experimental design
+    to pass to Brain_Data's X attribute. It is essentially an enhanced
+    pandas DataFrame, with extra attributes and methods. Methods always
+    return a new design matrix instance.
+
+    Args:
+        convolved (list, optional): on what columns convolution has been
+                                    performed; defaults to None
+        hasIntercept (bool, optional): whether the design matrix has an
+                                       intercept column; defaults to False
+        sampling_rate (float, optional): sampling rate of each row in seconds
+                                         (e.g. TR in neuroimaging); defaults
+                                         to None
+
+    """
+
+    _metadata = ['sampling_rate', 'convolved', 'hasIntercept']
+
+    def __init__(self, *args, **kwargs):
+
+        sampling_rate = kwargs.pop('sampling_rate', None)
+        convolved = kwargs.pop('convolved', None)
+        hasIntercept = kwargs.pop('hasIntercept', False)
+        self.sampling_rate = sampling_rate
+        self.convolved = convolved
+        self.hasIntercept = hasIntercept
+
+        super(Design_Matrix, self).__init__(*args, **kwargs)
+
+    @property
+    def _constructor(self):
+        return Design_Matrix
+
+    @property
+    def _constructor_sliced(self):
+        return Design_Matrix_Series
+
+
+    def info(self):
+        """Return class meta data as a string. 
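+        For example (illustrative values), a 100-row design sampled at a
+        2.0s TR might return:
+        'nltools.data.design_matrix.Design_Matrix(sampling_rate=2.0, shape=(100, 5), convolved=None, hasIntercept=True)'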
+
+        """
+        return '%s.%s(sampling_rate=%s, shape=%s, convolved=%s, hasIntercept=%s)' % (
+            self.__class__.__module__,
+            self.__class__.__name__,
+            self.sampling_rate,
+            self.shape,
+            self.convolved,
+            self.hasIntercept
+        )
+
+    def append(self, df, axis, **kwargs):
+        """Method for concatenating another design matrix row or column-wise.
+        Can "uniquify" certain columns when appending row-wise, and by
+        default will attempt to do that with the intercept.
+
+        Args:
+            axis (int): 0 for row-wise (vert-cat), 1 for column-wise (horz-cat)
+            separate (bool,optional): whether to try to uniquify columns;
+                                      defaults to True; only applies
+                                      when axis==0
+            addIntercept (bool,optional): whether to add intercepts to matrices
+                                          before appending; defaults to False;
+                                          only applies when axis==0
+            uniqueCols (list,optional): what additional columns to try to keep
+                                        separated by uniquifying; defaults to
+                                        intercept only; only applies when
+                                        axis==0
+
+        """
+        if axis == 1:
+            return self.horzcat(df)
+        elif axis == 0:
+            return self.vertcat(df, **kwargs)
+        else:
+            raise ValueError("Axis must be 0 (row) or 1 (column)")
+
+
+    def horzcat(self, df):
+        """Used by .append(). Append another design matrix, column-wise
+        (horz cat). Always returns a new design_matrix.
+
+        """
+        if self.shape[0] != df.shape[0]:
+            raise ValueError("Can't append differently sized design matrices! "
+                             "Mat 1 has %s rows and Mat 2 has %s rows."
+                             % (self.shape[0], df.shape[0]))
+        out = pd.concat([self, df], axis=1)
+        out.sampling_rate = self.sampling_rate
+        out.convolved = self.convolved
+        out.hasIntercept = self.hasIntercept
+        return out
+
+
+    def vertcat(self, df, separate=True, addIntercept=False, uniqueCols=[]):
+        """Used by .append(). Append another design matrix row-wise (vert cat).
+        Always returns a new design matrix.
+
+        """
+
+        outdf = df.copy()
+        assert self.hasIntercept == outdf.hasIntercept, ("Intercepts are "
+                "ambiguous. Both design matrices should match in whether "
+                "they do or don't have intercepts.")
+
+        if addIntercept:
+            self['intercept'] = 1
+            self.hasIntercept = True
+            outdf['intercept'] = 1
+            outdf.hasIntercept = True
+        if separate:
+            if self.hasIntercept:
+                uniqueCols += ['intercept']
+            idx_1 = []
+            idx_2 = []
+            for col in uniqueCols:
+                # To match substrings within column names, we loop over each
+                # element and search for the uniqueCol as a substring within it;
+                # first we check whether the uniqueCol actually occurs in both
+                # design matrices, and if so we change its name before
+                # concatenating
+                joint = set(self.columns) & set(outdf.columns)
+                shared = [elem for elem in joint if col in elem]
+                if shared:
+                    idx_1 += [i for i, elem in enumerate(self.columns) if col in elem]
+                    idx_2 += [i for i, elem in enumerate(outdf.columns) if col in elem]
+            aRename = {self.columns[elem]: '0_'+self.columns[elem] for i, elem in enumerate(idx_1)}
+            bRename = {outdf.columns[elem]: '1_'+outdf.columns[elem] for i, elem in enumerate(idx_2)}
+            if aRename:
+                out = self.rename(columns=aRename)
+                outdf = outdf.rename(columns=bRename)
+                colOrder = []
+                # retain original column order as closely as possible
+                for colA, colB in zip(out.columns, outdf.columns):
+                    colOrder.append(colA)
+                    if colA != colB:
+                        colOrder.append(colB)
+                out = super(Design_Matrix, out).append(outdf, ignore_index=True)
+                out = out[colOrder]
+            else:
+                raise ValueError("Separate concatenation impossible. 
+
+    def vertcat(self, df, separate=True, addIntercept=False, uniqueCols=[]):
+        """Used by .append(). Append another design matrix row-wise (vert cat).
+        Always returns a new design matrix.
+
+        """
+
+        outdf = df.copy()
+        assert self.hasIntercept == outdf.hasIntercept, ("Intercepts are "
+                "ambiguous. Both design matrices should match in whether "
+                "they do or don't have intercepts.")
+
+        if addIntercept:
+            self['intercept'] = 1
+            self.hasIntercept = True
+            outdf['intercept'] = 1
+            outdf.hasIntercept = True
+        if separate:
+            if self.hasIntercept:
+                uniqueCols = uniqueCols + ['intercept']
+            idx_1 = []
+            idx_2 = []
+            for col in uniqueCols:
+                # To match substrings within column names, we loop over each
+                # element and search for the uniqueCol as a substring within it;
+                # first we check whether the uniqueCol actually occurs in both
+                # design matrices, and if so we change its name before
+                # concatenating
+                joint = set(self.columns) & set(outdf.columns)
+                shared = [elem for elem in joint if col in elem]
+                if shared:
+                    idx_1 += [i for i, elem in enumerate(self.columns) if col in elem]
+                    idx_2 += [i for i, elem in enumerate(outdf.columns) if col in elem]
+            aRename = {self.columns[elem]: '0_'+self.columns[elem] for elem in idx_1}
+            bRename = {outdf.columns[elem]: '1_'+outdf.columns[elem] for elem in idx_2}
+            if aRename:
+                out = self.rename(columns=aRename)
+                outdf = outdf.rename(columns=bRename)
+                colOrder = []
+                # retain original column order as closely as possible
+                for colA, colB in zip(out.columns, outdf.columns):
+                    colOrder.append(colA)
+                    if colA != colB:
+                        colOrder.append(colB)
+                out = super(Design_Matrix, out).append(outdf, ignore_index=True)
+                out = out[colOrder]
+            else:
+                raise ValueError("Separate concatenation impossible. None of "
+                                 "the requested unique columns were found in "
+                                 "both design matrices.")
+        else:
+            out = super(Design_Matrix, self).append(outdf, ignore_index=True)
+
+        out.convolved = self.convolved
+        out.sampling_rate = self.sampling_rate
+        out.hasIntercept = self.hasIntercept
+
+        return out
+
+    def vif(self):
+        """Compute variance inflation factors amongst columns of the design
+        matrix, ignoring the intercept. Much faster than statsmodels and more
+        reliable too. Uses the same method as Matlab and R (diagonal
+        elements of the inverted correlation matrix).
+
+        Returns:
+            vifs (list): list with length == number of columns - intercept
+
+        """
+        assert self.shape[1] > 1, "Can't compute vif with only 1 column!"
+        if self.hasIntercept:
+            idx = [i for i, elem in enumerate(self.columns) if 'intercept' not in elem]
+            out = self[self.columns[np.r_[idx]]]
+        else:
+            out = self[self.columns]
+
+        if np.any(out.corr().values < 0):
+            warnings.warn("Correlation matrix has negative values!")
+
+        return np.diag(np.linalg.inv(out.corr()), 0)
+
+    def heatmap(self, figsize=(8, 6), **kwargs):
+        """Visualize Design Matrix spm style. Use .plot() for typical pandas
+        plotting functionality. Can pass optional keyword args to seaborn
+        heatmap.
+
+        """
+        fig, ax = plt.subplots(1, figsize=figsize)
+        ax = sns.heatmap(self, cmap='gray', cbar=False, ax=ax, **kwargs)
+        for _, spine in ax.spines.items():
+            spine.set_visible(True)
+        for i, label in enumerate(ax.get_yticklabels()):
+            if i == 0 or i == self.shape[0]-1:
+                label.set_visible(True)
+            else:
+                label.set_visible(False)
+        ax.axhline(linewidth=4, color="k")
+        ax.axvline(linewidth=4, color="k")
+        ax.axhline(y=self.shape[0], color='k', linewidth=4)
+        ax.axvline(x=self.shape[1], color='k', linewidth=4)
+        plt.yticks(rotation=0)
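+
+    # Sketch of a typical vif() check; the threshold of 5 is a common rule
+    # of thumb, not something this patch prescribes:
+    #
+    #   predictors = [c for c in dm.columns if 'intercept' not in c]
+    #   for col, v in zip(predictors, dm.vif()):
+    #       if v > 5:
+    #           print('%s may be collinear (VIF=%.2f)' % (col, v))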
+
+    def convolve(self, conv_func='hrf', colNames=None):
+        """Perform convolution using an arbitrary function.
+
+        Args:
+            conv_func (ndarray or str): either a 1d numpy array containing the
+                                        output of a function that you want to
+                                        convolve; a samples by kernels 2d array
+                                        of several kernels to convolve; or the
+                                        string 'hrf', which defaults to a glover
+                                        HRF function at the Design_Matrix's
+                                        sampling_rate
+            colNames (list): what columns to perform convolution on; defaults
+                             to all columns, skipping the intercept and any
+                             columns containing 'poly' or 'cosine'
+
+        """
+        assert self.sampling_rate is not None, "Design_Matrix has no sampling_rate set!"
+
+        if colNames is None:
+            colNames = [col for col in self.columns if 'intercept' not in col and 'poly' not in col and 'cosine' not in col]
+        nonConvolved = [col for col in self.columns if col not in colNames]
+
+        if isinstance(conv_func, six.string_types):
+            assert conv_func == 'hrf', "Did you mean 'hrf'? 'hrf' can generate a kernel for you, otherwise custom kernels should be passed in as 1d or 2d arrays."
+            conv_func = glover_hrf(self.sampling_rate, oversampling=1)
+
+        else:
+            assert isinstance(conv_func, np.ndarray), 'Must provide a 1d or 2d numpy array for convolution!'
+
+        if len(conv_func.shape) > 1:
+            assert conv_func.shape[0] > conv_func.shape[1], '2d conv_func must be formatted as samples X kernels!'
+            conv_mats = []
+            for i in range(conv_func.shape[1]):
+                c = self[colNames].apply(lambda x: np.convolve(x, conv_func[:, i])[:self.shape[0]])
+                c.columns = [str(col)+'_c'+str(i) for col in c.columns]
+                conv_mats.append(c)
+            out = pd.concat(conv_mats + [self[nonConvolved]], axis=1)
+        else:
+            c = self[colNames].apply(lambda x: np.convolve(x, conv_func)[:self.shape[0]])
+            c.columns = [str(col)+'_c0' for col in c.columns]
+            out = pd.concat([c, self[nonConvolved]], axis=1)
+
+        out.convolved = colNames
+        out.sampling_rate = self.sampling_rate
+        out.hasIntercept = self.hasIntercept
+        return out
+
+    def downsample(self, target, **kwargs):
+        """Downsample columns of design matrix. Relies on
+        nltools.stats.downsample, but ensures that the returned object is a
+        design matrix.
+
+        Args:
+            target (float): downsampling target, typically in samples not
+                            seconds
+            kwargs: additional inputs to nltools.stats.downsample
+
+        """
+        df = downsample(self, sampling_freq=self.sampling_rate, target=target, **kwargs)
+
+        # convert df to a design matrix
+        newMat = Design_Matrix(df, sampling_rate=target,
+                               convolved=self.convolved,
+                               hasIntercept=self.hasIntercept)
+        return newMat
+
+    def upsample(self, target, **kwargs):
+        """Upsample columns of design matrix. Relies on
+        nltools.stats.upsample, but ensures that the returned object is a
+        design matrix.
+
+        Args:
+            target (float): upsampling target, typically in samples not
+                            seconds
+            kwargs: additional inputs to nltools.stats.upsample
+
+        """
+        df = upsample(self, sampling_freq=self.sampling_rate, target=target, **kwargs)
+
+        # convert df to a design matrix
+        newMat = Design_Matrix(df, sampling_rate=target,
+                               convolved=self.convolved,
+                               hasIntercept=self.hasIntercept)
+        return newMat
+
+    def zscore(self, colNames=[]):
+        """Z-score specific columns of design matrix. Relies on
+        nltools.stats.zscore, but ensures that the returned object is a
+        design matrix.
+
+        Args:
+            colNames (list): columns to z-score; defaults to all columns
+
+        """
+        colOrder = self.columns
+        if not list(colNames):
+            colNames = self.columns
+        nonZ = [col for col in self.columns if col not in colNames]
+        df = zscore(self[colNames])
+        df = pd.concat([df, self[nonZ]], axis=1)
+        df = df[colOrder]
+        newMat = Design_Matrix(df, sampling_rate=self.sampling_rate,
+                               convolved=self.convolved,
+                               hasIntercept=self.hasIntercept)
+
+        return newMat
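+
+    # Sketch of a typical preprocessing chain (dm is a hypothetical
+    # Design_Matrix with sampling_rate set; the target value is illustrative):
+    #
+    #   dm_conv = dm.convolve()               # glover HRF at dm.sampling_rate
+    #   dm_ds = dm_conv.downsample(target=1)  # still returns a Design_Matrix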
+
+    def addpoly(self, order=0, include_lower=True):
+        """Add nth order polynomial terms as columns to design matrix.
+
+        Args:
+            order (int): what order terms to add; 0 = constant/intercept
+                         (default), 1 = linear, 2 = quadratic, etc.
+            include_lower (bool): whether to add lower order terms if order > 0
+
+        """
+
+        polyDict = {}
+        if include_lower:
+            if order > 0:
+                for i in range(0, order+1):
+                    if i == 0:
+                        if self.hasIntercept:
+                            warnings.warn("Design Matrix already has "
+                                          "intercept...skipping")
+                        else:
+                            polyDict['intercept'] = np.repeat(1, self.shape[0])
+                    else:
+                        polyDict['poly_' + str(i)] = (np.arange(self.shape[0]) - np.mean(np.arange(self.shape[0]))) ** i
+            else:
+                if self.hasIntercept:
+                    raise ValueError("Design Matrix already has intercept!")
+                else:
+                    polyDict['intercept'] = np.repeat(1, self.shape[0])
+        else:
+            if order == 0:
+                if self.hasIntercept:
+                    raise ValueError("Design Matrix already has intercept!")
+                else:
+                    polyDict['intercept'] = np.repeat(1, self.shape[0])
+            else:
+                polyDict['poly_'+str(order)] = (np.arange(self.shape[0]) - np.mean(np.arange(self.shape[0]))) ** order
+
+        toAdd = Design_Matrix(polyDict)
+        out = self.append(toAdd, axis=1)
+        if 'intercept' in polyDict.keys() or self.hasIntercept:
+            out.hasIntercept = True
+        return out
+
+    def add_dct_basis(self, duration=180):
+        """Add cosine basis functions to Design_Matrix columns, based on
+        spm-style discrete cosine transform, for use in high-pass filtering.
+
+        Args:
+            duration (int): length of filter in seconds
+
+        """
+        assert self.sampling_rate is not None, "Design_Matrix has no sampling_rate set!"
+        basis_mat = make_cosine_basis(self.shape[0], self.sampling_rate, duration)
+
+        basis_frame = Design_Matrix(basis_mat,
+                                    sampling_rate=self.sampling_rate,
+                                    hasIntercept=False, convolved=None)
+
+        basis_frame.columns = ['cosine_'+str(i+1) for i in range(basis_frame.shape[1])]
+
+        out = self.append(basis_frame, axis=1)
+
+        return out
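+
+    # Sketch of typical nuisance-regressor usage (dm is a hypothetical
+    # Design_Matrix with sampling_rate set):
+    #
+    #   dm_poly = dm.addpoly(order=2)      # intercept + linear + quadratic
+    #   dm_full = dm_poly.add_dct_basis()  # spm-style high-pass cosine basis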
+
+def all_same(items):
+    return np.all([x == items[0] for x in items])
+
+def _vif(X, y):
+    """
+    DEPRECATED
+    Helper function to compute variance inflation factor. Unclear whether
+    there are errors with this method relative to statsmodels, which itself
+    is sometimes inconsistent with R. R uses the diagonals of the inverted
+    correlation matrix, which is what Design_Matrix.vif() implements instead
+    of this approach.
+
+    Args:
+        X: (Dataframe) explanatory variables
+        y: (Dataframe/Series) outcome variable
+
+    """
+
+    b, resid, _, _ = np.linalg.lstsq(X, y)
+    SStot = y.var(ddof=0)*len(y)
+    if SStot == 0:
+        SStot = .0001  # to prevent divide by 0 errors
+    r2 = 1.0 - (resid/SStot)
+    if r2 == 1:
+        r2 = 0.9999  # to prevent divide by 0 errors
+    return (1.0/(1.0-r2))[0]
diff --git a/nltools/utils.py b/nltools/utils.py
index 9c82f970..ef62b352 100644
--- a/nltools/utils.py
+++ b/nltools/utils.py
@@ -13,7 +13,12 @@
     'spm_time_derivative',
     'glover_time_derivative',
     'spm_dispersion_derivative',
-    'make_cosine_basis']
+    'make_cosine_basis',
+    'attempt_to_import',
+    'all_same',
+    'concatenate',
+    '_bootstrap_apply_func',
+    ]
 __author__ = ["Luke Chang"]
 __license__ = "MIT"
@@ -274,7 +279,8 @@ def make_cosine_basis(nsamples,sampling_freq,filter_length):
     #Drop constant
     C = C[:,1:]
     if C.size == 0:
-        raise ValueError('Basis function creation failed! nsamples is too small for requested filter_length.')
+        raise ValueError('Basis function creation failed!'
+                         ' nsamples is too small for requested filter_length.')
     else:
         return C
@@ -294,3 +300,33 @@ def attempt_to_import(dependency, name=None, fromlist=None):
         mod = None
     module_names[name] = Dependency(dependency, mod)
     return mod
+
+def all_same(items):
+    return np.all([x == items[0] for x in items])
+
+def concatenate(data):
+    '''Concatenate a list of Brain_Data() or Adjacency() objects'''
+
+    if not isinstance(data, list):
+        raise ValueError('Make sure you are passing a list of objects.')
+
+    if all([isinstance(x, type(data[0])) for x in data]):
+        # Temporarily removing this check because of circular imports (LC)
+        # if not isinstance(data[0], (Brain_Data, Adjacency)):
+        #     raise ValueError('Make sure you are passing a list of Brain_Data'
+        #                      ' or Adjacency objects.')
+
+        out = data[0].__class__()
+        for i in data:
+            out = out.append(i)
+    else:
+        raise ValueError('Make sure all objects in the list are the same type.')
+    return out
+
+def _bootstrap_apply_func(data, function, *args, **kwargs):
+    '''Bootstrap helper function. Sample with replacement and apply function'''
+    data_row_id = range(data.shape()[0])
+    new_dat = data[np.random.choice(data_row_id,
+                                    size=len(data_row_id),
+                                    replace=True)]
+    return getattr(new_dat, function)(*args, **kwargs)

From cfcdd69c733c308e19691d10c6c17163a81fc2ef Mon Sep 17 00:00:00 2001
From: Luke Chang
Date: Sun, 12 Nov 2017 17:21:21 -0500
Subject: [PATCH 2/2] moved all_same to utils

Former-commit-id: ba5f15d8a0e332d050388da2fbb11e0b3b5c2454
---
 nltools/data/design_matrix.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/nltools/data/design_matrix.py b/nltools/data/design_matrix.py
index 96c3b8a9..a94e1f9a 100644
--- a/nltools/data/design_matrix.py
+++ b/nltools/data/design_matrix.py
@@ -421,9 +421,6 @@ def add_dct_basis(self, duration=180):
 
     return out
 
-def all_same(items):
-    return np.all([x == items[0] for x in items])
-
 def _vif(X, y):
     """
     DEPRECATED