From 3cd6b494d33aad05e4a5b1ee219f0c29ae49ef7c Mon Sep 17 00:00:00 2001
From: Rishi Kulkarni
Date: Mon, 3 May 2021 01:19:43 -0700
Subject: [PATCH] comments, etc.

---
 hierarch/internal_functions.py | 308 ++++++++++++++++-----------------
 hierarch/stats.py              |  10 +-
 2 files changed, 154 insertions(+), 164 deletions(-)

diff --git a/hierarch/internal_functions.py b/hierarch/internal_functions.py
index 93735aa..3855378 100644
--- a/hierarch/internal_functions.py
+++ b/hierarch/internal_functions.py
@@ -1,14 +1,45 @@
 import numpy as np
 import numba as nb
-import scipy.stats as stats
-from collections import Counter
 import sympy.utilities.iterables as iterables
 import hierarch.numba_overloads
 
-@nb.njit
+@nb.jit(nopython=True, cache=True)
+def nb_tuple(a,b):
+    '''
+    Makes a tuple from two numbers.
+
+    Parameters
+    ----------
+    a, b: numbers of the same dtype (or that can be cast to the same dtype)
+
+    Returns
+    ----------
+    tuple(a, b)
+
+    '''
+    return tuple((a,b))
+
+@nb.jit(nopython=True, cache=True)
 def nb_data_grabber(data, col, treatment_labels):
+    '''
+    Numba-accelerated fancy indexing.
+
+    Parameters
+    ----------
+    data: 2D array
+
+    col: int
+
+    treatment_labels: 1D array or list
+
+    Returns
+    ----------
+    ret_list: list of 1D arrays
+
+    '''
     ret_list = []
     for key in treatment_labels:
+        #grab values from the data column for each label
         ret_list.append(data[:,-1][np.equal(data[:,col],key)])
     return ret_list
 
@@ -56,31 +87,40 @@ def nb_unique(input_data, axis=0):
         idx = np.append(idx, additional_uniques)
 
     #get counts for each unique row
-    counts = [1]
     counts = np.append(idx[1:], data.shape[0])
     counts = counts - idx
     return data[idx], orig_idx[idx], counts
 
-@nb.jit(nopython=True)
+@nb.jit(nopython=True, cache=True)
 def welch_statistic(data, col, treatment_labels):
     '''
-    Internal function that calculates Welch's t statistic.
+    Calculates Welch's t statistic.
 
-    Details on the validity of this test statistic can be found in "Studentized permutation tests for non-i.i.d. hypotheses and the generalized Behrens-Fisher problem
-" by Arnold Janssen. https://doi.org/10.1016/S0167-7152(97)00043-6.
+    Takes a 2D data matrix, a column to classify data by, and the labels corresponding to the data of interest. Assumes that the last (-1) column in the data matrix is the dependent variable.
+
     Parameters
     ----------
-
-    sample_a, sample_b: 1D arrays
+    data: 2D array
+        Data matrix. The first n - 1 columns are the design matrix and the nth column is the dependent variable.
+
+    col: int
+        Column of the design matrix used to classify the dependent variable into two groups.
+
+    treatment_labels:
+        The values of the elements in col.
 
     Returns
    ----------
     t: float64
+        Welch's t statistic.
 
-    Note: The formula for Welch's t reduces to Student's t when sample_a and sample_b are the same size, so use this function whenever you need a t statistic.
-
+    Notes
+    ----------
+    Details on the validity of this test statistic can be found in "Studentized permutation tests for non-i.i.d. hypotheses and the generalized Behrens-Fisher problem" by Arnold Janssen. https://doi.org/10.1016/S0167-7152(97)00043-6.
+
     '''
 
     #get our two samples from the data matrix
@@ -89,70 +129,110 @@ def welch_statistic(data, col, treatment_labels):
     #mean difference
     meandiff = (np.mean(sample_a) - np.mean(sample_b))
 
-    #weighted sample variances
+    #weighted sample variances - might be able to speed this up
     var_weight_one = (np.var(sample_a)*(sample_a.size/(sample_a.size - 1))) / len(sample_a)
     var_weight_two = (np.var(sample_b)*(sample_b.size/(sample_b.size - 1))) / len(sample_b)
 
     #compute t statistic
     t = meandiff / np.sqrt(var_weight_one + var_weight_two)
+
     return t
 
-@nb.jit(nopython=True)
-def quick_shuffle(w):
-    '''
-    This is faster than np.random.shuffle for some reason.
+@nb.jit(nopython=True, cache=True)
+def iter_return(data, col_to_permute, iterator, counts):
     '''
-    np.random.shuffle(w)
-
-
-@nb.jit()
-def randomize_chunks(shuffled_col, splits):
-    '''
-    Internal function for permuting a column a data while paying attention to the dependency structure of the prior column. Numba's implementation of np.random.permutation is faster than numpy's, so we're using this.
+    Shuffles a column based on an input. Cannot be cluster-aware.
 
     Parameters
     ----------
-    values: 2D array
-        2D array of unique rows, second-to-last column is the column to be permuted
-    keys: 2D array
-        2D array of unique rows of the values array, not including the final row
+    data: 2D array of floats
+        Original data matrix
+
+    col_to_permute: int
+        Column n, which is immediately right of the column that will be shuffled.
+
+    iterator: 1D array
+        Shuffled values to put into data.
+
+    counts: 1D array of ints
+        Number of times each value in iterator needs to be repeated (typically 1)
 
     Returns
     ----------
-    append_col: list
-        List of permuted values of the second-to-last column of the values array
+
+    permute: 2D array
+        Array the same shape as data, but with column n-1 resampled with replacement.
 
     '''
-    for idx, _ in enumerate(splits):
-        if idx < len(splits)-1:
-            np.random.shuffle(shuffled_col[splits[idx]:splits[idx+1]-1])
-        else:
-            np.random.shuffle(shuffled_col[splits[idx]:])
+
+    #the shuffled column is defined by an input variable
+    shuffled_col_values = np.array(iterator)
+
+    #check if the shuffled column needs to be duplicated to fit back into the original matrix
+    if len(shuffled_col_values) < data.shape[0]:
+        shuffled_col_values = np.repeat(shuffled_col_values, counts)
+
+    permute = data.copy()
+    permute[:,col_to_permute-1] = shuffled_col_values
+    return permute
+
 @nb.jit(nopython=True, cache=True)
-def np_all_axis1(x):
-    """
-    Numba compatible version of np.all(x, axis=1)
+def randomized_return(data, col_to_permute, shuffled_col_values, keys, counts):
+    '''
+    Randomly shuffles a column in a cluster-aware fashion if necessary.
 
     Parameters
     ----------
-    x: 2D array
+    data: 2D array of floats
+        Original data matrix
+
+    col_to_permute: int
+        Column n, which is immediately right of the column that will be shuffled.
+
+    shuffled_col_values: 1D array
+        Values in the column to be shuffled
+
+    keys: 1D array of ints
+        Indexes of the clusters in the shuffled column. If there is no clustering in the column to be shuffled, this still needs to be a 1D array to get Numba to behave properly.
+
+    counts: 1D array of ints
+        Number of times each value in shuffled_col_values needs to be repeated (typically 1)
 
     Returns
     ----------
-    out: 1D array
-        Boolean array the same length as x in axis 0
-    """
-    out = np.ones(x.shape[0], dtype=np.bool8)
-    for i in range(x.shape[1]):
-        out = np.logical_and(out, x[:, i])
-    return out
+    permute: 2D array
+        Array the same shape as data, but with column n-1 resampled with replacement.
+
+    '''
+
+    #if there are no clusters, just shuffle the column
+    if col_to_permute == 1:
+        np.random.shuffle(shuffled_col_values)
+
+    #if not, shuffle between the indices in keys
+    else:
+        for idx, _ in enumerate(keys):
+            if idx < len(keys)-1:
+                np.random.shuffle(shuffled_col_values[keys[idx]:keys[idx+1]-1])
+            else:
+                np.random.shuffle(shuffled_col_values[keys[idx]:])
+
+    #check if the shuffled column needs to be duplicated to fit back into the original matrix
+    if len(shuffled_col_values) < data.shape[0]:
+        shuffled_col_values = np.repeat(shuffled_col_values, counts)
+
+    permute = data.copy()
+    permute[:,col_to_permute-1] = shuffled_col_values
+
+    return permute
 
-@nb.jit(nopython=True)
+@nb.jit(nopython=True, cache=True)
 def nb_reindexer(resampled_idx, data, columns_to_resample, cluster_dict, randnos, start, shape):
     '''
     Internal function for shuffling a design matrix.
@@ -185,7 +265,7 @@ def nb_reindexer(resampled_idx, data, columns_to_resample, cluster_dict, randnos
     ----------
 
     resampled: 2D array
-        Cluster-aware resampling of the input data array
+        Nested bootstrapped resample of the input data array
 
     '''
 
@@ -213,7 +293,7 @@ def nb_reindexer(resampled_idx, data, columns_to_resample, cluster_dict, randnos
     resampled = data[resampled_idx]
     return resampled
 
-def permute_column(data, col_to_permute=-2, iterator=None, seed=None):
+def permute_column(data, col_to_permute=-2, iterator=None):
     """
     This function takes column n and permutes the column n - 1 while accounting for the clustering in column n - 2. This function is memoized based on the hash of the data and col_to_permute variable, which improves performance significantly.
@@ -226,16 +306,16 @@ def permute_column(data, col_to_permute=-2, iterator=None, seed=None):
     col_to_permute: int
         Column n, which is immediately right of the column that will be shuffled.
 
-    iterator: an iterator representing the multiset of permutations of column n - 1
-        In very small samples (n = 3 or 4), it is worthwhile to iterate through every permutation rather than hoping to randomly sample all of them. To use this, construct a multiset of permutations and iterate through using a for loop. Even at n = 5, it's faster to generate the multiset and randomly choose 100 elements from it than it is to approach it randomly.
+    iterator: 1D array
+        One permutation drawn from the multiset of permutations of column n - 1. In very small samples (n = 3 or 4), it is worthwhile to iterate through every permutation rather than hoping to randomly sample all of them. To use this, construct a multiset of permutations and iterate through it with a for loop.
 
     Returns
     ---------
 
     permuted: an array the same size as data with column n - 1 permuted within column n - 2's clusters (if there are any).
 
""" - key = hash(data[:,:col_to_permute+1].tobytes()) + key = hash(data[:,col_to_permute-1:col_to_permute+1].tobytes()) try: values, indexes, counts = permute_column.__dict__[key] @@ -246,50 +326,23 @@ def permute_column(data, col_to_permute=-2, iterator=None, seed=None): permute_column.__dict__.pop(list(permute_column.__dict__)[0]) - if iterator == None: + if iterator is not None: + return iter_return(data, col_to_permute, tuple(iterator), counts) + + else: shuffled_col_values = data[:,col_to_permute-1][indexes] - if col_to_permute == 1: - quick_shuffle(shuffled_col_values) - else: + try: keys = unique_idx_w_cache(values)[-2] - randomize_chunks(shuffled_col_values, keys) + except: + keys = unique_idx_w_cache(values)[-1] + return randomized_return(data, col_to_permute, shuffled_col_values, keys, counts) + - else: - shuffled_col_values = iterator - - - if len(shuffled_col_values) < data.shape[0]: - shuffled_col_values = np.repeat(shuffled_col_values, counts) - - permute = np.array(data) - permute[:,col_to_permute-1] = shuffled_col_values - - return permute - -def bootstrap_agg(bootstrap_sample, func=np.nanmean, agg_to=-2, first_data_col=-1): - - ''' - Groupby_aggregate function, but slower than mean_agg below. Probably not worth calling in most cases unless you have a strange aggregation to do, but you will probably save a lot of time in the end by jitting your aggregation in numba. - ''' - - iterations = first_data_col - agg_to - for i in range(iterations): - append_col = np.empty(0) - for key in np.unique(bootstrap_sample[:,first_data_col - 1]): - append_col = np.append(append_col, func(bootstrap_sample[bootstrap_sample[:, first_data_col - 1] == key][:,first_data_col])) - append_col = append_col.reshape(append_col.size,1) - bootstrap_sample = bootstrap_sample[:,:-1] - idx = np.unique(bootstrap_sample, axis=0, return_index=True)[1] - bootstrap_sample = bootstrap_sample[np.sort(idx)] - bootstrap_sample = bootstrap_sample[:,:-1] - bootstrap_sample = np.append(bootstrap_sample, append_col,axis=1) - return bootstrap_sample - def mean_agg(data, ref='None', groupby=-3): """ - Performs a "groupby" aggregation by taking the mean. Much better performance than bootstrap_agg above, but can only be used for quantities that can be calculated element-wise (such as mean.) + Performs a "groupby" aggregation by taking the mean. Can only be used for quantities that can be calculated element-wise (such as mean.) Potentially workable with Numba guvectorized ufuncs, too. Parameters ---------- @@ -302,10 +355,9 @@ def mean_agg(data, ref='None', groupby=-3): Returns ---------- - permute: array + data_agg: array A reduced array such that the labels in column groupby (now column index -2) are no longer duplicated and column index -1 contains averaged data values. - """ if isinstance(ref, str): key = hash((data[:,:groupby+1].tobytes(), ref)) @@ -403,63 +455,6 @@ def bootstrap_sample(data, start=0, data_col=-1, skip=[], seed=None): return resampled - - - -# def relabel_col(resample_data, original_data, col): - - -# """ -# Relabels in-place duplicate clusters for later calculations that require aggregation based on cluster. This function requires input of the raw (unresampled) data to determine how large each cluster should be. - -# Note: This is going to be deprecated as it is much slower than make_ufunc_list. - -# Parameters -# ---------- - -# resample_data, original_data: arrays (dtype='float') -# The column to relabel must have the same column index in both data sets. 
-
-#     col: int
-#         Column index of the column to relabel
-
-#     Returns
-#     ----------
-
-#     Modifies resample_data in-place by adding multiples of 0.01 to repeated clusters. If the test statistic of interest does not require aggregation by cluster, this function is not doing much aside from adding a little overhead.
-
-#     """
-
-#     a = resample_data[:,:col+1]
-#     b = original_data[:,:col+1]
-
-#     unique, counts = nb_unique(a)[0::2]
-#     unique = np.frombuffer(unique.tobytes(), dtype = np.dtype('S{:d}'.format(unique.shape[1] * unique.dtype.itemsize)))
-#     ca = dict(zip(unique, counts))
-#     ca = Counter(ca)
-
-#     unique, counts = nb_unique(b)[0::2]
-#     unique = np.frombuffer(unique.tobytes(), dtype = np.dtype('S{:d}'.format(unique.shape[1] * unique.dtype.itemsize)))
-#     cb = dict(zip(unique, counts))
-#     cb = Counter(cb)
-
-
-#     relabel = 0.99
-
-#     while any((ca - cb) & cb):
-#         idx=[]
-#         for key in (ca - cb) & cb:
-#             idx.append(list(np.where(np_all_axis1(a == np.frombuffer(key)))[0][-(cb)[key]:]))
-
-#         np.add.at(resample_data[:,col], np.concatenate(idx), relabel)
-
-#         a = resample_data[:,:col+1]
-#         unique, counts = nb_unique(a)[0::2]
-#         unique = np.frombuffer(unique.tobytes(), dtype = np.dtype('S{:d}'.format(unique.shape[1] * unique.dtype.itemsize)))
-#         ca = dict(zip(unique, counts))
-#         ca = Counter(ca)
-#         relabel -= 0.01
-
 def msp(items):
     '''Yield the permutations of `items` where items is either a list
@@ -568,7 +563,7 @@ def label_encode(a):
             a[:,i] = np.unique(a[:,i], return_inverse=True)[1] + 1
     return a
 
-@nb.jit
+@nb.jit(nopython=True, cache=True)
 def make_ufunc_list(target, ref):
     '''
     Makes a list of indices to perform a ufunc.reduceat operation along. This is necessary when an aggregation operation is performed while grouping by a column that was resampled.
@@ -592,17 +587,17 @@ def make_ufunc_list(target, ref):
     reference = ref
 
     for i in range(reference.shape[1] - target.shape[1]):
         reference, counts = nb_unique(reference[:,:-1])[0::2]
-        reference = np.hstack((reference, counts.reshape(1,-1).T)).astype(np.int64)
+        counts = counts.astype(np.int64)
 
     ufunc_list = np.empty(0, dtype=np.int64)
     i = 0
 
     while i < target.shape[0]:
         ufunc_list = np.append(ufunc_list, i)
-        i += reference[:,-1][np_all_axis1(reference[:,:-1] == target[i])][0]
+        i += counts[np.all((reference == target[i]), axis=1)][0]
 
     return ufunc_list
 
-@nb.jit(nopython=True)
+@nb.jit(nopython=True, cache=True)
 def id_clusters(unique_idx_list):
     '''
     Constructs a Typed Dictionary from a tuple of arrays corresponding to unique cluster positions in a design matrix. Again, this indirectly assumes that the design matrix is lexsorted starting from the last column.
@@ -630,9 +625,4 @@ def id_clusters(unique_idx_list):
 
     return cluster_dict
 
-@nb.jit(nopython=True)
-def nb_tuple(a,b):
-    '''
-    Makes a tuple from two ints.
-    '''
-    return tuple((a,b))
+
 
diff --git a/hierarch/stats.py b/hierarch/stats.py
index 3059ec0..18e07b5 100644
--- a/hierarch/stats.py
+++ b/hierarch/stats.py
@@ -4,12 +4,10 @@
 
 
 
-def two_sample_test(data_array, treatment_col, teststat="welch", skip=[], bootstraps=500, permutations=100, seed=None):
+def two_sample_test(data_array, treatment_col, teststat="welch", skip=[], bootstraps=500, permutations=100, return_null=False, seed=None):
 
     data = np.copy(data_array)
-
-
     if data.dtype != 'float64':
         data[:,:-1] = internal_functions.label_encode(data[:,:-1])
         data = data.astype('float64')
@@ -64,8 +62,10 @@ def two_sample_test(data_array, treatment_col, teststat="welch", skip=[], bootst
 
     pval = np.where((np.array(np.abs(means)) >= truediff))[0].size / len(means)
 
-
-    return pval
+    if return_null:
+        return pval, means
+    else:
+        return pval
 
 
 def two_sample_test_jackknife(data, treatment_col, permutations='all', teststat='welch'):
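
For reference, the arithmetic in welch_statistic above is the standard Welch formula: the difference in sample means divided by the square root of the summed per-sample terms s^2/n, where s^2 is the unbiased sample variance. A standalone NumPy sketch of the same computation, with made-up sample values (not part of the patch):

import numpy as np

# two small, made-up samples
sample_a = np.array([5.2, 4.9, 5.0])
sample_b = np.array([6.1, 6.4, 6.0])

meandiff = sample_a.mean() - sample_b.mean()

# np.var is the population variance; multiplying by n / (n - 1) gives the
# unbiased sample variance, which is then divided by the sample size,
# mirroring the var_weight_one / var_weight_two terms in the patch
var_weight_a = np.var(sample_a) * (sample_a.size / (sample_a.size - 1)) / sample_a.size
var_weight_b = np.var(sample_b) * (sample_b.size / (sample_b.size - 1)) / sample_b.size

t = meandiff / np.sqrt(var_weight_a + var_weight_b)
print(t)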
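
A hypothetical call of the updated two_sample_test, showing the new return_null flag. The column layout (treatment label, replicate label, measurement in the last column) and all values are illustrative assumptions, not taken from the patch:

import numpy as np
from hierarch.stats import two_sample_test

# illustrative two-treatment design: treatment label, replicate label, measurement
data = np.array([
    [1, 1, 5.2],
    [1, 2, 4.9],
    [1, 3, 5.0],
    [2, 1, 6.1],
    [2, 2, 6.4],
    [2, 3, 6.0],
])

# return_null=True (added in this patch) also returns the permutation null
# distribution of the test statistic alongside the p-value
pval, null_dist = two_sample_test(data, treatment_col=0, bootstraps=100,
                                  permutations=50, return_null=True, seed=1)
print(pval, len(null_dist))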