diff --git a/hierarch/__init__.py b/hierarch/__init__.py
index dad7131..749dbc8 100644
--- a/hierarch/__init__.py
+++ b/hierarch/__init__.py
@@ -1,3 +1,4 @@
 import hierarch.stats
 import hierarch.power
-from hierarch.internal_functions import bootstrap_sample
+import hierarch.resampling
+
diff --git a/hierarch/internal_functions.py b/hierarch/internal_functions.py
index 3855378..c2cb6cd 100644
--- a/hierarch/internal_functions.py
+++ b/hierarch/internal_functions.py
@@ -2,6 +2,7 @@
 import numba as nb
 import sympy.utilities.iterables as iterables
 import hierarch.numba_overloads
+import pandas as pd
 
 @nb.jit(nopython=True, cache=True)
 def nb_tuple(a,b):
@@ -293,51 +294,6 @@ def nb_reindexer(resampled_idx, data, columns_to_resample, cluster_dict, randnos
     resampled = data[resampled_idx]
     return resampled
 
-def permute_column(data, col_to_permute=-2, iterator=None):
-
-    """
-    This function takes column n and permutes the column n - 1 while accounting for the clustering in column n - 2. This function is memoized based on the hash of the data and col_to_permute variable, which improves performance significantly.
-
-    Parameters
-    ----------
-    data: arrays
-        The numpy array that contains the data of interest.
-
-    col_to_permute: int
-        Column n, which is immediately right of the column that will be shuffled.
-
-    iterator: 1D array
-        An iterator representing an instance of the multiset of permutations of column n - 1. In very small samples (n = 3 or 4), it is worthwhile to iterate through every permutation rather than hoping to randomly sample all of them. To use this, construct a multiset of permutations and iterate through using a for loop.
-
-    Returns
-    ---------
-    permuted: an array the same size as data with column n - 1 permuted within column n - 2's clusters (if there are any).
-
-    """
-
-    key = hash(data[:,col_to_permute-1:col_to_permute+1].tobytes())
-
-    try:
-        values, indexes, counts = permute_column.__dict__[key]
-    except:
-        values, indexes, counts = np.unique(data[:,:col_to_permute+1],return_index=True, return_counts=True,axis=0)
-        permute_column.__dict__[key] = values, indexes, counts
-        if len(permute_column.__dict__.keys()) > 50:
-            permute_column.__dict__.pop(list(permute_column.__dict__)[0])
-
-
-    if iterator is not None:
-        return iter_return(data, col_to_permute, tuple(iterator), counts)
-
-    else:
-        shuffled_col_values = data[:,col_to_permute-1][indexes]
-        try:
-            keys = unique_idx_w_cache(values)[-2]
-        except:
-            keys = unique_idx_w_cache(values)[-1]
-        return randomized_return(data, col_to_permute, shuffled_col_values, keys, counts)
-
-
 def mean_agg(data, ref='None', groupby=-3):
 
@@ -384,77 +340,6 @@ def mean_agg(data, ref='None', groupby=-3):
 
     return data_agg
 
-
-
-
-def bootstrap_sample(data, start=0, data_col=-1, skip=[], seed=None):
-
-    '''
-    Performs a numba-accelerated multi-level bootstrap of input data.
-
-    Parameters
-    ----------
-    data: 2D array
-        The input array containing your data in the final (or more) columns and categorical variables classifying the data in every prior column. Each column is resampled based on the column prior, so make sure your column ordering reflects that.
-
-    start: int
-        This is the first column corresponding to a level that you want to resample. Note: this column won't be resampled, but the next one will be resampled based on this column.
-
-    data_col: int
-        This is the first column that has your measured values (and therefore shouldn't be resampled). Default assumes you have a single column of measured values.
-
-    skip: list of ints
-        Column indices provided here will be sampled WITHOUT replacement. Ideally, you should skip columns that do not represent randomly sampled data (i.e. rather than having a random sample from that level, you have all the data).
-
-    seed: int or numpy.random.Generator object
-        Enables seeding of the random resampling for reproducibility. The function runs much faster in a loop if it does not have to initialize a generator every time, so passing it something is good for performance.
-
-    Returns
-    ----------
-    resampled: 2D array
-        Data array the same number of columns as data, might be longer or shorter if your experimental data is imbalanced.
-
-
-    '''
-    data_key = hash(data[:,start:data_col].tobytes())
-
-    unique_idx_list = unique_idx_w_cache(data)
-
-    ### check if we've cached the cluster_dict
-    try:
-        cluster_dict = bootstrap_sample.__dict__[data_key]
-    ### if we haven't generate it and cache it
-    except:
-        cluster_dict = id_clusters(tuple(unique_idx_list))
-        bootstrap_sample.__dict__[data_key] = cluster_dict
-
-    ###seedable for reproducibility
-    rng = np.random.default_rng(seed)
-    ###generating a long list of random ints is cheaper than making a bunch of short lists. we know we'll never need more random numbers than the size of the design matrix, so make exactly that many random ints.
-    randnos = rng.integers(low=2**32,size=data[:,:data_col].size)
-
-
-    ###figure out the bounds of the design matrix
-    if data_col < 0:
-        shape = data.shape[1] + data_col
-    else:
-        shape = data_col - 1
-
-    ###resample these columns
-    columns_to_resample = np.array([True for k in range(shape)])
-
-    ###figure out if we need to skip resampling any columns
-    for key in skip:
-        columns_to_resample[key] = False
-
-    ###initialize the indices of clusters in the last column that isn't resampled
-    resampled_idx = unique_idx_list[start]
-
-    ###generate the bootstrapped sample
-    resampled = nb_reindexer(resampled_idx, data, columns_to_resample, cluster_dict, randnos, start, shape)
-
-    return resampled
-
 def msp(items):
 
     '''Yield the permutations of `items` where items is either a list
@@ -545,24 +430,6 @@ def unique_idx_w_cache(data):
 
     return unique_lists
 
-def label_encode(a):
-    '''
-    Performs label encoding on an array.
-
-    Parameters
-    ----------
-
-    a: array
-
-    Returns
-    ----------
-    a: float64 array
-
-    '''
-    for i in range(a.shape[1]):
-        a[:,i] = np.unique(a[:,i], return_inverse=True)[1] + 1
-    return a
-
 @nb.jit(nopython=True, cache=True)
 def make_ufunc_list(target, ref):
     '''
@@ -624,5 +491,148 @@ def id_clusters(unique_idx_list):
             cluster_dict[nb_tuple(unique_idx_list[i-1][j], i)] = value
     return cluster_dict
 
+def preprocess_data(data):
+    '''
+    Performs label encoding on non-numeric columns without overwriting numerical variables.
+
+    Parameters
+    ----------
+    data: 2D array or pandas DataFrame
+        Data to be encoded.
+
+    Returns
+    ----------
+    encoded: 2D array of float64s
+        A copy of the array underlying data in which every column that cannot be cast to np.float64 has its values replaced with integer labels. Rows are sorted (and deduplicated) by np.unique.
+
+    '''
+    if isinstance(data, np.ndarray):
+        encoded = data.copy()
+    elif isinstance(data, pd.DataFrame):
+        encoded = data.to_numpy()
+    else:
+        raise TypeError("data must be a 2D numpy array or pandas DataFrame")
+    for idx, v in enumerate(encoded.T):
+        try:
+            #if the whole array casts cleanly, no more columns need encoding
+            encoded = encoded.astype(np.float64)
+            break
+        except (ValueError, TypeError):
+            try:
+                #this column is already numeric; cast it and move on
+                encoded[:,idx] = encoded[:,idx].astype(np.float64)
+            except (ValueError, TypeError):
+                #this column is categorical; replace its values with integer labels
+                encoded[:,idx] = np.unique(v, return_inverse=True)[1]
+    #sort the array by row (np.unique also drops duplicate rows) - downstream functions rely on this ordering
+    encoded = np.unique(encoded, axis=0)
+    return encoded
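Note (illustrative sketch, not part of the diff): preprocess_data leaves numeric columns alone and label-encodes the rest, so a mixed-type DataFrame comes out as a sorted float64 array. The example frame below is invented:

    >>> import numpy as np
    >>> import pandas as pd
    >>> from hierarch.internal_functions import preprocess_data
    >>> df = pd.DataFrame({"treatment": ["ctrl", "drug", "ctrl", "drug"],
    ...                    "replicate": [1, 2, 1, 2],
    ...                    "value": [0.5, 1.2, 0.7, 1.4]})
    >>> encoded = preprocess_data(df)   # "ctrl"/"drug" become 0/1; rows are sorted
    >>> encoded.dtype
    dtype('float64')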
diff --git a/hierarch/resampling.py b/hierarch/resampling.py
new file mode 100644
index 0000000..9886e5b
--- /dev/null
+++ b/hierarch/resampling.py
@@ -0,0 +1,92 @@
+import numpy as np
+import hierarch.internal_functions as internal_functions
+from itertools import cycle
+
+class Bootstrapper:
+    '''
+    Nested bootstrapping class.
+
+    Methods
+    -------
+    fit(data, skip=[], y=-1)
+        Fit the bootstrapper to data.
+
+        skip: list of ints. Columns to skip in the bootstrap. Skip columns that were sampled without replacement from the prior column.
+
+        y: int. Index of the column containing the dependent variable. This parameter exists mostly to make the user aware of it: many functions break if the dependent variable column sits in the middle of the array.
+
+    transform(data, start=0)
+        Generate a bootstrapped sample from data. start is the last column in the array that should not be resampled.
+
+    '''
+    def __init__(self, random_state=None):
+        self.random_generator = np.random.default_rng(random_state)
+
+    def fit(self, data, skip=[], y=-1):
+        self.unique_idx_list = internal_functions.unique_idx_w_cache(data)
+        self.cluster_dict = internal_functions.id_clusters(tuple(self.unique_idx_list))
+        #determine the width of the design matrix (everything left of the dependent variable)
+        if y < 0:
+            self.shape = data.shape[1] + y
+        else:
+            self.shape = y - 1
+        #mark every design column for resampling except those listed in skip
+        self.columns_to_resample = np.ones(self.shape, dtype=bool)
+        for key in skip:
+            self.columns_to_resample[key] = False
+
+    def transform(self, data, start=0):
+        resampled_idx = self.unique_idx_list[start]
+        #with high=None, numpy treats low as the exclusive upper bound, so this draws from [0, 2**32)
+        randnos = self.random_generator.integers(low=2**32, size=data.size)
+        resampled = internal_functions.nb_reindexer(resampled_idx, data,
+                                                    self.columns_to_resample,
+                                                    self.cluster_dict, randnos,
+                                                    start, self.shape)
+        return resampled
+
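Note (usage sketch, not part of the diff; the design matrix below is invented): splitting the old bootstrap_sample into fit/transform means the cluster dictionary is computed once and reused across every bootstrap draw:

    >>> import numpy as np
    >>> from hierarch.resampling import Bootstrapper
    >>> # columns: treatment, sample, replicate, measurement
    >>> data = np.array([[t, s, r, 0.0] for t in (1.0, 2.0)
    ...                  for s in (1.0, 2.0, 3.0)
    ...                  for r in (1.0, 2.0)])
    >>> boot = Bootstrapper(random_state=42)
    >>> boot.fit(data)                            # y=-1: last column holds the measurements
    >>> resample = boot.transform(data, start=1)  # resample levels below the treatment column

This is the pattern two_sample_test now follows: one fit outside the bootstrap loop, one transform per iteration.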
+class Permuter:
+
+    '''
+    Cluster permutation class.
+
+    Methods
+    -------
+    fit(data, col_to_permute, exact=False)
+        Fit the permuter to data.
+
+        col_to_permute: int. Index of the column immediately to the right of the column that will be shuffled - i.e. the column that identifies the treated samples.
+
+        exact: bool. If True, constructs a generator that iterates deterministically through every possible permutation. Note: this only functions properly if there are no clusters in the shuffled column; cluster support may be implemented at a later date. Exact iteration is typically only worthwhile for very small samples.
+
+    transform(data)
+        Generate a permuted sample. If exact is True, permutations are always generated in a deterministic order; otherwise they are random.
+
+    '''
+
+    def __init__(self):
+        pass
+
+    def fit(self, data, col_to_permute, exact=False):
+        self.values, self.indexes, self.counts = np.unique(data[:,:col_to_permute+1],
+                                                           return_index=True,
+                                                           return_counts=True,
+                                                           axis=0)
+        self.col_to_permute = col_to_permute
+        self.exact = exact
+
+        #use the second-to-last level of cluster keys; fall back to the last level if there is only one
+        try:
+            self.keys = internal_functions.unique_idx_w_cache(self.values)[-2]
+        except IndexError:
+            self.keys = internal_functions.unique_idx_w_cache(self.values)[-1]
+
+        if self.exact:
+            #cycle through the multiset permutations of the column in a fixed order
+            col_values = self.values[:,-2].copy()
+            self.iterator = cycle(internal_functions.msp(col_values))
+
+        else:
+            self.shuffled_col_values = data[:,self.col_to_permute-1][self.indexes]
+
+    def transform(self, data):
+        if self.exact:
+            next_iter = next(self.iterator)
+            return internal_functions.iter_return(data, self.col_to_permute, tuple(next_iter), self.counts)
+        else:
+            return internal_functions.randomized_return(data, self.col_to_permute, self.shuffled_col_values, self.keys, self.counts)
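Note (usage sketch, not part of the diff; `test` stands in for the aggregated array that two_sample_test builds below): Permuter follows the same fit/transform split, so np.unique runs once at fit time instead of once per permutation:

    >>> from hierarch.resampling import Permuter
    >>> permuter = Permuter()
    >>> permuter.fit(test, col_to_permute=1)              # randomized permutations
    >>> shuffled = permuter.transform(test)
    >>> permuter.fit(test, col_to_permute=1, exact=True)  # deterministic cycle instead
    >>> shuffled = permuter.transform(test)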
diff --git a/hierarch/stats.py b/hierarch/stats.py
index 18e07b5..6121db8 100644
--- a/hierarch/stats.py
+++ b/hierarch/stats.py
@@ -1,69 +1,116 @@
 import numpy as np
+import math
 import hierarch.internal_functions as internal_functions
+import hierarch.resampling as resampling
 import sympy.utilities.iterables as iterables
 
-def two_sample_test(data_array, treatment_col, teststat="welch", skip=[], bootstraps=500, permutations=100, return_null=False, seed=None):
+def two_sample_test(data_array, treatment_col, teststat="welch", skip=[], bootstraps=100, permutations=1000, return_null=False, seed=None):
 
-    data = np.copy(data_array)
+    '''
+    Two-tailed two-sample hierarchical permutation test.
 
-    if data.dtype != 'float64':
-        data[:,:-1] = internal_functions.label_encode(data[:,:-1])
-        data = data.astype('float64')
-    data = np.unique(data, axis=0) ###sorts the data matrix by row. 100% necessary.
-
+    Parameters
+    -----------
+    data_array: 2D array or pandas DataFrame
+        Array-like containing both the independent and dependent variables to be analyzed. The final (rightmost) column is assumed to contain the dependent variable values.
+
+    treatment_col: int
+        Index of the column containing the two samples to be compared. Indexing starts at 0.
+
+    teststat: function or string
+        The test statistic used to perform the hypothesis test. "welch" automatically calls the Welch t-statistic for a difference-of-means test.
+
+    skip: list of ints
+        Columns to skip in the bootstrap. Skip columns that were sampled without replacement from the prior column.
+
+    bootstraps: int
+        Number of bootstraps to perform.
+
+    permutations: int or "all"
+        Number of permutations to perform PER bootstrap sample. "all" performs an exact test.
+
+    return_null: bool
+        Set to True to return the null distribution as well as the p-value.
+
+    seed: int or numpy random Generator
+        Seedable for reproducibility.
+
+    Returns
+    ---------
+    pval: float64
+        Two-tailed p-value.
+
+    null_distribution: list of floats
+        The empirical null distribution used to calculate the p-value.
+
+    '''
+
+    #turn the input array or dataframe into a float64 array
+    data = internal_functions.preprocess_data(data_array)
+
+    #set random state
+    rng = np.random.default_rng(seed)
+
+    #initialize and fit the bootstrapper to the data
+    bootstrapper = resampling.Bootstrapper(random_state=rng)
+    bootstrapper.fit(data, skip)
+
+    #gather labels
     treatment_labels = np.unique(data[:,treatment_col])
 
+    #raise an exception if there are not exactly two treatment labels
     if treatment_labels.size != 2:
         raise Exception("Needs 2 samples.")
-
-    rng = np.random.default_rng(seed)
 
+    #shorthand for welch_statistic
     if teststat == "welch":
         teststat = internal_functions.welch_statistic
-
-    if permutations == "all":
-        #determine total number of possible level 0 permutations
-        indexes = np.unique(data[:,:treatment_col+2],return_index=True,axis=0)[1]
-        pre_col_values = data[:,treatment_col][indexes]
-        it_list = list(internal_functions.msp(pre_col_values))
-
+
+
+    #aggregate the data up to the treated level and compute the observed test statistic
     levels_to_agg = data.shape[1] - treatment_col - 3
     test = data
     for m in range(levels_to_agg):
-        test = internal_functions.mean_agg(test)
-
-    truediff = np.abs(teststat(test, treatment_col, treatment_labels))
-
+        test = internal_functions.mean_agg(test)
+    truediff = teststat(test, treatment_col, treatment_labels)
 
-    means = []
+    #initialize and fit the permuter to the aggregated data
+    permuter = resampling.Permuter()
+
+    if permutations == "all":
+        permuter.fit(test, treatment_col+1, exact=True)
+
+        #in the exact case, determine and set the total number of distinct permutations
+        counts = np.unique(test[:,0], return_counts=True)[1]
+        permutations = binomial(counts.sum(), counts[0])
+
+    else:
+        #just fit the permuter if this is a randomized test
+        permuter.fit(test, treatment_col+1)
+
+    #initialize an empty null distribution list
+    null_distribution = []
+
     for j in range(bootstraps):
-
-        #resample level 1 data from level 2s, using our generator for reproducible rng
-
-        bootstrapped_sample = internal_functions.bootstrap_sample(data, start=treatment_col+1, skip=skip, seed=rng)
+        #generate a bootstrapped sample and aggregate it up to the treated level
+        bootstrapped_sample = bootstrapper.transform(data, start=treatment_col+1)
         for m in range(levels_to_agg):
             bootstrapped_sample = internal_functions.mean_agg(bootstrapped_sample, data)
-
-
-
-
-        if permutations == "all":
-            #we are sampling all 20 permutations, so no need for rng.
-            for k in it_list:
-                permute_resample = internal_functions.permute_column(bootstrapped_sample, treatment_col+1, k)
-                means.append(teststat(permute_resample, treatment_col, treatment_labels))
-        else:
-            for k in range(permutations):
-                permute_resample = internal_functions.permute_column(bootstrapped_sample, treatment_col+1)
-                means.append(teststat(permute_resample, treatment_col, treatment_labels))
+        #generate permuted samples, calculate the test statistic, and append it to the null distribution
+        for k in range(permutations):
+            permute_resample = permuter.transform(bootstrapped_sample)
+            null_distribution.append(teststat(permute_resample, treatment_col, treatment_labels))
 
-    pval = np.where((np.array(np.abs(means)) >= truediff))[0].size / len(means)
-
+    #two-tailed test: count how often the absolute value of the null distribution meets or exceeds the absolute value of the observed difference
+    pval = np.where((np.array(np.abs(null_distribution)) >= np.abs(truediff)))[0].size / len(null_distribution)
+
+
     if return_null==True:
-        return pval, means
+        return pval, null_distribution
+
     else:
         return pval
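Note (illustrative call, not part of the diff; assumes a data table laid out as in the docstring, e.g. treatment / sample / replicate / measurement):

    >>> import hierarch
    >>> pval = hierarch.stats.two_sample_test(data, treatment_col=0,
    ...                                       bootstraps=100, permutations=1000,
    ...                                       seed=12345)

For the exact branch, binomial(counts.sum(), counts[0]) counts the distinct ways to assign the two treatment labels: with 3 samples per group that is binomial(6, 3) = 20, which matches the "all 20 permutations" comment in the deleted code.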
@@ -98,4 +145,10 @@ def two_sample_test_jackknife(data, treatment_col, permutations='all', teststat=
 
     pval = np.where((np.array(np.abs(means)) >= np.abs(truediff)))[0].size / len(means)
 
-    return pval
\ No newline at end of file
+    return pval
+
+def binomial(x, y):
+    #binomial coefficient C(x, y), computed with exact integer arithmetic
+    try:
+        return math.factorial(x) // math.factorial(y) // math.factorial(x - y)
+    except ValueError:
+        #math.factorial raises ValueError for negative arguments; treat those as zero ways
+        return 0
\ No newline at end of file
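Note (sketch, not part of the diff): the binomial helper is plain integer arithmetic; on Python 3.8+ math.comb returns the same value for non-negative arguments, while the helper's except clause turns the ValueError raised on negative inputs into 0:

    >>> import math
    >>> math.factorial(6) // math.factorial(3) // math.factorial(3)
    20
    >>> math.comb(6, 3)
    20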