
Commit

created hierarch.resampling module to contain Bootstrapper and Permuter classes
rishi-kulkarni committed May 4, 2021
1 parent 3cd6b49 commit a2f13d7
Showing 4 changed files with 333 additions and 177 deletions.
3 changes: 2 additions & 1 deletion hierarch/__init__.py
@@ -1,3 +1,4 @@
import hierarch.stats
import hierarch.power
from hierarch.internal_functions import bootstrap_sample
import hierarch.resampling
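
The new module registered here is intended to expose the Bootstrapper and Permuter classes named in the commit message. A minimal sketch of the intended import path (the class names are taken from the commit message; their constructor signatures are not shown in this diff):

from hierarch.resampling import Bootstrapper, Permuter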

278 changes: 144 additions & 134 deletions hierarch/internal_functions.py
@@ -2,6 +2,7 @@
import numba as nb
import sympy.utilities.iterables as iterables
import hierarch.numba_overloads
import pandas as pd

@nb.jit(nopython=True, cache=True)
def nb_tuple(a,b):
@@ -293,51 +294,6 @@ def nb_reindexer(resampled_idx, data, columns_to_resample, cluster_dict, randnos
resampled = data[resampled_idx]
return resampled

def permute_column(data, col_to_permute=-2, iterator=None):

"""
This function takes column n and permutes column n - 1 while accounting for the clustering in column n - 2. It is memoized based on the hash of the data and the col_to_permute variable, which significantly improves performance.
Parameters
----------
data: 2D array
The numpy array that contains the data of interest.
col_to_permute: int
Column n, which is immediately to the right of the column that will be shuffled.
iterator: 1D array
An iterator representing an instance of the multiset of permutations of column n - 1. In very small samples (n = 3 or 4), it is worthwhile to iterate through every permutation rather than hoping to randomly sample all of them. To use this, construct a multiset of permutations and iterate through using a for loop.
Returns
---------
permuted: an array the same size as data with column n - 1 permuted within column n - 2's clusters (if there are any).
"""

key = hash(data[:,col_to_permute-1:col_to_permute+1].tobytes())

try:
values, indexes, counts = permute_column.__dict__[key]
except:
values, indexes, counts = np.unique(data[:,:col_to_permute+1],return_index=True, return_counts=True,axis=0)
permute_column.__dict__[key] = values, indexes, counts
if len(permute_column.__dict__.keys()) > 50:
permute_column.__dict__.pop(list(permute_column.__dict__)[0])


if iterator is not None:
return iter_return(data, col_to_permute, tuple(iterator), counts)

else:
shuffled_col_values = data[:,col_to_permute-1][indexes]
try:
keys = unique_idx_w_cache(values)[-2]
except:
keys = unique_idx_w_cache(values)[-1]
return randomized_return(data, col_to_permute, shuffled_col_values, keys, counts)
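
As the docstring above suggests, in very small samples it can be preferable to iterate over the full multiset of permutations rather than sampling randomly. A minimal sketch of that pattern, using the msp generator defined later in this file (the data array and column choice here are invented for illustration):

import numpy as np
from hierarch.internal_functions import msp, permute_column

### Toy design matrix: column 0 = treatment, column 1 = cluster, column 2 = measurement.
data = np.array([[1., 1., 2.3],
                 [1., 2., 2.9],
                 [2., 3., 3.1],
                 [2., 4., 3.8]])

### Exhaustively enumerate the multiset permutations of the column to be
### shuffled, feeding each one to permute_column via the iterator argument.
for perm in msp(list(data[:, 0])):
    permuted = permute_column(data, col_to_permute=1, iterator=perm)
    ### ...compute a test statistic on permuted here...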



def mean_agg(data, ref='None', groupby=-3):

@@ -384,77 +340,6 @@ def mean_agg(data, ref='None', groupby=-3):


return data_agg



def bootstrap_sample(data, start=0, data_col=-1, skip=[], seed=None):

'''
Performs a numba-accelerated multi-level bootstrap of input data.
Parameters
----------
data: 2D array
The input array, with measured data in the final column (or columns) and categorical variables classifying the data in every prior column. Each column is resampled based on the column before it, so make sure your column ordering reflects the nesting of your design.
start: int
This is the first column corresponding to a level that you want to resample. Note: this column won't be resampled, but the next one will be resampled based on this column.
data_col: int
This is the first column that has your measured values (and therefore shouldn't be resampled). Default assumes you have a single column of measured values.
skip: list of ints
Column indices provided here will be sampled WITHOUT replacement. Ideally, you should skip columns that do not represent randomly sampled data (i.e. rather than having a random sample from that level, you have all the data).
seed: int or numpy.random.Generator object
Enables seeding of the random resampling for reproducibility. The function runs much faster in a loop if it does not have to initialize a generator every time, so passing a pre-initialized Generator is good for performance.
Returns
----------
resampled: 2D array
Data array with the same number of columns as data; it may have more or fewer rows than the original if your experimental design is unbalanced.
'''
data_key = hash(data[:,start:data_col].tobytes())

unique_idx_list = unique_idx_w_cache(data)

### check if we've cached the cluster_dict
try:
cluster_dict = bootstrap_sample.__dict__[data_key]
### if we haven't, generate it and cache it
except:
cluster_dict = id_clusters(tuple(unique_idx_list))
bootstrap_sample.__dict__[data_key] = cluster_dict

###seedable for reproducibility
rng = np.random.default_rng(seed)
###generating a long list of random ints is cheaper than making a bunch of short lists. we know we'll never need more random numbers than the size of the design matrix, so make exactly that many random ints.
randnos = rng.integers(low=2**32,size=data[:,:data_col].size)


###figure out the bounds of the design matrix
if data_col < 0:
shape = data.shape[1] + data_col
else:
shape = data_col - 1

###resample these columns
columns_to_resample = np.array([True for k in range(shape)])

###figure out if we need to skip resampling any columns
for key in skip:
columns_to_resample[key] = False

###initialize the indices of clusters in the last column that isn't resampled
resampled_idx = unique_idx_list[start]

###generate the bootstrapped sample
resampled = nb_reindexer(resampled_idx, data, columns_to_resample, cluster_dict, randnos, start, shape)

return resampled
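
A usage sketch grounded in the docstring above: seed accepts either an int or a numpy.random.Generator, and reusing a single Generator across a loop avoids re-initializing one on every call (the array here is invented for illustration):

import numpy as np
from hierarch.internal_functions import bootstrap_sample

### Toy two-level design: column 0 = treatment, column 1 = sample, column 2 = measurement.
data = np.array([[1., 1., 5.2],
                 [1., 1., 4.9],
                 [1., 2., 5.7],
                 [2., 3., 6.1],
                 [2., 3., 6.4],
                 [2., 4., 6.0]])

### Reuse one Generator for the whole loop rather than re-seeding each call.
rng = np.random.default_rng(42)
boots = [bootstrap_sample(data, start=0, data_col=-1, seed=rng) for _ in range(100)]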


def msp(items):
'''Yield the permutations of `items` where items is either a list
@@ -545,24 +430,6 @@ def unique_idx_w_cache(data):

return unique_lists

def label_encode(a):
'''
Performs label encoding on an array.
Parameters
----------
a: 2D array
Array of labels to encode, column by column.
Returns
----------
a: array
The input array, modified in place, with each column replaced by 1-indexed integer codes.
'''
for i in range(a.shape[1]):
a[:,i] = np.unique(a[:,i], return_inverse=True)[1] + 1
return a
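
For reference, the per-column transformation label_encode applies is np.unique with return_inverse, shifted to start at 1 (values invented):

import numpy as np

a = np.array([['a', 'x'],
              ['b', 'x'],
              ['a', 'y']], dtype=object)
### return_inverse gives 0-based codes; label_encode adds 1 so codes start at 1.
codes = np.unique(a[:, 0], return_inverse=True)[1] + 1  # array([1, 2, 1])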

@nb.jit(nopython=True, cache=True)
def make_ufunc_list(target, ref):
'''
@@ -624,5 +491,148 @@ def id_clusters(unique_idx_list):
cluster_dict[nb_tuple(unique_idx_list[i-1][j], i)] = value
return cluster_dict

def preprocess_data(data):
'''
Performs label encoding without overwriting numerical variables.
Parameters
----------
data: 2D array or pandas DataFrame
Data to be encoded.
Returns
----------
encoded: 2D array of float64
The array underlying data, with every column that cannot be cast to np.float64 label-encoded as integer values.
'''
    if isinstance(data, np.ndarray):
        encoded = data.copy()
    elif isinstance(data, pd.DataFrame):
        encoded = data.to_numpy()
    else:
        raise TypeError("data must be a 2D numpy array or pandas DataFrame")
    for idx, v in enumerate(encoded.T):
        try:
            ### if the entire array casts cleanly, nothing more to encode
            encoded = encoded.astype(np.float64)
            break
        except (ValueError, TypeError):
            try:
                ### otherwise, try casting this column on its own
                encoded[:,idx] = encoded[:,idx].astype(np.float64)
            except (ValueError, TypeError):
                ### non-numeric column: label encode it
                encoded[:,idx] = np.unique(v, return_inverse=True)[1]
    ### np.unique also deduplicates and sorts the rows
    encoded = np.unique(encoded, axis=0)
    return encoded
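
A short example of the new preprocess_data on a mixed-type DataFrame (column names and values invented):

import pandas as pd
from hierarch.internal_functions import preprocess_data

df = pd.DataFrame({'treatment': ['drug', 'placebo', 'drug'],
                   'replicate': [1, 2, 3],
                   'measurement': [4.2, 3.9, 4.8]})

### 'treatment' cannot be cast to float64, so it is label-encoded; the numeric
### columns pass through. Note that np.unique(encoded, axis=0) also
### deduplicates and sorts the rows of the result.
encoded = preprocess_data(df)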

# def permute_column(data, col_to_permute=-2, iterator=None):

# """
# This function takes column n and permutes the column n - 1 while accounting for the clustering in column n - 2. This function is memoized based on the hash of the data and col_to_permute variable, which improves performance significantly.

# Parameters
# ----------
# data: arrays
# The numpy array that contains the data of interest.

# col_to_permute: int
# Column n, which is immediately right of the column that will be shuffled.

# iterator: 1D array
# An iterator representing an instance of the multiset of permutations of column n - 1. In very small samples (n = 3 or 4), it is worthwhile to iterate through every permutation rather than hoping to randomly sample all of them. To use this, construct a multiset of permutations and iterate through using a for loop.

# Returns
# ---------
# permuted: an array the same size as data with column n - 1 permuted within column n - 2's clusters (if there are any).

# """

# key = hash(data[:,col_to_permute-1:col_to_permute+1].tobytes())

# try:
# values, indexes, counts = permute_column.__dict__[key]
# except:
# values, indexes, counts = np.unique(data[:,:col_to_permute+1],return_index=True, return_counts=True,axis=0)
# permute_column.__dict__[key] = values, indexes, counts
# if len(permute_column.__dict__.keys()) > 50:
# permute_column.__dict__.pop(list(permute_column.__dict__)[0])


# if iterator is not None:
# return iter_return(data, col_to_permute, tuple(iterator), counts)

# else:
# shuffled_col_values = data[:,col_to_permute-1][indexes]
# try:
# keys = unique_idx_w_cache(values)[-2]
# except:
# keys = unique_idx_w_cache(values)[-1]
# return randomized_return(data, col_to_permute, shuffled_col_values, keys, counts)


# def bootstrap_sample(data, start=0, data_col=-1, skip=[], seed=None):

# '''
# Performs a numba-accelerated multi-level bootstrap of input data.

# Parameters
# ----------
# data: 2D array
# The input array containing your data in the final (or more) columns and categorical variables classifying the data in every prior column. Each column is resampled based on the column prior, so make sure your column ordering reflects that.

# start: int
# This is the first column corresponding to a level that you want to resample. Note: this column won't be resampled, but the next one will be resampled based on this column.

# data_col: int
# This is the first column that has your measured values (and therefore shouldn't be resampled). Default assumes you have a single column of measured values.

# skip: list of ints
# Column indices provided here will be sampled WITHOUT replacement. Ideally, you should skip columns that do not represent randomly sampled data (i.e. rather than having a random sample from that level, you have all the data).

# seed: int or numpy.random.Generator object
# Enables seeding of the random resampling for reproducibility. The function runs much faster in a loop if it does not have to initialize a generator every time, so passing it something is good for performance.

# Returns
# ----------
# resampled: 2D array
# Data array the same number of columns as data, might be longer or shorter if your experimental data is imbalanced.


# '''
# data_key = hash(data[:,start:data_col].tobytes())

# unique_idx_list = unique_idx_w_cache(data)

# ### check if we've cached the cluster_dict
# try:
# cluster_dict = bootstrap_sample.__dict__[data_key]
# ### if we haven't, generate it and cache it
# except:
# cluster_dict = id_clusters(tuple(unique_idx_list))
# bootstrap_sample.__dict__[data_key] = cluster_dict

# ###seedable for reproducibility
# rng = np.random.default_rng(seed)
# ###generating a long list of random ints is cheaper than making a bunch of short lists. we know we'll never need more random numbers than the size of the design matrix, so make exactly that many random ints.
# randnos = rng.integers(low=2**32,size=data[:,:data_col].size)


# ###figure out the bounds of the design matrix
# if data_col < 0:
# shape = data.shape[1] + data_col
# else:
# shape = data_col - 1

# ###resample these columns
# columns_to_resample = np.array([True for k in range(shape)])

# ###figure out if we need to skip resampling any columns
# for key in skip:
# columns_to_resample[key] = False

# ###initialize the indices of clusters in the last column that isn't resampled
# resampled_idx = unique_idx_list[start]

# ###generate the bootstrapped sample
# resampled = nb_reindexer(resampled_idx, data, columns_to_resample, cluster_dict, randnos, start, shape)

# return resampled


