Skip to content

Commit

Permalink
Merge pull request #31 from rishi-kulkarni/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
rishi-kulkarni authored Jul 7, 2021
2 parents 61c48de + 12ae78a commit 5092391
Show file tree
Hide file tree
Showing 8 changed files with 247 additions and 242 deletions.
13 changes: 11 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

## A Hierarchical Resampling Package for Python

Version 1.1.2
Version 1.1.3

hierarch is a package for hierarchical resampling (bootstrapping, permutation) of datasets in Python. Because for loops are ultimately intrinsic to cluster-aware resampling, hierarch uses Numba to accelerate many of its key functions.

Expand All @@ -13,6 +13,7 @@ hierarch has several functions to assist in performing resampling-based (and the
1. [Introduction](#introduction)
2. [Setup](#setup)
3. [Documentation](#documentation)
4. [Citation](#citation)


<a name="introduction"></a>
Expand Down Expand Up @@ -50,4 +51,12 @@ Alternatively, you can install from Anaconda.

<a name="documentation"></a>
## Documentation
Check out our user guide at [readthedocs](https://hierarch.readthedocs.io/).
Check out our user guide at [readthedocs](https://hierarch.readthedocs.io/).

<a name="citation"></a>
## Citation
If hierarch helps you analyze your data, please consider citing it. The manuscript also
contains a set of simulations validating hierarchical randomization tests in a variety of
conditions.

Analyzing Nested Experimental Designs – A User-Friendly Resampling Method to Determine Experimental Significance. Rishikesh U. Kulkarni, Catherine L. Wang, Carolyn R. Bertozzi. bioRxiv 2021.06.29.450439; doi: https://doi.org/10.1101/2021.06.29.450439
8 changes: 4 additions & 4 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
sphinx==4.0.2
sphinx==4.0.3
sphinx_rtd_theme==0.5.2
numpydoc==1.1.0
numba==0.53.1
numpy==1.20.3
pandas==1.2.4
scipy==1.6.3
numpy==1.21.0
pandas==1.3.0
scipy==1.7.0
8 changes: 7 additions & 1 deletion docs/user/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,10 @@ The code to perform a hierarchical permutation t-test on this dataset looks like
from hierarch.stats import hypothesis_test

hypothesis_test(data, treatment_col=0,
bootstraps=1000, permutations='all')
bootstraps=1000, permutations='all')

If you find hierarch useful for analyzing your data, please consider citing it.

Analyzing Nested Experimental Designs – A User-Friendly Resampling Method to Determine Experimental Significance
Rishikesh U. Kulkarni, Catherine L. Wang, Carolyn R. Bertozzi
bioRxiv 2021.06.29.450439; doi: https://doi.org/10.1101/2021.06.29.450439
138 changes: 3 additions & 135 deletions hierarch/internal_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def bounded_uint(ub):
Returns
-------
int
"""
"""
x = np.random.randint(low=2 ** 32)
m = ub * x
l = np.uint32(m)
Expand All @@ -193,7 +193,7 @@ def bounded_uint(ub):
@nb.jit(nopython=True, cache=True)
def nb_fast_shuffle(arr):
"""Reimplementation of Fisher-Yates shuffle using bounded_uint to generate random numbers.
"""
"""
i = arr.shape[0] - 1
while i > 0:
j = bounded_uint(i + 1)
Expand All @@ -211,7 +211,7 @@ def nb_strat_shuffle(arr, stratification):
Target array.
stratification : 1D array-like
Ranges to shuffle within. Must be sorted.
"""
"""
for v, w in zip(stratification[:-1], stratification[1:]):
i = w - v - 1
while i > 0:
Expand Down Expand Up @@ -248,137 +248,6 @@ def id_cluster_counts(design):
return cluster_dict


@nb.jit(nopython=True, cache=True)
def nb_reweighter(
data, columns_to_resample, clusternum_dict, start: int, shape: int, indexes=True
):
"""Internal function for bootstrapping a design matrix with integer
weights.
Parameters
----------
data : 2D array
Target data to be bootstrapped.
columns_to_resample : 1D bool array-like
False for columns to be skipped in the resampling plan.
clusternum_dict : TypedDict
Hierarchy dictionary produced by id_cluster_counts
start : int
First column of the data matrix to resample
shape : int
Last column of the data matrix to resample
indexes : bool, optional
If True, returns a reindexed array. If False, returns
a reweighted array, by default True.
Returns
-------
2D array
Nonparametrically bootstrapped sample from the input data array.
"""

out = data.astype(np.float64)
# at the start, everything is weighted equally
weights = np.array([1 for i in clusternum_dict[start]])

for key in range(start, shape):
# fetch design matrix info for current column
to_do = clusternum_dict[key]
# preallocate the full array for new_weight
new_weight = np.empty(to_do.sum(), np.int64)
place = 0

# if not resampling this column, new_weight is all 1
if not columns_to_resample[key]:
for idx, v in enumerate(to_do):
num = np.array([1 for m in range(v.item())])
# carry over resampled weights from previous columns
num *= weights[idx]
for idx_2, w in enumerate(num):
new_weight[place + idx_2] = w.item()
place += v

# else do a multinomial experiment to generate new_weight
else:
for idx, v in enumerate(to_do):
num = v.item()
# num*weights[idx] carries over weights from previous columns
randos = np.random.multinomial(num * weights[idx], [1 / num] * num)
for idx_2, w in enumerate(randos):
new_weight[place + idx_2] = w.item()
place += v

weights = new_weight

if indexes is False:
out[:, -1] = out[:, -1] * weights
return out
else:
indexes = weights_to_index(weights)
return out[indexes]


@nb.jit(nopython=True, cache=True)
def nb_reweighter_real(data, columns_to_resample, clusternum_dict, start, shape):
"""Internal function for bootstrapping a design matrix with real number
weights.
Parameters
----------
data : 2D array
Target data to be bootstrapped.
columns_to_resample : 1D bool array-like
False for columns to be skipped in the resampling plan.
clusternum_dict : TypedDict
Hierarchy dictionary produced by id_cluster_counts
start : int
First column of the data matrix to resample
shape : int
Last column of the data matrix to resample
Returns
-------
2D array
Nonparametrically bootstrapped sample from the input data array.
"""

out = data.astype(np.float64)
# at the start, everything is weighted equally
# dype is float64 because weights can be any real number
weights = np.array([1 for i in clusternum_dict[start]], dtype=np.float64)

for key in range(start, shape):
# fetch design matrix info for current column
to_do = clusternum_dict[key]
# preallocate the full array for new_weight
new_weight = np.empty(to_do.sum(), np.float64)
place = 0

# if not resampling this column, new_weight is all 1
if not columns_to_resample[key]:
for idx, v in enumerate(to_do):
num = np.array([1 for m in range(v.item())], dtype=np.float64)
num *= weights[idx]
for idx_2, w in enumerate(num):
new_weight[place + idx_2] = w.item()
place += v

# else do a dirichlet experiment to generate new_weight
else:
for idx, v in enumerate(to_do):
num = [1 for a in range(v.item())]
# multiplying by weights[idx] carries over prior columns
randos = np.random.dirichlet(num, size=None) * weights[idx] * v.item()
for idx_2, w in enumerate(randos):
new_weight[place + idx_2] = w.item()
place += v

weights = new_weight

out[:, -1] = out[:, -1] * weights
return out


@nb.jit(nopython=True, cache=True)
def weights_to_index(weights):
"""Converts a 1D array of integer weights to indices.
Expand Down Expand Up @@ -591,4 +460,3 @@ def class_make_ufunc_list(target, reference, counts):

return ufunc_list


Loading

0 comments on commit 5092391

Please sign in to comment.