Merge pull request #31 from rishi-kulkarni/development

Development
rishi-kulkarni · Jul 7, 2021 · 5092391 · 5092391
2 parents 61c48de + 12ae78a
commit 5092391
Show file tree

Hide file tree

Showing 8 changed files with 247 additions and 242 deletions.
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 ## A Hierarchical Resampling Package for Python
 
-Version 1.1.2
+Version 1.1.3
 
 hierarch is a package for hierarchical resampling (bootstrapping, permutation) of datasets in Python. Because for loops are ultimately intrinsic to cluster-aware resampling, hierarch uses Numba to accelerate many of its key functions.
 
@@ -13,6 +13,7 @@ hierarch has several functions to assist in performing resampling-based (and the
 1. [Introduction](#introduction)
 2. [Setup](#setup)
 3. [Documentation](#documentation)
+4. [Citation](#citation)
 
 
 <a name="introduction"></a>
@@ -50,4 +51,12 @@ Alternatively, you can install from Anaconda.
 
 <a name="documentation"></a>
 ## Documentation
-Check out our user guide at [readthedocs](https://hierarch.readthedocs.io/).
+Check out our user guide at [readthedocs](https://hierarch.readthedocs.io/).
+
+<a name="citation"></a>
+## Citation
+If hierarch helps you analyze your data, please consider citing it. The manuscript also
+contains a set of simulations validating hierarchical randomization tests in a variety of
+conditions.
+
+Analyzing Nested Experimental Designs – A User-Friendly Resampling Method to Determine Experimental Significance. Rishikesh U. Kulkarni, Catherine L. Wang, Carolyn R. Bertozzi. bioRxiv 2021.06.29.450439; doi: https://doi.org/10.1101/2021.06.29.450439
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,7 +1,7 @@
-sphinx==4.0.2
+sphinx==4.0.3
 sphinx_rtd_theme==0.5.2
 numpydoc==1.1.0
 numba==0.53.1
-numpy==1.20.3
-pandas==1.2.4
-scipy==1.6.3
+numpy==1.21.0
+pandas==1.3.0
+scipy==1.7.0
diff --git a/docs/user/overview.rst b/docs/user/overview.rst
@@ -54,4 +54,10 @@ The code to perform a hierarchical permutation t-test on this dataset looks like
     from hierarch.stats import hypothesis_test
 
     hypothesis_test(data, treatment_col=0,
-                    bootstraps=1000, permutations='all')
+                    bootstraps=1000, permutations='all')
+
+If you find hierarch useful for analyzing your data, please consider citing it. 
+
+Analyzing Nested Experimental Designs – A User-Friendly Resampling Method to Determine Experimental Significance
+Rishikesh U. Kulkarni, Catherine L. Wang, Carolyn R. Bertozzi
+bioRxiv 2021.06.29.450439; doi: https://doi.org/10.1101/2021.06.29.450439
diff --git a/hierarch/internal_functions.py b/hierarch/internal_functions.py
@@ -173,7 +173,7 @@ def bounded_uint(ub):
     Returns
     -------
     int
-    """    
+    """
     x = np.random.randint(low=2 ** 32)
     m = ub * x
     l = np.uint32(m)
@@ -193,7 +193,7 @@ def bounded_uint(ub):
 @nb.jit(nopython=True, cache=True)
 def nb_fast_shuffle(arr):
     """Reimplementation of Fisher-Yates shuffle using bounded_uint to generate random numbers.
-    """    
+    """
     i = arr.shape[0] - 1
     while i > 0:
         j = bounded_uint(i + 1)
@@ -211,7 +211,7 @@ def nb_strat_shuffle(arr, stratification):
         Target array.
     stratification : 1D array-like
         Ranges to shuffle within. Must be sorted.
-    """    
+    """
     for v, w in zip(stratification[:-1], stratification[1:]):
         i = w - v - 1
         while i > 0:
@@ -248,137 +248,6 @@ def id_cluster_counts(design):
     return cluster_dict
 
 
-@nb.jit(nopython=True, cache=True)
-def nb_reweighter(
-    data, columns_to_resample, clusternum_dict, start: int, shape: int, indexes=True
-):
-    """Internal function for bootstrapping a design matrix with integer
-    weights.
-
-    Parameters
-    ----------
-    data : 2D array
-        Target data to be bootstrapped.
-    columns_to_resample : 1D bool array-like
-        False for columns to be skipped in the resampling plan.
-    clusternum_dict : TypedDict
-        Hierarchy dictionary produced by id_cluster_counts
-    start : int
-        First column of the data matrix to resample
-    shape : int
-        Last column of the data matrix to resample
-    indexes : bool, optional
-        If True, returns a reindexed array. If False, returns
-        a reweighted array, by default True.
-
-    Returns
-    -------
-    2D array
-        Nonparametrically bootstrapped sample from the input data array.
-    """
-
-    out = data.astype(np.float64)
-    # at the start, everything is weighted equally
-    weights = np.array([1 for i in clusternum_dict[start]])
-
-    for key in range(start, shape):
-        # fetch design matrix info for current column
-        to_do = clusternum_dict[key]
-        # preallocate the full array for new_weight
-        new_weight = np.empty(to_do.sum(), np.int64)
-        place = 0
-
-        # if not resampling this column, new_weight is all 1
-        if not columns_to_resample[key]:
-            for idx, v in enumerate(to_do):
-                num = np.array([1 for m in range(v.item())])
-                # carry over resampled weights from previous columns
-                num *= weights[idx]
-                for idx_2, w in enumerate(num):
-                    new_weight[place + idx_2] = w.item()
-                place += v
-
-        # else do a multinomial experiment to generate new_weight
-        else:
-            for idx, v in enumerate(to_do):
-                num = v.item()
-                # num*weights[idx] carries over weights from previous columns
-                randos = np.random.multinomial(num * weights[idx], [1 / num] * num)
-                for idx_2, w in enumerate(randos):
-                    new_weight[place + idx_2] = w.item()
-                place += v
-
-        weights = new_weight
-
-    if indexes is False:
-        out[:, -1] = out[:, -1] * weights
-        return out
-    else:
-        indexes = weights_to_index(weights)
-        return out[indexes]
-
-
-@nb.jit(nopython=True, cache=True)
-def nb_reweighter_real(data, columns_to_resample, clusternum_dict, start, shape):
-    """Internal function for bootstrapping a design matrix with real number
-    weights.
-
-    Parameters
-    ----------
-    data : 2D array
-        Target data to be bootstrapped.
-    columns_to_resample : 1D bool array-like
-        False for columns to be skipped in the resampling plan.
-    clusternum_dict : TypedDict
-        Hierarchy dictionary produced by id_cluster_counts
-    start : int
-        First column of the data matrix to resample
-    shape : int
-        Last column of the data matrix to resample
-
-    Returns
-    -------
-    2D array
-        Nonparametrically bootstrapped sample from the input data array.
-    """
-
-    out = data.astype(np.float64)
-    # at the start, everything is weighted equally
-    # dype is float64 because weights can be any real number
-    weights = np.array([1 for i in clusternum_dict[start]], dtype=np.float64)
-
-    for key in range(start, shape):
-        # fetch design matrix info for current column
-        to_do = clusternum_dict[key]
-        # preallocate the full array for new_weight
-        new_weight = np.empty(to_do.sum(), np.float64)
-        place = 0
-
-        # if not resampling this column, new_weight is all 1
-        if not columns_to_resample[key]:
-            for idx, v in enumerate(to_do):
-                num = np.array([1 for m in range(v.item())], dtype=np.float64)
-                num *= weights[idx]
-                for idx_2, w in enumerate(num):
-                    new_weight[place + idx_2] = w.item()
-                place += v
-
-        # else do a dirichlet experiment to generate new_weight
-        else:
-            for idx, v in enumerate(to_do):
-                num = [1 for a in range(v.item())]
-                # multiplying by weights[idx] carries over prior columns
-                randos = np.random.dirichlet(num, size=None) * weights[idx] * v.item()
-                for idx_2, w in enumerate(randos):
-                    new_weight[place + idx_2] = w.item()
-                place += v
-
-        weights = new_weight
-
-    out[:, -1] = out[:, -1] * weights
-    return out
-
-
 @nb.jit(nopython=True, cache=True)
 def weights_to_index(weights):
     """Converts a 1D array of integer weights to indices.
@@ -591,4 +460,3 @@ def class_make_ufunc_list(target, reference, counts):
 
     return ufunc_list
 
-