Revert "Revert #163"

ACCLAB · Mar 2, 2024 · c0e694d · c0e694d
1 parent 24ca6b0
commit c0e694d
Show file tree

Hide file tree

Showing 26 changed files with 711 additions and 173 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,14 @@
+# Release notes
+
+<!-- do not remove -->
+
+## 2023.03.29
+
+### New Features
+- Add a new form of paired proportion plots for better support of Repeated Measures
+
+
+## 0.2.3
+
+### Bug Fixes
+- Fixes a bug that jammed up when the xvar column was already a pandas Categorical. Now we check for this and act appropriately.
diff --git a/README.md b/README.md
@@ -166,7 +166,6 @@ contributing](CONTRIBUTING.md), create a new issue using Feature request
 template or create a new post in [our Google
 Group](https://groups.google.com/g/estimationstats).
 
-
 ## Acknowledgements
 
 We would like to thank alpha testers from the [Claridge-Chang
@@ -179,7 +178,6 @@ Stanislav Ott.
 ## Testing
 
 To test DABEST, you need to install
-
 [pytest](https://docs.pytest.org/en/latest) and
 [nbdev](https://nbdev.fast.ai/).
 

diff --git a/dabest/_bootstrap_tools.py b/dabest/_bootstrap_tools.py
@@ -108,7 +108,15 @@ def __init__(
                 ttest_single = "NIL"
                 ttest_2_ind = "NIL"
                 ttest_2_paired = ttest_rel(x1, x2)[1]
-                wilcoxonresult = wilcoxon(x1, x2)[1]
+
+                try:
+                    wilcoxonresult = wilcoxon(x1, x2)[1]
+                except ValueError as e:
+                    warnings.warn("Wilcoxon test could not be performed. This might be due "
+                    "to no variability in the difference of the paired groups. \n"
+                    "Error: {}\n"
+                    "For detailed information, please refer to https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html "
+                    .format(e))
             mannwhitneyresult = "NIL"
 
             # Turns data into array, then tuple.

diff --git a/dabest/_dabest_object.py b/dabest/_dabest_object.py
@@ -10,7 +10,6 @@
 from scipy.stats import norm
 from scipy.stats import randint
 
-
 # %% ../nbs/API/dabest_object.ipynb 6
 class Dabest(object):
 
@@ -58,6 +57,18 @@ def __init__(
         self._check_errors(x, y, idx, experiment, experiment_label, x1_level)
 
 
+        # Check if there is NaN under any of the paired settings
+        if self.__is_paired and self.__output_data.isnull().values.any():
+            import warnings
+            warn1 = f"NaN values detected under paired setting and removed,"
+            warn2 = f" please check your data."
+            warnings.warn(warn1 + warn2)
+            if x is not None and y is not None:
+                rmname = self.__output_data[self.__output_data[y].isnull()][self.__id_col].tolist()
+                self.__output_data = self.__output_data[~self.__output_data[self.__id_col].isin(rmname)]
+            elif x is None and y is None:
+                self.__output_data.dropna(inplace=True)
+
         # create new x & idx and record the second variable if this is a valid 2x2 ANOVA case
         if idx is None and x is not None and y is not None:
             # Add a length check for unique values in the first element in list x,
@@ -442,26 +453,47 @@ def _check_errors(self, x, y, idx, experiment, experiment_label, x1_level):
                 raise ValueError(err0)
 
             # Check if the columns stated are valid
-            # TODO instead of traversing twice idx you can traverse only once
-            # and break the loop if the condition is not satisfied?
-            # TODO What if the type is not str and not tuple,list? missing raise Error
-            if all([isinstance(i, str) for i in idx]):
-                if len(pd.unique([t for t in idx]).tolist()) != 2:
+            # Initialize a flag to track if any element in idx is neither str nor (tuple, list)
+            valid_types = True
+
+            # Initialize variables to track the conditions for str and (tuple, list)
+            is_str_condition_met, is_tuple_list_condition_met = False, False
+
+            # Single traversal for optimization
+            for item in idx:
+                if isinstance(item, str):
+                    is_str_condition_met = True
+                elif isinstance(item, (tuple, list)) and len(item) == 2:
+                    is_tuple_list_condition_met = True
+                else:
+                    valid_types = False
+                    break  # Exit the loop if an invalid type is found
+
+            # Check if all types are valid
+            if not valid_types:
+                err0 = "`mini_meta` is True, but `idx` ({})".format(idx)
+                err1 = "does not contain exactly 2 unique columns."
+                raise ValueError(err0 + err1)
+
+            # Handling str type condition
+            if is_str_condition_met:
+                if len(pd.unique(idx).tolist()) != 2:
                     err0 = "`mini_meta` is True, but `idx` ({})".format(idx)
-                    err1 = "does not contain exactly 2 columns."
+                    err1 = "does not contain exactly 2 unique columns."
                     raise ValueError(err0 + err1)
 
-            if all([isinstance(i, (tuple, list)) for i in idx]):
+            # Handling (tuple, list) type condition
+            if is_tuple_list_condition_met:
                 all_idx_lengths = [len(t) for t in idx]
                 if (array(all_idx_lengths) != 2).any():
-                    err1 = "`mini_meta` is True, but some idx "
-                    err2 = "in {} does not consist only of two groups.".format(idx)
+                    err1 = "`mini_meta` is True, but some elements in idx "
+                    err2 = "in {} do not consist only of two groups.".format(idx)
                     raise ValueError(err1 + err2)
 
-        # TODO can you have True mini_meta and delta2 at the same time?
+
         # Check if this is a 2x2 ANOVA case and x & y are valid columns
         # Create experiment_label and x1_level
-        if self.__delta2:
+        elif self.__delta2:
             if x is None:
                 error_msg = "If `delta2` is True. `x` parameter cannot be None. String or list expected"
                 raise ValueError(error_msg)
@@ -534,7 +566,6 @@ def _check_errors(self, x, y, idx, experiment, experiment_label, x1_level):
             else:
                 x1_level = self.__output_data[x[0]].unique()
 
-        # TODO what if experiment is None?
         elif experiment:
             experiment_label = self.__output_data[experiment].unique()
             x1_level = self.__output_data[x[0]].unique()
@@ -545,7 +576,16 @@ def _get_plot_data(self, x, y, all_plot_groups):
         """
         Function to prepare some attributes for plotting
         """
-
+        # Check if there is NaN under any of the paired settings
+        if self.__is_paired is not None and self.__output_data.isnull().values.any():
+            print("Nan")
+            import warnings
+            warn1 = f"NaN values detected under paired setting and removed,"
+            warn2 = f" please check your data."
+            warnings.warn(warn1 + warn2)
+            rmname = self.__output_data[self.__output_data[y].isnull()][self.__id_col].tolist()
+            self.__output_data = self.__output_data[~self.__output_data[self.__id_col].isin(rmname)]
+
         # Identify the type of data that was passed in.
         if x is not None and y is not None:
             # Assume we have a long dataset.
@@ -589,6 +629,13 @@ def _get_plot_data(self, x, y, all_plot_groups):
             self.__xvar = "group"
             self.__yvar = "value"
 
+            # Check if there is NaN under any of the paired settings
+            if self.__is_paired is not None and self.__output_data.isnull().values.any():
+                import warnings
+                warn1 = f"NaN values detected under paired setting and removed,"
+                warn2 = f" please check your data."
+                warnings.warn(warn1 + warn2)
+
             # First, check we have all columns in the dataset.
             for g in all_plot_groups:
                 if g not in self.__output_data.columns:
@@ -611,10 +658,7 @@ def _get_plot_data(self, x, y, all_plot_groups):
         # Added in v0.2.7.
         plot_data.dropna(axis=0, how="any", subset=[self.__yvar], inplace=True)
 
-        # TODO these comments should not be in the code but on the release notes of the package version
-        # Lines 131 to 140 added in v0.2.3.
-        # Fixes a bug that jammed up when the xvar column was already
-        # a pandas Categorical. Now we check for this and act appropriately.
+
         if isinstance(plot_data[self.__xvar].dtype, pd.CategoricalDtype):
             plot_data[self.__xvar].cat.remove_unused_categories(inplace=True)
             plot_data[self.__xvar].cat.reorder_categories(

diff --git a/dabest/_delta_objects.py b/dabest/_delta_objects.py
@@ -555,7 +555,7 @@ def __repr__(self, header=True, sigfig=3):
         bs = bs1 + bs2
 
         pval_def1 = "Any p-value reported is the probability of observing the" + \
-                    "effect size (or greater),\nassuming the null hypothesis of" + \
+                    "effect size (or greater),\nassuming the null hypothesis of " + \
                     "zero difference is true."
         pval_def2 = "\nFor each p-value, 5000 reshuffles of the " + \
                     "control and test labels were performed."

diff --git a/dabest/_effsize_objects.py b/dabest/_effsize_objects.py
@@ -219,7 +219,7 @@ def __repr__(self, show_resample_count=True, define_pval=True, sigfig=3):
 
         pval_def1 = (
             "Any p-value reported is the probability of observing the"
-            + "effect size (or greater),\nassuming the null hypothesis of"
+            + "effect size (or greater),\nassuming the null hypothesis of "
             + "zero difference is true."
         )
         pval_def2 = (
@@ -299,7 +299,6 @@ def _compute_bca_intervals(self, sorted_bootstraps):
                 )
 
         else:
-            # TODO improve error handling, separate file with error messages?
             err1 = "The $lim_type limit of the BCa interval cannot be computed."
             err2 = "It is set to the effect size itself."
             err3 = "All bootstrap values were likely all the same."
@@ -330,9 +329,16 @@ def _perform_statistical_test(self):
 
         if self.__is_paired and not self.__proportional:
             # Wilcoxon, a non-parametric version of the paired T-test.
-            wilcoxon = spstats.wilcoxon(self.__control, self.__test)
-            self.__pvalue_wilcoxon = wilcoxon.pvalue
-            self.__statistic_wilcoxon = wilcoxon.statistic
+            try:
+                wilcoxon = spstats.wilcoxon(self.__control, self.__test)
+                self.__pvalue_wilcoxon = wilcoxon.pvalue
+                self.__statistic_wilcoxon = wilcoxon.statistic
+            except ValueError as e:
+                warnings.warn("Wilcoxon test could not be performed. This might be due "
+                    "to no variability in the difference of the paired groups. \n"
+                    "Error: {}\n"
+                    "For detailed information, please refer to https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html "
+                    .format(e))
 
             if self.__effect_size != "median_diff":
                 # Paired Student's t-test.
@@ -357,6 +363,16 @@ def _perform_statistical_test(self):
             self.__pvalue_mcnemar = _mcnemar.pvalue
             self.__statistic_mcnemar = _mcnemar.statistic
 
+        elif self.__proportional:
+            # The Cohen's h calculation is for binary categorical data
+            try:
+                self.__proportional_difference = es.cohens_h(
+                    self.__control, self.__test
+                )
+            except ValueError as e:
+                warnings.warn(f"Calculation of Cohen's h failed. This method is applicable "
+                  f"only for binary data (0's and 1's). Details: {e}")
+
         elif self.__effect_size == "cliffs_delta":
             # Let's go with Brunner-Munzel!
             brunner_munzel = spstats.brunnermunzel(
@@ -398,23 +414,13 @@ def _perform_statistical_test(self):
                 )
                 self.__pvalue_mann_whitney = mann_whitney.pvalue
                 self.__statistic_mann_whitney = mann_whitney.statistic
-            except ValueError:
-                # TODO At least print some warning?
-                # Occurs when the control and test are exactly identical
-                # in terms of rank (eg. all zeros.)
-                pass
+            except ValueError as e:
+                warnings.warn("Mann-Whitney test could not be performed. This might be due "
+                  "to identical rank values in both control and test groups. "
+                  "Details: {}".format(e))
 
             standardized_es = es.cohens_d(self.__control, self.__test, is_paired=None)
 
-            # The Cohen's h calculation is for binary categorical data
-            try:
-                self.__proportional_difference = es.cohens_h(
-                    self.__control, self.__test
-                )
-            except ValueError:
-                # TODO At least print some warning?
-                # Occur only when the data consists not only 0's and 1's.
-                pass
 
     def to_dict(self):
         """
@@ -567,87 +573,79 @@ def statistic_mcnemar(self):
 
     @property
     def pvalue_paired_students_t(self):
-        # TODO Missing docstring
         try:
             return self.__pvalue_paired_students_t
         except AttributeError:
             return npnan
 
     @property
     def statistic_paired_students_t(self):
-        # TODO Missing docstring
         try:
             return self.__statistic_paired_students_t
         except AttributeError:
             return npnan
 
     @property
     def pvalue_kruskal(self):
-        # TODO Missing docstring
         try:
             return self.__pvalue_kruskal
         except AttributeError:
             return npnan
 
     @property
     def statistic_kruskal(self):
-        # TODO Missing docstring
         try:
             return self.__statistic_kruskal
         except AttributeError:
             return npnan
 
     @property
     def pvalue_welch(self):
-        # TODO Missing docstring
         try:
             return self.__pvalue_welch
         except AttributeError:
             return npnan
 
     @property
     def statistic_welch(self):
-        # TODO Missing docstring
         try:
             return self.__statistic_welch
         except AttributeError:
             return npnan
 
     @property
     def pvalue_students_t(self):
-        # TODO Missing docstring
         try:
             return self.__pvalue_students_t
         except AttributeError:
             return npnan
 
     @property
     def statistic_students_t(self):
-        # TODO Missing docstring
         try:
             return self.__statistic_students_t
         except AttributeError:
             return npnan
 
     @property
     def pvalue_mann_whitney(self):
-        # TODO Missing docstring
         try:
             return self.__pvalue_mann_whitney
         except AttributeError:
             return npnan
 
     @property
     def statistic_mann_whitney(self):
-        # TODO Missing docstring
         try:
             return self.__statistic_mann_whitney
         except AttributeError:
             return npnan
 
     @property
     def pvalue_permutation(self):
-        # TODO Missing docstring
+        """
+        The p-value of the permutation test.
+        """
         return self.__PermutationTest_result.pvalue
 
     @property
@@ -663,12 +661,10 @@ def permutations(self):
 
     @property
     def permutations_var(self):
-         # TODO Missing docstring
         return self.__PermutationTest_result.permutations_var
 
     @property
     def proportional_difference(self):
-         # TODO Missing docstring
         try:
             return self.__proportional_difference
         except AttributeError:

diff --git a/dabest/_stats_tools/confint_1group.py b/dabest/_stats_tools/confint_1group.py
@@ -38,7 +38,9 @@ def compute_1group_jackknife(x, func, *args, **kwargs):
 
 
 def compute_1group_acceleration(jack_dist):
-    # TODO is it needed a function to just call one line?
+    """
+    Returns the acceleration value based on the jackknife distribution.
+    """
     from . import confint_2group_diff as ci_2g
 
     return ci_2g._calc_accel(jack_dist)