alchemistry · dnlbauer · Jul 31, 2021 · Jul 31, 2021 · Jul 31, 2021 · Jul 31, 2021
diff --git a/README.md b/README.md
@@ -117,10 +117,4 @@ Alchemical Analysis with the same input files:
   Coulomb:      -41.067  +-  0.180    -41.022  +-  0.129    -41.096  +-  0.170 
   vdWaals:       11.912  +-  0.160     11.954  +-  0.111     12.022  +-  0.139 
     TOTAL:      -29.154  +-  0.241    -29.067  +-  0.170    -29.074  +-  0.220
-```
-
-# Planed features:
-- **Output of statistical inefficiencies**
-alchemical-analysis offers information about the statistical inefficiencies of the input datasets.
-- **Uncorrelation threshold**
-In alchemical-analysis it is possible to specify a threshold for the number of samples to keep in the uncorrelation process.
+```
diff --git a/flamel.py b/flamel.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import argparse
+import pandas as pd
 
 
 def get_available_plugin_ids(type):
@@ -98,14 +99,15 @@ def main():
     parser.add_argument('-q', '--suffix', dest='suffix', help='Suffix for datafile sets, i.e. \'xvg\' (default).', default='xvg')
     parser.add_argument('-e', dest='estimators', type=str, default=None, help="Comma separated Estimator methods")
     parser.add_argument('-n', '--uncorr', dest='uncorr', help='The observable to be used for the autocorrelation analysis; either \'dhdl_all\' (obtained as a sum over all energy components) or \'dhdl\' (obtained as a sum over those energy components that are changing; default) or \'dE\'. In the latter case the energy differences dE_{i,i+1} (dE_{i,i-1} for the last lambda) are used.', default='dhdl')
+    parser.add_argument('-i', '--uncorr_threshold', dest='uncorr_threshold', help='Proceed with correlated samples (N) if the number of uncorrelated samples (N_k) is found to be less than this number. If 0 is given, the time series analysis will not be performed at all. Default: 50.', default=50, type=int)
     parser.add_argument('-r', '--decimal', dest='decimal', help='The number of decimal places the free energies are to be reported with. No worries, this is for the text output only; the full-precision data will be stored in \'results.pickle\'. Default: 3.', default=3, type=int)
     parser.add_argument('-o', '--output', dest='output', type=str, default=None, help="Output methods")
     parser.add_argument('-a', '--software', dest='software', help='Package\'s name the data files come from: Gromacs, Sire, Desmond, or AMBER. Default: Gromacs.', default='Gromacs')
     parser.add_argument('-s', '--skiptime', dest='equiltime', help='Discard data prior to this specified time as \'equilibration\' data. Units picoseconds. Default: 0 ps.', default=0, type=float)
     args = parser.parse_args()
 
     parser = load_plugin_by_name('parser', args.software, args.temperature, args.prefix, args.suffix)
-    uncorrelator = load_plugin_by_name('uncorrelate', args.uncorr)
+    uncorrelator = load_plugin_by_name('uncorrelate', args.uncorr, args.uncorr_threshold)
     outputs = load_plugins('output', argsplit(args.output))
     estimators = load_plugins('estimator', argsplit(args.estimators))
 
@@ -127,15 +129,24 @@ def main():
         u_nks = parser.get_u_nks()
 
     # Step 2: Uncorrelate the data
-    if uncorrelator.needs_dhdls:
-        uncorrelator.set_dhdls(dhdls)
-    if uncorrelator.needs_u_nks:
-        uncorrelator.set_u_nks(u_nks)
-
-    if do_dhdl:
-        dhdls = uncorrelator.uncorrelate(dhdls, args.equiltime)
-    if do_u_nks:
-        u_nks = uncorrelator.uncorrelate(u_nks, args.equiltime)
+    if args.uncorr_threshold > 0:
+        if uncorrelator.needs_dhdls:
+            uncorrelator.set_dhdls(dhdls)
+        if uncorrelator.needs_u_nks:
+            uncorrelator.set_u_nks(u_nks)
+
+        if do_dhdl:
+            print("Uncorrelating dH/dl ...")
+            dhdls = uncorrelator.uncorrelate(dhdls, args.equiltime)
+        if do_u_nks:
+            print("Uncorrelating reduced potentials ...")
+            u_nks = uncorrelator.uncorrelate(u_nks, args.equiltime)
+
+    # concat data for estimators
+    if u_nks is not None:
+        u_nks = pd.concat(u_nks)
+    if dhdls is not None:
+        dhdls = pd.concat(dhdls)
 
     # Step 3: Estimate Free energy differences
     for estimator in estimators:

diff --git a/uncorrelate/statistical_inefficiency_de.py b/uncorrelate/statistical_inefficiency_de.py
@@ -33,7 +33,7 @@ def uncorrelate(self, dfs, lower):
             statinefs.append(statinef)
             i += 1
 
-        return pandas.concat(uncorrelated_dfs)
+        return uncorrelated_dfs
 
 
 def get_plugin(*args):

diff --git a/uncorrelate/statistical_inefficiency_dhdl.py b/uncorrelate/statistical_inefficiency_dhdl.py
@@ -11,6 +11,10 @@ class StatisticalInefficiencyDhdl:
     needs_u_nks = False
 
     dhdl = None
+    uncorr_threshold = None
+
+    def __init__(self, uncorr_threshold):  # uncorr_threshold: minimum number of uncorrelated samples (N_k) accepted per state
+        self.uncorr_threshold = uncorr_threshold  # uncorrelate() keeps the original, correlated frame when N_k is below this value
 
     def set_dhdls(self, dhdls):
         """
@@ -50,13 +54,22 @@ def uncorrelate(self, dfs, lower):
             dl.append(dli)
 
         uncorrelated_dfs = []
-        for dhdl_, l, df in zip(self.dhdls, dl, dfs):
+        print("Number of correlated and uncorrelated samples (Method=%s):\n\n%6s %12s %12s %12s\n" % ("dHdl", "State", "N", "N_k", "N/N_k"))
+        for idx, (dhdl_, l, df) in enumerate(zip(self.dhdls, dl, dfs)):
             ind = np.array(l, dtype=bool)
             ind = np.array(ind, dtype=int)
             dhdl_sum = dhdl_.dot(ind)
-            uncorrelated_dfs.append(alchemlyb.preprocessing.statistical_inefficiency(df, dhdl_sum, lower, conservative=False))
+            uncorrelated_df = alchemlyb.preprocessing.statistical_inefficiency(df, dhdl_sum, lower, conservative=False)
+            N, N_k = len(df), len(uncorrelated_df)
+            g = N/N_k
+            print("%6s %12s %12s %12.2f" % (idx, N, N_k, g))
+            if N_k < self.uncorr_threshold:
+                print("WARNING: Only %d uncorrelated samples found at lambda number %d; proceeding with analysis using correlated samples..." % (N_k, idx))
+                uncorrelated_dfs.append(df)
+            else:
+                uncorrelated_dfs.append(uncorrelated_df)
 
-        return pandas.concat(uncorrelated_dfs)
+        return uncorrelated_dfs
 
 
 def get_plugin(*args):
@@ -65,4 +78,4 @@ def get_plugin(*args):
     :return:
         Statitical inefficiency uncorrelator
     """
-    return StatisticalInefficiencyDhdl()
+    return StatisticalInefficiencyDhdl(*args)
diff --git a/uncorrelate/statistical_inefficiency_dhdl_all.py b/uncorrelate/statistical_inefficiency_dhdl_all.py
@@ -11,6 +11,10 @@ class StatisticalInefficiencyDhdlAll:
     needs_u_nks = False
 
     dhdl = None
+    uncorr_threshold = None
+
+    def __init__(self, uncorr_threshold):  # uncorr_threshold: minimum number of uncorrelated samples (N_k) accepted per state
+        self.uncorr_threshold = uncorr_threshold  # uncorrelate() keeps the original, correlated frame when N_k is below this value
 
     def set_dhdls(self, dhdls):
         """
@@ -29,11 +33,20 @@ def uncorrelate(self, dfs, lower):
         """
 
         uncorrelated_dfs = []
-        for dhdl_, df in zip(self.dhdls, dfs):
+        print("Number of correlated and uncorrelated samples (Method=%s):\n\n%6s %12s %12s %12s\n" % ("dHdl (all)", "State", "N", "N_k", "N/N_k"))
+        for idx, (dhdl_, df) in enumerate(zip(self.dhdls, dfs)):
             dhdl_sum = dhdl_.sum(axis=1)
-            uncorrelated_dfs.append(alchemlyb.preprocessing.statistical_inefficiency(df, dhdl_sum, lower, conservative=False))
+            uncorrelated_df = alchemlyb.preprocessing.statistical_inefficiency(df, dhdl_sum, lower, conservative=False)
+            N, N_k = len(df), len(uncorrelated_df)
+            g = N/N_k
+            print("%6s %12s %12s %12.2f" % (idx, N, N_k, g))
+            if N_k < self.uncorr_threshold:
+                print("WARNING: Only %d uncorrelated samples found at lambda number %d; proceeding with analysis using correlated samples..." % (N_k, idx))
+                uncorrelated_dfs.append(df)
+            else:
+                uncorrelated_dfs.append(uncorrelated_df)
 
-        return pandas.concat(uncorrelated_dfs)
+        return uncorrelated_dfs
 
 
 def get_plugin(*args):
@@ -42,4 +55,4 @@ def get_plugin(*args):
     :return:
         Statitical inefficiency uncorrelator using a sum of all dhdls
     """
-    return StatisticalInefficiencyDhdlAll()
+    return StatisticalInefficiencyDhdlAll(*args)