Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Uncorrelation #5

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 1 addition & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,4 @@ Alchemical Analysis with the same input files:
Coulomb: -41.067 +- 0.180 -41.022 +- 0.129 -41.096 +- 0.170
vdWaals: 11.912 +- 0.160 11.954 +- 0.111 12.022 +- 0.139
TOTAL: -29.154 +- 0.241 -29.067 +- 0.170 -29.074 +- 0.220
```

# Planed features:
- **Output of statistical inefficiencies**
alchemical-analysis offers information about the statistical inefficiencies of the input datasets.
- **Uncorrelation threshold**
In alchemical-analysis it is possible to specify a threshold for the number of samples to keep in the uncorrelation process.
```
31 changes: 21 additions & 10 deletions flamel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python

import argparse
import pandas as pd


def get_available_plugin_ids(type):
Expand Down Expand Up @@ -98,14 +99,15 @@ def main():
parser.add_argument('-q', '--suffix', dest='suffix', help='Suffix for datafile sets, i.e. \'xvg\' (default).', default='xvg')
parser.add_argument('-e', dest='estimators', type=str, default=None, help="Comma separated Estimator methods")
parser.add_argument('-n', '--uncorr', dest='uncorr', help='The observable to be used for the autocorrelation analysis; either \'dhdl_all\' (obtained as a sum over all energy components) or \'dhdl\' (obtained as a sum over those energy components that are changing; default) or \'dE\'. In the latter case the energy differences dE_{i,i+1} (dE_{i,i-1} for the last lambda) are used.', default='dhdl')
parser.add_argument('-i', '--uncorr_threshold', dest='uncorr_threshold', help='Proceed with correlated samples (N) if the number of uncorrelated samples (N_k) is found to be less than this number. If 0 is given, the time series analysis will not be performed at all. Default: 50.', default=50, type=int)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

adapted from alchemical-analysis. -i sounds unintuitive to me, but thats what AA is using.

parser.add_argument('-r', '--decimal', dest='decimal', help='The number of decimal places the free energies are to be reported with. No worries, this is for the text output only; the full-precision data will be stored in \'results.pickle\'. Default: 3.', default=3, type=int)
parser.add_argument('-o', '--output', dest='output', type=str, default=None, help="Output methods")
parser.add_argument('-a', '--software', dest='software', help='Package\'s name the data files come from: Gromacs, Sire, Desmond, or AMBER. Default: Gromacs.', default='Gromacs')
parser.add_argument('-s', '--skiptime', dest='equiltime', help='Discard data prior to this specified time as \'equilibration\' data. Units picoseconds. Default: 0 ps.', default=0, type=float)
args = parser.parse_args()

parser = load_plugin_by_name('parser', args.software, args.temperature, args.prefix, args.suffix)
uncorrelator = load_plugin_by_name('uncorrelate', args.uncorr)
uncorrelator = load_plugin_by_name('uncorrelate', args.uncorr, args.uncorr_threshold)
outputs = load_plugins('output', argsplit(args.output))
estimators = load_plugins('estimator', argsplit(args.estimators))

Expand All @@ -127,15 +129,24 @@ def main():
u_nks = parser.get_u_nks()

# Step 2: Uncorrelate the data
if uncorrelator.needs_dhdls:
uncorrelator.set_dhdls(dhdls)
if uncorrelator.needs_u_nks:
uncorrelator.set_u_nks(u_nks)

if do_dhdl:
dhdls = uncorrelator.uncorrelate(dhdls, args.equiltime)
if do_u_nks:
u_nks = uncorrelator.uncorrelate(u_nks, args.equiltime)
if args.uncorr_threshold > 0:
if uncorrelator.needs_dhdls:
uncorrelator.set_dhdls(dhdls)
if uncorrelator.needs_u_nks:
uncorrelator.set_u_nks(u_nks)

if do_dhdl:
print("Uncorrelating dH/dl ...")
dhdls = uncorrelator.uncorrelate(dhdls, args.equiltime)
if do_u_nks:
print("Uncorrelating reduced potentials ...")
u_nks = uncorrelator.uncorrelate(u_nks, args.equiltime)

# concat data for estimators
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved the concatenation out of the correlation analysis functions because a threshold of 0 needs to skip correlators but still concatenates for estimators.

if u_nks is not None:
u_nks = pd.concat(u_nks)
if dhdls is not None:
dhdls = pd.concat(dhdls)

# Step 3: Estimate Free energy differences
for estimator in estimators:
Expand Down
2 changes: 1 addition & 1 deletion uncorrelate/statistical_inefficiency_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def uncorrelate(self, dfs, lower):
statinefs.append(statinef)
i += 1

return pandas.concat(uncorrelated_dfs)
return uncorrelated_dfs


def get_plugin(*args):
Expand Down
21 changes: 17 additions & 4 deletions uncorrelate/statistical_inefficiency_dhdl.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ class StatisticalInefficiencyDhdl:
needs_u_nks = False

dhdl = None
uncorr_threshold = None

def __init__(self, uncorr_threshold):
self.uncorr_threshold = uncorr_threshold

def set_dhdls(self, dhdls):
"""
Expand Down Expand Up @@ -50,13 +54,22 @@ def uncorrelate(self, dfs, lower):
dl.append(dli)

uncorrelated_dfs = []
for dhdl_, l, df in zip(self.dhdls, dl, dfs):
print("Number of correlated and uncorrelated samples (Method=%s):\n\n%6s %12s %12s %12s\n" % ("dHdl", "State", "N", "N_k", "N/N_k"))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be good to use the str.format() method since we are py3 only.

for idx, (dhdl_, l, df) in enumerate(zip(self.dhdls, dl, dfs)):
ind = np.array(l, dtype=bool)
ind = np.array(ind, dtype=int)
dhdl_sum = dhdl_.dot(ind)
uncorrelated_dfs.append(alchemlyb.preprocessing.statistical_inefficiency(df, dhdl_sum, lower, conservative=False))
uncorrelated_df = alchemlyb.preprocessing.statistical_inefficiency(df, dhdl_sum, lower, conservative=False)
N, N_k = len(df), len(uncorrelated_df)
g = N/N_k
print("%6s %12s %12s %12.2f" % (idx, N, N_k, g))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be good to use the str.format() method since we are py3 only.

if N_k < self.uncorr_threshold:
print("WARNING: Only %d uncorrelated samples found at lambda number %d; proceeding with analysis using correlated samples..." % (N_k, idx))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be better to use the warning instead of print the warning.

import warning
warnings.warn("Only %d uncorrelated samples found at lambda number %d")

uncorrelated_dfs.append(df)
else:
uncorrelated_dfs.append(uncorrelated_df)

return pandas.concat(uncorrelated_dfs)
return uncorrelated_dfs


def get_plugin(*args):
Expand All @@ -65,4 +78,4 @@ def get_plugin(*args):
:return:
Statitical inefficiency uncorrelator
"""
return StatisticalInefficiencyDhdl()
return StatisticalInefficiencyDhdl(*args)
21 changes: 17 additions & 4 deletions uncorrelate/statistical_inefficiency_dhdl_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ class StatisticalInefficiencyDhdlAll:
needs_u_nks = False

dhdl = None
uncorr_threshold = None

def __init__(self, uncorr_threshold):
self.uncorr_threshold = uncorr_threshold

def set_dhdls(self, dhdls):
"""
Expand All @@ -29,11 +33,20 @@ def uncorrelate(self, dfs, lower):
"""

uncorrelated_dfs = []
for dhdl_, df in zip(self.dhdls, dfs):
print("Number of correlated and uncorrelated samples (Method=%s):\n\n%6s %12s %12s %12s\n" % ("dHdl (all)", "State", "N", "N_k", "N/N_k"))
for idx, (dhdl_, df) in enumerate(zip(self.dhdls, dfs)):
dhdl_sum = dhdl_.sum(axis=1)
uncorrelated_dfs.append(alchemlyb.preprocessing.statistical_inefficiency(df, dhdl_sum, lower, conservative=False))
uncorrelated_df = alchemlyb.preprocessing.statistical_inefficiency(df, dhdl_sum, lower, conservative=False)
N, N_k = len(df), len(uncorrelated_df)
g = N/N_k
print("%6s %12s %12s %12.2f" % (idx, N, N_k, g))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be good to use the str.format() method since we are py3 only.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. I tried to keep the style similar to the rest of the repo which still uses the % syntax but will happily change it.

if N_k < self.uncorr_threshold:
print("WARNING: Only %d uncorrelated samples found at lambda number %d; proceeding with analysis using correlated samples..." % (N_k, idx))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be good to make a base class that does this check to avoid code duplication.

uncorrelated_dfs.append(df)
else:
uncorrelated_dfs.append(uncorrelated_df)

return pandas.concat(uncorrelated_dfs)
return uncorrelated_dfs


def get_plugin(*args):
Expand All @@ -42,4 +55,4 @@ def get_plugin(*args):
:return:
Statitical inefficiency uncorrelator using a sum of all dhdls
"""
return StatisticalInefficiencyDhdlAll()
return StatisticalInefficiencyDhdlAll(*args)