continuous logging

nataliacp · Feb 23, 2022 · b704959 · b704959
1 parent 579a0c5
commit b704959
Show file tree

Hide file tree

Showing 8 changed files with 293 additions and 222 deletions.
diff --git a/experiments/south_america/scripts/run_sa_exp.py b/experiments/south_america/scripts/run_sa_exp.py
@@ -40,7 +40,7 @@
 
             # Sample
             mc.warm_up()
-            mc.sample()
+            mc.sample(run=run)
 
             # Save samples to file
             mc.log_statistics()

diff --git a/sbayes/__main__.py b/sbayes/__main__.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-from sbayes.cli import main
+from sbayes.cli import cli
 
-main()
+cli()
diff --git a/sbayes/cli.py b/sbayes/cli.py
@@ -1,55 +1,49 @@
+import multiprocessing
+from itertools import product
+
 import argparse
 from pathlib import Path
 import tkinter as tk
 from tkinter import filedialog
 
-from sbayes.experiment_setup import Experiment
+from sbayes.experiment_setup import Experiment, update_recursive
 from sbayes.load_data import Data
-from sbayes.mcmc_setup import MCMC
 from sbayes.simulation import Simulation
+from sbayes.mcmc_setup import MCMC
+
 
+def run_experiment(
+        experiment: Experiment,
+        data: Data,
+        i_run: int,
+        custom_settings: dict = None,
+):
+    if custom_settings is not None:
+        update_recursive(experiment.config, custom_settings)
 
-def run_experiment(experiment, data, run):
     mcmc = MCMC(data=data, experiment=experiment)
     mcmc.log_setup()
 
     # Warm-up
     mcmc.warm_up()
 
     # Sample from posterior
-    mcmc.sample()
+    mcmc.sample(run=i_run)
 
     # Save samples to file
     mcmc.log_statistics()
-    mcmc.save_samples(run=run)
+    mcmc.save_samples(run=i_run)
 
     # Use the last sample as the new initial sample
     return mcmc.samples['last_sample']
 
 
-def main(config: Path = None,
-         experiment_name: str = None,
-         custom_settings: dict = None):
-    if config is None:
-        parser = argparse.ArgumentParser(
-            description="An MCMC algorithm to identify contact zones")
-        parser.add_argument("config", nargs="?", type=Path,
-                            help="The JSON configuration file")
-        parser.add_argument("name", nargs="?", type=str,
-                            help="The experiment name used for logging and as the name of the results directory.")
-        args = parser.parse_args()
-        config = args.config
-        experiment_name = args.name
-
-    # 0. Ask for config file via files-dialog, if not provided as argument.
-    if config is None:
-        tk.Tk().withdraw()
-        config = filedialog.askopenfilename(
-            title='Select a config file in JSON format.',
-            initialdir='..',
-            filetypes=(('json files', '*.json'),('all files', '*.*'))
-        )
-
+def main(
+        config: Path,
+        experiment_name: str = None,
+        custom_settings: dict = None,
+        processes: int = 1,
+):
     # Initialize the experiment
     experiment = Experiment(experiment_name=experiment_name,
                             config_file=config,
@@ -76,14 +70,30 @@ def main(config: Path = None,
         data.log_loading()
 
     # Rerun experiment to check for consistency
-    for run in range(experiment.config['mcmc']['runs']):
-        n_areas = experiment.config['model']['areas']
-        iterate_or_run(
-            x=n_areas,
-            config_setter=lambda x: experiment.config['model'].__setitem__('areas', x),
-            function=lambda x: run_experiment(experiment, data, run)
+    i_run_range = list(range(experiment.config['mcmc']['runs']))
+    n_areas_range = experiment.config['model']['areas']
+    if type(n_areas_range) not in [tuple, list, set]:
+        assert isinstance(n_areas_range, int)
+        n_areas_range = [n_areas_range]
+
+    def runner(i_run, n_areas):
+        run_experiment(
+            experiment=experiment,
+            data=data,
+            i_run=i_run,
+            custom_settings={'model': {'areas': n_areas}}
         )
 
+    for i in i_run_range:
+        for k in n_areas_range:
+            runner(i, k)
+
+    # if processes <= 1:
+    #     map(runner, product(i_run_range, n_areas_range))
+    # else:
+    #     pool = multiprocessing.Pool(processes=processes)
+    #     pool.map(runner, product(i_run_range, n_areas_range))
+
 
 def iterate_over_parameter(values, config_setter, function, print_message=None):
     """Iterate over each value in ´values´, apply ´config_setter´ (to update the config
@@ -104,5 +114,36 @@ def iterate_or_run(x, config_setter, function, print_message=None):
         function(x)
 
 
+def cli():
+    """Command line interface."""
+    parser = argparse.ArgumentParser(
+        description='An MCMC algorithm to identify contact zones')
+    parser.add_argument('config', type=Path,
+                        help='The JSON configuration file')
+    parser.add_argument('name', nargs='?', type=str,
+                        help='The experiment name used for logging and as the name of the results directory.')
+    parser.add_argument('threads', nargs='?', type=int, default=1,
+                        help='The number of parallel processes.')
+
+    args = parser.parse_args()
+
+    config = args.config
+
+    # Ask for config file via files-dialog, if not provided as argument.
+    if config is None:
+        tk.Tk().withdraw()
+        config = filedialog.askopenfilename(
+            title='Select a config file in JSON format.',
+            initialdir='..',
+            filetypes=(('json files', '*.json'), ('all files', '*.*'))
+        )
+
+    main(
+        config=config,
+        experiment_name=args.name,
+        processes=args.threads
+    )
+
+
 if __name__ == '__main__':
-    main()
+    cli()
diff --git a/sbayes/mcmc_setup.py b/sbayes/mcmc_setup.py
@@ -3,14 +3,11 @@
 
 """ Setup of the MCMC process """
 
-import numpy as np
-import typing
+import typing as tp
 
-from sbayes.postprocessing import contribution_per_area, match_areas, rank_areas
 from sbayes.sampling.zone_sampling import Sample, ZoneMCMC, ZoneMCMCWarmup
-from sbayes.util import normalize, samples2file
 from sbayes.model import Model
-from sbayes.simulation import Simulation
+from sbayes.sampling.loggers import ResultsLogger, ParametersCSVLogger, AreasLogger, LikelihoodLogger
 
 
 class MCMC:
@@ -52,15 +49,23 @@ def log_setup(self):
 Ratio of contact steps (changing gamma): {mcmc_config['operators']['contact']}
 ''')
 
-    def sample(self, lh_per_area=True, initial_sample: typing.Optional[typing.Any] = None):
+    def sample(
+            self,
+            lh_per_area=True,
+            initial_sample: tp.Optional[Sample] = None,
+            run: int = 1
+    ):
         mcmc_config = self.config['mcmc']
 
         if initial_sample is None:
             initial_sample = self.sample_from_warm_up
 
+        sample_loggers = self.get_sample_loggers(run=run)
+
         self.sampler = ZoneMCMC(
             data=self.data,
             model=self.model,
+            sample_loggers=sample_loggers,
             initial_sample=initial_sample,
             operators=mcmc_config['operators'],
             p_grow_connected=mcmc_config['grow_to_adjacent'],
@@ -72,73 +77,19 @@ def sample(self, lh_per_area=True, initial_sample: typing.Optional[typing.Any] =
         self.sampler.generate_samples(self.config['mcmc']['steps'],
                                       self.config['mcmc']['samples'])
 
-        # Evaluate likelihood and prior for each zone alone (makes it possible to rank zones)
-        if lh_per_area:
-            contribution_per_area(self.sampler)
+        # # Evaluate likelihood and prior for each zone alone (makes it possible to rank zones)
+        # if lh_per_area:
+        #     contribution_per_area(self.sampler)
 
         self.samples = self.sampler.statistics
-
-    def eval_ground_truth(self, lh_per_area=True):
-        assert isinstance(self.data, Simulation)
-        simulation = self.data
-
-        # Evaluate the likelihood of the true sample in simulated data
-        # If the model includes inheritance use all weights, if not use only the first two weights (global, zone)
-        if self.config['model']['inheritance']:
-            weights = simulation.weights.copy()
-        else:
-            weights = normalize(simulation.weights[:, :2])
-
-        ground_truth = Sample(zones=simulation.areas,
-                              weights=weights,
-                              p_global=simulation.p_universal[np.newaxis, ...],
-                              p_zones=simulation.p_contact,
-                              p_families=simulation.p_inheritance)
-
-        ground_truth.everything_changed()
-
-        self.samples['true_zones'] = simulation.areas
-        self.samples['true_weights'] = weights
-        self.samples['true_p_global'] = simulation.p_universal[np.newaxis, ...]
-        self.samples['true_p_zones'] = simulation.p_contact
-        self.samples['true_p_families'] = simulation.p_inheritance
-        self.samples['true_ll'] = self.sampler.likelihood(ground_truth, 0)
-        self.samples['true_prior'] = self.sampler.prior(ground_truth, 0)
-        self.samples["true_families"] = simulation.families
-
-        if lh_per_area:
-            ground_truth_log_lh_single_area = []
-            ground_truth_prior_single_area = []
-            ground_truth_posterior_single_area = []
-
-            for z in range(len(simulation.areas)):
-                area = simulation.areas[np.newaxis, z]
-                p_zone = simulation.p_contact[np.newaxis, z]
-
-                # Define Model
-                ground_truth_single_zone = Sample(zones=area, weights=weights,
-                                                  p_global=simulation.p_universal[np.newaxis, ...],
-                                                  p_zones=p_zone, p_families=simulation.p_inheritance)
-                ground_truth_single_zone.everything_changed()
-
-                # Evaluate Likelihood and Prior
-                lh = self.sampler.likelihood(ground_truth_single_zone, 0)
-                prior = self.sampler.prior(ground_truth_single_zone, 0)
-
-                ground_truth_log_lh_single_area.append(lh)
-                ground_truth_prior_single_area.append(prior)
-                ground_truth_posterior_single_area.append(lh + prior)
-
-                self.samples['true_lh_single_zones'] = ground_truth_log_lh_single_area
-                self.samples['true_prior_single_zones'] = ground_truth_prior_single_area
-                self.samples['true_posterior_single_zones'] = ground_truth_posterior_single_area
+        self.sampler.print_statistics()
 
     def warm_up(self):
         mcmc_config = self.config['mcmc']
-        print(mcmc_config['warmup'])
         warmup = ZoneMCMCWarmup(
             data=self.data,
             model=self.model,
+            sample_loggers=[],
             n_chains=mcmc_config['warmup']['warmup_chains'],
             operators=mcmc_config['operators'],
             p_grow_connected=mcmc_config['grow_to_adjacent'],
@@ -153,35 +104,30 @@ def warm_up(self):
                                                            warm_up=True,
                                                            warm_up_steps=self.config['mcmc']['warmup']['warmup_steps'])
 
-    def save_samples(self, run=1):
-
-        # TODO move all of this:
-        #   1. log samples during MCMC run
-        #   2. match + rang areas in summarization
-        #   3. compute simulation stats in separate script
-
-        # self.samples = match_areas(self.samples)
-        # self.samples = rank_areas(self.samples)
+    def get_sample_loggers(self, run=1) -> tp.List[ResultsLogger]:
+        k = self.model.n_zones
+        base_dir = self.path_results / f'K{k}'
+        base_dir.mkdir(exist_ok=True)
+        params_path = base_dir / f'stats_K{k}_{run}.txt'
+        areas_path = base_dir / f'areas_K{k}_{run}.txt'
+        likelihood_path = base_dir / f'likelihood_K{k}_{run}.h5'
 
-        fi = 'K{K}'.format(K=self.config['model']['areas'])
-        run = '_{run}'.format(run=run)
-        pth = self.path_results / fi
-        ext = '.txt'
-        gt_pth = pth / 'ground_truth'
+        sample_loggers = [
+            ParametersCSVLogger(params_path, self.data),
+            AreasLogger(areas_path, self.data),
+        ]
+        if not self.config['mcmc']['sample_from_prior']:
+            sample_loggers.append(LikelihoodLogger(likelihood_path, self.data))
 
-        paths = {'parameters': pth / ('stats_' + fi + run + ext),
-                 'areas': pth / ('areas_' + fi + run + ext),
-                 'gt': gt_pth / ('stats' + ext),
-                 'gt_areas': gt_pth / ('areas' + ext)}
+        return sample_loggers
 
-        pth.mkdir(exist_ok=True)
+    """ OLD METHODS FOR COMPATIBILITY """
 
-        if self.data.is_simulated:
-            self.eval_ground_truth()
-            gt_pth.mkdir(exist_ok=True)
+    def mc_eval_ground_truth(self):
+        pass
 
-        samples2file(self.samples, self.data, self.config, paths)
+    def save_samples(self, run=1):
+        pass
 
     def log_statistics(self):
-        # TODO log during MCMC run
-        self.sampler.print_statistics(self.samples)
+        pass