From ccd7ffde0f3c82d110d2808b8f6ecd1c5aa8f920 Mon Sep 17 00:00:00 2001 From: emilleishida Date: Mon, 24 Jul 2023 01:17:37 +0200 Subject: [PATCH 01/12] select inputs for exposure time calculator --- resspect/time_domain_snpcc.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/resspect/time_domain_snpcc.py b/resspect/time_domain_snpcc.py index 24c8fc58..c653ede0 100644 --- a/resspect/time_domain_snpcc.py +++ b/resspect/time_domain_snpcc.py @@ -227,14 +227,21 @@ def _update_queryable_if_get_cost( Default is [4, 8]. spectroscopic_snr SNR required for spectroscopic follow-up. Default is 10. - kwargs + kwargs: dict Any input required by ExpTimeCalc.findexptime function. """ + local_keys = ['mag', 'SNRin', 'cwl_nm', 'bandpass_nm', 'band', 'airmass', 'skymode', + 'skymag', 'nread', 'skyADU', 'fwhm'] + + kwargs2 = {} + for name in kwargs.keys(): + kwargs2[name] = kwargs[name] + for index in range(self._number_of_telescopes): light_curve_data.calc_exp_time( telescope_diam=telescope_sizes[index], telescope_name=telescope_names[index], - SNR=spectroscopic_snr, **kwargs + SNR=spectroscopic_snr, **kwargs2 ) return light_curve_data @@ -431,7 +438,9 @@ def build_one_epoch(self, raw_data_dir: str, day_of_survey: int, self._number_of_telescopes = len(tel_names) multi_process = multiprocessing.Pool(number_of_processors) - logging.info("Starting SNPCC time domian features extraction...") + + logging.info("Starting SNPCC time domain features extraction...") + with open(self._features_file_name, 'a') as snpcc_features_file: iterator_list = zip( files_list, repeat(raw_data_dir), repeat(queryable_criteria), From 0d5c8d1849b65a5b43337fecbf441e77b28d18e4 Mon Sep 17 00:00:00 2001 From: emilleishida Date: Mon, 24 Jul 2023 02:08:11 +0200 Subject: [PATCH 02/12] fix generate time domain for snpcc --- docs/prepare_time_domain.rst | 12 ++++++++++-- resspect/scripts/build_time_domain_snpcc.py | 17 ++++++++++++----- resspect/time_domain_snpcc.py | 14 ++++++++------ 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/docs/prepare_time_domain.rst b/docs/prepare_time_domain.rst index 6c3164c2..939254b0 100644 --- a/docs/prepare_time_domain.rst +++ b/docs/prepare_time_domain.rst @@ -47,14 +47,22 @@ You can perform the entire analysis for one day of the survey using the `SNPCCPh >>> day = 20 >>> queryable_criteria = 2 >>> get_cost = True + >>> feature_extractor = 'bazin' + >>> tel_sizes=[4, 8] + >>> tel_names = ['4m', '8m'] + >>> spec_SNR = 10 + >>> number_of_processors = 5 >>> data = SNPCCPhotometry() >>> data.create_daily_file(output_dir=output_dir, day=day, >>> get_cost=get_cost) >>> data.build_one_epoch(raw_data_dir=path_to_data, >>> day_of_survey=day, time_domain_dir=output_dir, + >>> feature_extractor=feature_extractor, >>> queryable_criteria=queryable_criteria, - >>> get_cost=get_cost) + >>> get_cost=get_cost, tel_sizes=tel_sizes, + >>> tel_names=tel_names, spec_SNR=spec_SNR, + >>> number_of_processors=number_of_processors) Alternatively you can use the command line to prepare a sequence of days in one batch: @@ -62,7 +70,7 @@ Alternatively you can use the command line to prepare a sequence of days in one .. 
code-block:: bash >>> build_time_domain_snpcc.py -d 20 21 22 23 -p - >>> -o -q 2 -c True + >>> -o -q 2 -c True -nc 5 For PLASTiCC ^^^^^^^^^^^^ diff --git a/resspect/scripts/build_time_domain_snpcc.py b/resspect/scripts/build_time_domain_snpcc.py index 4c807141..17cad0f6 100644 --- a/resspect/scripts/build_time_domain_snpcc.py +++ b/resspect/scripts/build_time_domain_snpcc.py @@ -49,13 +49,15 @@ def build_time_domain_snpcc(user_choice): -t: sequence (optional) Primary mirrors diameters of potential spectroscopic telescopes. Only used if "get_cost == True". Default is [4, 8]. + -nc: int (optional) + Number of cores used in calculation. Default is 1. Examples ------- Use it directly from the command line. >>> build_time_domain.py -d 20 21 22 23 -p - >>> -o -q 2 -c True + >>> -o -q 2 -c True -nc 10 """ path_to_data = user_choice.raw_data_dir output_dir = user_choice.output @@ -67,17 +69,19 @@ def build_time_domain_snpcc(user_choice): tel_sizes = user_choice.tel_sizes tel_names = user_choice.tel_names spec_SNR = user_choice.spec_SNR + number_of_processors = user_choice.n_cores for item in day: data = SNPCCPhotometry() data.create_daily_file(output_dir=output_dir, day=item, get_cost=get_cost) data.build_one_epoch(raw_data_dir=path_to_data, day_of_survey=int(item), time_domain_dir=output_dir, - feature_method=feature_method, + feature_extractor=feature_method, days_since_obs=days_since_obs, queryable_criteria=queryable_criteria, get_cost=get_cost, tel_sizes=tel_sizes, - tel_names=tel_names, spec_SNR=spec_SNR) + tel_names=tel_names, spec_SNR=spec_SNR, + number_of_processors=number_of_processors) def main(): @@ -102,8 +106,8 @@ def main(): parser.add_argument('-c', '--calculate-cost', dest='get_cost', default=False, help='Calculate cost of spectra in each day.') parser.add_argument('-f', '--feature-method', dest='feature_method', type=str, - required=False, default='Bazin', help='Feature extraction method. ' + \ - 'Only "Bazin" is accepted at the moment.') + required=False, default='bazin', help='Feature extraction method. ' + \ + 'Only "bazin" is accepted at the moment.') parser.add_argument('-g', '--days-since-obs', dest='days_since_obs', required=False, type=int, default=2, help='Gap in days since last observation ' + \ 'when the measured magnitude can be used for spectroscopic ' + \ @@ -118,6 +122,9 @@ def main(): parser.add_argument('-snr', '--spec-SNR', dest='spec_SNR', required=False, default=10, help='SNR required for spectroscopic follow-up. ' + \ 'Default is 10.') + parser.add_argument('-nc', '--n-cores', dest='n_cores', required=False, + default=1, help='Number of cores used. ' + \ + 'Default is 1.', type=int) # get input directory and output file name from user from_user = parser.parse_args() diff --git a/resspect/time_domain_snpcc.py b/resspect/time_domain_snpcc.py index c653ede0..7a93b57e 100644 --- a/resspect/time_domain_snpcc.py +++ b/resspect/time_domain_snpcc.py @@ -64,11 +64,11 @@ class SNPCCPhotometry: Get minimum and maximum MJD for complete sample. create_daily_file(raw_data_dir: str, day: int, output_dir: str, header: str) - Create one file for a given day of the survey. + Creates one file for a given day of the survey. Only populates the file with header. It will erase existing files! build_one_epoch(raw_data_dir: str, day_of_survey: int, - time_domain_dir: str, feature_method: str, + time_domain_dir: str, feature_extractor: str, dataset: str) Selects objects with observed points until given MJD, performs feature extraction and evaluate if query is possible. 
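The hunks that follow switch the daily output files to comma-separated values and restrict which keyword arguments are forwarded to the exposure time calculator. The filtering amounts to the sketch below; `local_keys` is copied from the patch, while the `screen` entry is a made-up stray argument used only to show what gets dropped:

.. code-block:: python

    >>> # inputs understood by ExpTimeCalc.findexptime, from the patch
    >>> local_keys = ['mag', 'SNRin', 'cwl_nm', 'bandpass_nm', 'band', 'airmass',
    >>>               'skymode', 'skymag', 'nread', 'skyADU', 'fwhm']

    >>> kwargs = {'airmass': 1.2, 'skymag': 21.0, 'screen': True}

    >>> # keep only the keys the calculator understands
    >>> kwargs2 = {name: value for name, value in kwargs.items()
    >>>            if name in local_keys}
    >>> kwargs2
    {'airmass': 1.2, 'skymag': 21.0}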
@@ -109,7 +109,7 @@ def create_daily_file(self, output_dir: str, """ maybe_create_directory(output_dir) self._features_file_name = os.path.join( - output_dir, 'day_' + str(day) + '.dat') + output_dir, 'day_' + str(day) + '.csv') logging.info('Creating features file') with open(self._features_file_name, 'w') as features_file: if feature_extractor not in FEATURE_EXTRACTOR_HEADERS_MAPPING: @@ -120,7 +120,7 @@ def create_daily_file(self, output_dir: str, self._header = FEATURE_EXTRACTOR_HEADERS_MAPPING[ feature_extractor]['snpcc_header_with_cost'] - features_file.write(' '.join(self._header) + '\n') + features_file.write(','.join(self._header) + '\n') def _verify_telescope_names(self, telescope_names: list, get_cost: bool): """ @@ -233,9 +233,11 @@ def _update_queryable_if_get_cost( local_keys = ['mag', 'SNRin', 'cwl_nm', 'bandpass_nm', 'band', 'airmass', 'skymode', 'skymag', 'nread', 'skyADU', 'fwhm'] + # select input for exposure time calculator kwargs2 = {} for name in kwargs.keys(): - kwargs2[name] = kwargs[name] + if name in local_keys: + kwargs2[name] = kwargs[name] for index in range(self._number_of_telescopes): light_curve_data.calc_exp_time( @@ -452,7 +454,7 @@ def build_one_epoch(self, raw_data_dir: str, day_of_survey: int, features_to_write = self._get_features_to_write( light_curve_data, get_cost, tel_names) snpcc_features_file.write( - ' '.join(str(each_feature) for each_feature + ','.join(str(each_feature) for each_feature in features_to_write) + '\n') logging.info("Features have been saved to: %s", self._features_file_name) From 61062c8bef88aa19d81360c38e8eff663e6fb094 Mon Sep 17 00:00:00 2001 From: emilleishida Date: Mon, 24 Jul 2023 23:13:55 +0200 Subject: [PATCH 03/12] fix time domain loop --- docs/learn_loop.rst | 6 +- resspect/__init__.py | 5 +- resspect/samples_utils.py | 122 +++++++++++++++++++++++++++++++++++ resspect/time_domain_loop.py | 25 ++++--- 4 files changed, 140 insertions(+), 18 deletions(-) create mode 100644 resspect/samples_utils.py diff --git a/docs/learn_loop.rst b/docs/learn_loop.rst index 6e50fe90..bd7aee30 100644 --- a/docs/learn_loop.rst +++ b/docs/learn_loop.rst @@ -122,6 +122,7 @@ following the same algorithm described in `Ishida et al., 2019 >> strategy = 'UncSampling' # learning strategy >>> batch = 1 # if int, ignore cost per observation, if None find optimal batch size >>> sep_files = False # if True, expects train, test and validation samples in separate files + >>> budgets = None >>> path_to_features_dir = 'results/time_domain/' # folder where the files for each day are stored @@ -148,13 +149,12 @@ following the same algorithm described in `Ishida et al., 2019 >> output_queried_file=output_query_file, path_to_ini_files=path_to_ini_files, >>> path_to_features_dir=path_to_features_dir, >>> strategy=strategy, fname_pattern=fname_pattern, batch=batch, classifier=classifier, - >>> sep_files=sep_files, + >>> sep_files=sep_files, budgets=budgets, >>> screen=screen, initial_training=training, >>> survey=survey, queryable=queryable, n_estimators=n_estimators) -Make sure you check the full documentation of the module to understand which variables are required depending -on the case you wish to run. +Make sure you check the full documentation of the module to understand which variables are required depending on the case you wish to run. More details can be found in the corresponding `docstring `_. 
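The `budgets` option introduced in the example above is validated inside `time_domain_loop`; its bookkeeping reduces to the sketch below. The six-hour values are placeholders, and the check mirrors the `time_domain_loop.py` hunk later in this patch:

.. code-block:: python

    >>> import numpy as np

    >>> budgets = (6. * 3600, 6. * 3600)  # seconds per telescope per night
    >>> learning_days = [20, 180]

    >>> # either one budget per telescope, or one per telescope per night
    >>> n_nights = len(np.arange(learning_days[0], learning_days[1]))
    >>> if budgets is not None and len(budgets) not in [2, n_nights]:
    >>>     raise ValueError('There must be 1 budget per telescope or '
    >>>                      '1 budget per telescope per night!')

    >>> # the same budget tuple is then attached to every night of the survey
    >>> budgets_dict = {epoch: budgets
    >>>                 for epoch in range(learning_days[0], learning_days[-1] - 1)}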
diff --git a/resspect/__init__.py b/resspect/__init__.py
index 176f9b87..a068f197 100644
--- a/resspect/__init__.py
+++ b/resspect/__init__.py
@@ -25,8 +25,9 @@
 from .fit_lightcurves import *
 from .learn_loop import *
 from .metrics import *
-from .query_strategies import *
 from .plot_results import *
+from .query_strategies import *
+from .samples_utils import *
 from .snana_fits_to_pd import *
 from .scripts.build_canonical import build_canonical as build_canonical
 from .scripts.build_time_domain_snpcc import build_time_domain_snpcc as build_time_domain_snpcc
@@ -88,9 +89,11 @@
     'purity',
     'random_forest',
     'random_sampling',
+    'read_features_fullLC_samples',
     'read_fits',
     'run_loop',
     'run_time_domain',
+    'sep_samples',
     'SNPCCPhotometry',
     'svm',
     'time_domain_loop',
diff --git a/resspect/samples_utils.py b/resspect/samples_utils.py
new file mode 100644
index 00000000..3452db29
--- /dev/null
+++ b/resspect/samples_utils.py
@@ -0,0 +1,122 @@
+# Copyright 2023 resspect software
+# Author: Emille Ishida
+#
+# created on 24 July 2023
+#
+# Licensed GNU General Public License v3.0;
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.gnu.org/licenses/gpl-3.0.en.html
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['sep_samples', 'read_features_fullLC_samples']
+
+import os
+import pandas as pd
+import numpy as np
+import glob
+
+def sep_samples(all_ids: np.array, n_test_val: int,
+                n_train: int):
+    """
+    Separate train, test and validation samples.
+
+    Parameters
+    ----------
+    all_ids: np.array
+        Array with all available object ids.
+    n_test_val: int
+        Number of objects to be added to test and validation.
+        All the remaining ones will be assigned to the query sample.
+    n_train: int
+        Number of objects in training sample. This should be
+        enough to allow testing of multiple initial conditions.
+        Set it to at least 10x the size of the initial sample
+        within the learning loop.
+
+    Returns
+    -------
+    dict
+        keys are the sample names, values are the ids of
+        objects in each sample.
+    """
+    samples = {}
+
+    # separate ids for training
+    samples['train'] = np.random.choice(all_ids, size=n_train, replace=False)
+    train_flag = np.isin(all_ids, samples['train'])
+
+    # separate ids for test and validation
+    samples['test'] = np.random.choice(all_ids[~train_flag], size=n_test_val,
+                                       replace=False)
+    test_flag = np.isin(all_ids, samples['test'])
+    test_train_flag = np.logical_or(train_flag, test_flag)
+
+    samples['val'] = np.random.choice(all_ids[~test_train_flag], size=n_test_val,
+                                      replace=False)
+    val_flag = np.isin(all_ids, samples['val'])
+    val_test_train_flag = np.logical_or(test_train_flag, val_flag)
+
+    samples['query'] = all_ids[~val_test_train_flag]
+
+    return samples
+
+
+def read_features_fullLC_samples(sample_ids: np.array,
+                                 output_fname: str, path_to_features: str,
+                                 id_key='id'):
+    """
+    Create separate files for full light curve samples.
+
+    Parameters
+    ----------
+    sample_ids: np.array
+        Array of ids to be added to the sample.
+    output_fname: str
+        Filename where the sample will be saved.
+        If 'path_to_features' is a directory, this should
+        be pattern without extension.
+    path_to_features: str
+        Full path for where the features are stored.
+        It can be a directory or a file.
+        All files should be csv.
+    id_key: str (optional)
+        String identifying the object id column.
+        Default is 'id'.
+
+    Returns
+    -------
+    None
+        Save samples to file.
+    """
+
+    # read features
+    if os.path.isfile(path_to_features):
+        data_temp = pd.read_csv(path_to_features, index_col=False)
+        flag = np.isin(data_temp[id_key].values, sample_ids)
+        data = data_temp[flag]
+        data.to_csv(output_fname, index=False)
+
+    elif os.path.isdir(path_to_features):
+        flist = glob.glob(os.path.join(path_to_features, '*.csv'))
+
+        for i in range(len(flist)):
+            data_temp = pd.read_csv(flist[i])
+            flag = np.isin(data_temp[id_key].values, sample_ids)
+            data = data_temp[flag]
+            data.to_csv(output_fname + str(i + 1) + '.csv', index=False)
+
+    return None
+
+
+def main():
+    return None
+
+if __name__ == '__main__':
+    main()
diff --git a/resspect/time_domain_loop.py b/resspect/time_domain_loop.py
index 78097a2f..312336c4 100644
--- a/resspect/time_domain_loop.py
+++ b/resspect/time_domain_loop.py
@@ -32,7 +32,7 @@ def load_dataset(file_names_dict: dict, survey_name: str = 'DES',
                  is_separate_files: bool = False, samples_list: list = [None],
                  is_load_build_samples: bool = True, number_of_classes: int = 2,
-                 feature_extraction_method: str = 'Bazin',
+                 feature_extraction_method: str = 'bazin',
                  is_save_samples: bool = False) -> DataBase:
     """
     Reads a data sample from file.
@@ -70,8 +70,8 @@ def load_dataset(file_names_dict: dict, survey_name: str = 'DES',
         Currently only nclass == 2 is implemented.
     feature_extraction_method: str (optional)
         Feature extraction method. The current implementation only
-        accepts method=='Bazin' or 'photometry'.
-        Default is 'Bazin'.
+        accepts method=='bazin' or 'photometry'.
+        Default is 'bazin'.
     is_save_samples: bool (optional)
         If True, save training and test samples to file.
         Default is False.
@@ -84,7 +84,7 @@ def load_dataset(file_names_dict: dict, survey_name: str = 'DES', for sample in samples_list: database_class.load_features( file_names_dict[sample], survey=survey_name, sample=sample, - method=feature_extraction_method) + feature_extractor=feature_extraction_method) if is_load_build_samples: database_class.build_samples( initial_training=initial_training, nclass=number_of_classes, @@ -100,7 +100,7 @@ def _load_first_loop_and_full_data( initial_training: Union[str, int] = 'original', ia_training_fraction: float = 0.5, is_queryable: bool = False, is_separate_files: bool = False, number_of_classes: int = 2, - feature_extraction_method: str = 'Bazin', + feature_extraction_method: str = 'bazin', is_save_samples: bool = False) -> Tuple[DataBase, DataBase]: """ Loads first loop and initial light curve training data @@ -871,16 +871,13 @@ def run_time_domain_active_learning_loop( learning_days = [int(each_day) for each_day in learning_days] # create dictionary with budgets - if bool(budgets): - if len(budgets) not in [2, len(np.arange(learning_days[0], learning_days[1]))]: - raise ValueError('There must be 1 budget per telescope or ' + \ + if budgets is not None and len(budgets) not in [2, len(np.arange(learning_days[0], learning_days[1]))]: + raise ValueError('There must be 1 budget per telescope or ' + \ '1 budget per telescope per night!') - - c = 0 - budgets_dict = {} - for epoch in range(learning_days[0], learning_days[-1] - 1): - budgets_dict[epoch] = list(budgets)[c] - c = c + 1 + + budgets_dict = {} + for epoch in range(learning_days[0], learning_days[-1] - 1): + budgets_dict[epoch] = budgets for epoch in progressbar.progressbar( range(learning_days[0], learning_days[-1] - 1)): From 104d4054630fe29124bb42b641744e7f17bc9e77 Mon Sep 17 00:00:00 2001 From: emilleishida Date: Mon, 24 Jul 2023 23:27:23 +0200 Subject: [PATCH 04/12] update docs --- docs/learn_loop.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/learn_loop.rst b/docs/learn_loop.rst index bd7aee30..83c7199e 100644 --- a/docs/learn_loop.rst +++ b/docs/learn_loop.rst @@ -110,7 +110,9 @@ Active Learning loop in time domain =================================== Considering that you have previously prepared the time domain data, you can run the active learning loop -following the same algorithm described in `Ishida et al., 2019 `_ by using the :py:mod:`resspect.time_domain_loop` module: +following the same algorithm described in `Ishida et al., 2019 `_ by using the :py:mod:`resspect.time_domain_loop` module. + +.. note:: The code below requires a file with features extracted from full light curves from which the initial sample will be drawn. .. 
code-block:: python :linenos: @@ -121,8 +123,7 @@ following the same algorithm described in `Ishida et al., 2019 >> training = 'original' # if int take int number of objects for initial training, 50% being Ia >>> strategy = 'UncSampling' # learning strategy >>> batch = 1 # if int, ignore cost per observation, if None find optimal batch size - >>> sep_files = False # if True, expects train, test and validation samples in separate files - >>> budgets = None + >>> sep_files = False # if True, expects train, test and validation samples in separate filess >>> path_to_features_dir = 'results/time_domain/' # folder where the files for each day are stored From e50be9cc5e43ba91eb8665fe21b8ed1924da3b2e Mon Sep 17 00:00:00 2001 From: emilleishida Date: Tue, 25 Jul 2023 18:29:50 +0200 Subject: [PATCH 05/12] enable time domain for snpcc --- docs/learn_loop.rst | 53 ++++++++++++++++++++++++++++++------ resspect/time_domain_loop.py | 2 +- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/docs/learn_loop.rst b/docs/learn_loop.rst index 83c7199e..53a5258b 100644 --- a/docs/learn_loop.rst +++ b/docs/learn_loop.rst @@ -16,7 +16,7 @@ For start, we can load the feature information: >>> from resspect import DataBase - >>> path_to_features_file = 'results/Bazin.dat' + >>> path_to_features_file = 'results/Bazin.csv' >>> data = DataBase() >>> data.load_features(path_to_features_file, method='Bazin', screen=True) @@ -87,9 +87,9 @@ In interactive mode, you must define the required variables and use the :py:mod: >>> method = 'Bazin' # only option in v1.0 >>> ml = 'RandomForest' # classifier >>> strategy = 'RandomSampling' # learning strategy - >>> input_file = 'results/Bazin.dat' # input features file - >>> metric = 'results/metrics.dat' # output metrics file - >>> queried = 'results/queried.dat' # output query file + >>> input_file = 'results/Bazin.csv' # input features file + >>> metric = 'results/metrics.csv' # output metrics file + >>> queried = 'results/queried.csv' # output query file >>> train = 'original' # initial training >>> batch = 1 # size of batch @@ -128,12 +128,12 @@ following the same algorithm described in `Ishida et al., 2019 >> path_to_features_dir = 'results/time_domain/' # folder where the files for each day are stored >>> output_metrics_file = 'results/metrics_' + strategy + '_' + str(training) + \ - '_batch' + str(batch) + '.dat' # output results for metrics + '_batch' + str(batch) + '.csv' # output results for metrics >>> output_query_file = 'results/queried_' + strategy + '_' + str(training) + \ - '_batch' + str(batch) + '.dat' # output query sample + '_batch' + str(batch) + '.csv' # output query sample >>> path_to_ini_files = {} - >>> path_to_ini_files['train'] = 'results/Bazin.dat' # features from full light curves for initial training sample + >>> path_to_ini_files['train'] = 'results/Bazin.csv' # features from full light curves for initial training sample >>> survey='DES' >>> classifier = 'RandomForest' @@ -141,7 +141,7 @@ following the same algorithm described in `Ishida et al., 2019 >> feature_method = 'Bazin' >>> screen = False # if True will print many intermediate steps for debuging - >>> fname_pattern = ['day_', '.dat'] # pattern on filename where different days of the survey are stored + >>> fname_pattern = ['day_', '.csv'] # pattern on filename where different days of the survey are stored >>> queryable= True # if True, check brightness before considering an object queryable @@ -177,4 +177,39 @@ The result will be something like the plot below (accounting for 
variations due Separate samples and Telescope resources ---------------------------------------- -Beyond the simple learning loop described above, `resspect` also handdles a few batch strategies which take into account the available telescope time for spectroscopic follow-up... TBC +In a realistic situation, you might like to consider a more complex experiment design. For example, using a fixed validation sample and taking into account the time evolution of the transient and available resources for spectroscopic follow-up. + +The RESSPECT reported an extensive study which takes into account many of the caveats related to realistic astronomical observations. The full report can be found at `Kennamer et al., 2020 `_. + +In following the procedure described in `Kennamer et al., 2020 `_, the first step is to separate objects into `train`, `test`, `validation` and `query` samples. + +.. code-block:: python + :linenos: + + >>> from resspect import sep_samples + >>> from resspect import read_features_fullLC_samples + + >>> # user input + >>> path_to_features = 'results/Bazin.csv' + >>> output_dir = 'results/' # output directory where files will be saved + >>> n_test_val = 1000 # number of objects in each sample: test and validation + >>> n_train = 1500 # number of objects to be separated for training + >>> # this should be big enough to allow tests according + >>> # to multiple initial conditions + + >>> # read data and separate samples + >>> all_data = pd.read_csv(path_to_features, index_col=False) + >>> samples = sep_samples(all_data['id'].values, n_test_val=n_test_val, + >>> n_train=n_train) + + >>> # read features and save them to separate files + >>> for sample in samples.keys(): + >>> output_fname = output_dir + sample + '_bazin.csv' + >>> read_features_fullLC_samples(samples[sample], output_fname, + path_to_features) + + +This will save samples to individual files. From these, only the `query` sample needs to be prepared for time domain. + + + diff --git a/resspect/time_domain_loop.py b/resspect/time_domain_loop.py index 312336c4..779ff5d2 100644 --- a/resspect/time_domain_loop.py +++ b/resspect/time_domain_loop.py @@ -453,7 +453,7 @@ def _save_metrics_and_queried_sample( '_' + str(current_loop) + '.csv') database_class.save_queried_sample( output_queried_file_name, loop=current_loop, - full_sample=is_save_full_query, epoch=epoch) + full_sample=is_save_full_query, epoch=epoch, batch=batch) def _load_next_day_data( From edd1fe33c0310a1cbb37309bd522e91fa4c32d58 Mon Sep 17 00:00:00 2001 From: emilleishida Date: Fri, 28 Jul 2023 20:57:15 +0200 Subject: [PATCH 06/12] refactor loop for identifying queries. 
By Etienne --- resspect/query_strategies.py | 49 ++++++------------------------------ 1 file changed, 7 insertions(+), 42 deletions(-) diff --git a/resspect/query_strategies.py b/resspect/query_strategies.py index 4fd8ff03..9801d7f6 100644 --- a/resspect/query_strategies.py +++ b/resspect/query_strategies.py @@ -115,12 +115,7 @@ def uncertainty_sampling(class_prob: np.array, test_ids: np.array, order = dist.argsort() # only allow objects in the query sample to be chosen - flag = [] - for item in order: - if test_ids[item] in queryable_ids: - flag.append(True) - else: - flag.append(False) + flag = list(pd.Series(data=test_ids[order]).isin(queryable_ids)) # check if there are queryable objects within threshold indx = int(len(flag) * query_thre) @@ -191,12 +186,7 @@ def random_sampling(test_ids: np.array, queryable_ids: np.array, if queryable: # flag only the queryable objects - flag = [] - for item in indx: - if test_ids[item] in queryable_ids: - flag.append(True) - else: - flag.append(False) + flag = list(pd.Series(data=test_ids[order]).isin(queryable_ids)) ini_index = flag.index(True) @@ -267,12 +257,7 @@ def uncertainty_sampling_entropy(class_prob: np.array, test_ids: np.array, order = entropies.argsort()[::-1] # only allow objects in the query sample to be chosen - flag = [] - for item in order: - if test_ids[item] in queryable_ids: - flag.append(True) - else: - flag.append(False) + flag = list(pd.Series(data=test_ids[order]).isin(queryable_ids)) # check if there are queryable objects within threshold indx = int(len(flag) * query_thre) @@ -338,12 +323,7 @@ def uncertainty_sampling_least_confident(class_prob: np.array, test_ids: np.arra order = prob_predicted_class.argsort() # only allow objects in the query sample to be chosen - flag = [] - for item in order: - if test_ids[item] in queryable_ids: - flag.append(True) - else: - flag.append(False) + flag = list(pd.Series(data=test_ids[order]).isin(queryable_ids)) # check if there are queryable objects within threshold indx = int(len(flag) * query_thre) @@ -409,12 +389,7 @@ def uncertainty_sampling_margin(class_prob: np.array, test_ids: np.array, order = margin.argsort() # only allow objects in the query sample to be chosen - flag = [] - for item in order: - if test_ids[item] in queryable_ids: - flag.append(True) - else: - flag.append(False) + flag = list(pd.Series(data=test_ids[order]).isin(queryable_ids)) # check if there are queryable objects within threshold indx = int(len(flag) * query_thre) @@ -481,12 +456,7 @@ def qbd_mi(ensemble_probs: np.array, test_ids: np.array, order = mis.argsort()[::-1] # only allow objects in the query sample to be chosen - flag = [] - for item in order: - if test_ids[item] in queryable_ids: - flag.append(True) - else: - flag.append(False) + flag = list(pd.Series(data=test_ids[order]).isin(queryable_ids)) # check if there are queryable objects within threshold indx = int(len(flag) * query_thre) @@ -553,12 +523,7 @@ def qbd_entropy(ensemble_probs: np.array, test_ids: np.array, order = entropies.argsort()[::-1] # only allow objects in the query sample to be chosen - flag = [] - for item in order: - if test_ids[item] in queryable_ids: - flag.append(True) - else: - flag.append(False) + flag = list(pd.Series(data=test_ids[order]).isin(queryable_ids)) # check if there are queryable objects within threshold indx = int(len(flag) * query_thre) From 2b969ac2c610b411b3c1d09833ee02be800e612f Mon Sep 17 00:00:00 2001 From: emilleishida Date: Fri, 28 Jul 2023 21:04:56 +0200 Subject: [PATCH 07/12] update docs for learn loop 
time domain --- docs/index.rst | 18 ++++-------------- docs/learn_loop.rst | 32 +++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index eb742bdb..ee4b2f19 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -48,21 +48,12 @@ Next, clone this repository in another chosen location: (resspect) >>> git clone https://github.com/COINtoolbox/resspect -Navigate to the repository folder and do - -.. code-block:: bash - - (resspect) >>> pip install -r requirements.txt - - -You can now install this package with: +Navigate to the repository folder and you can now install this package with: .. code-block:: bash (resspect) >>> pip install -e . -.. hint:: You may choose to create your virtual environment within the folder of the repository. If you choose to do this, you must remember to exclude the virtual environment directory from version control using e.g., ``.gitignore``. - Setting up a working directory ------------------------------ @@ -91,9 +82,6 @@ This data was provided by Rick Kessler, after the publication of results from th It allows you to run tests and validate your installation. -Data for the RESSPECT project can be found in the COIN server. Check the minutes document for the module you are interested in for information about the exact location. - - Analysis steps ============== @@ -163,7 +151,9 @@ Acknowledgements This work is part of the Recommendation System for Spectroscopic Followup (RESSPECT) project, governed by an inter-collaboration agreement signed between the `Cosmostatistics Initiative (COIN) `_ and the `LSST Dark Energy Science Collaboration (DESC) `_. -The `COsmostatistics INitiative (COIN) `_ receives financial support from `CNRS `_ as part of its MOMENTUM programme over the 2018-2020 period, under the project *Active Learning for Large Scale Sky Surveys*. +The `COsmostatistics INitiative (COIN) `_ is an international network of researchers whose goal is to foster interdisciplinarity inspired by Astronomy. + +COIN received financial support from `CNRS `_ for the development of this project, as part of its MOMENTUM programme over the 2018-2020 period, under the project *Active Learning for Large Scale Sky Surveys*. This work would not be possible without intensive consultation to online platforms and discussion forums. Although it is not possible to provide a complete list of the open source diff --git a/docs/learn_loop.rst b/docs/learn_loop.rst index 53a5258b..d7131701 100644 --- a/docs/learn_loop.rst +++ b/docs/learn_loop.rst @@ -209,7 +209,37 @@ In following the procedure described in `Kennamer et al., 2020 `. Once that is done, there is only a few inputs that needs to be changed in the last call of the `time_domain_loop` function. + +.. code-block:: python + :linenos: + + >>> sep_files = True + >>> batch = None # use telescope time budgets instead of fixed number of queries per loop + >>> budgets = (6. * 3600, 6. 
* 3600) # budget of 6 hours per night of observation + + >>> path_to_features_dir = 'results/time_domain/' # this is the path to the directory where the query sample + # processed for time domain is stored + + >>> path_to_ini_files = {} + >>> path_to_ini_files['train'] = 'results/train_bazin.csv' + >>> path_to_ini_files['test'] = 'results/test_bazin.csv' + >>> path_to_ini_files['validation'] = 'results/val_bazin.csv' + + + >>> # run time domain loop + >>> time_domain_loop(days=days, output_metrics_file=output_diag_file, + output_queried_file=output_query_file, + path_to_ini_files=path_to_ini_files, + path_to_features_dir=path_to_features_dir, + strategy=strategy, fname_pattern=fname_pattern, + batch=batch, classifier=classifier, + sep_files=sep_files, budgets=budgets, + screen=screen, initial_training=training, + survey=survey, queryable=queryable, n_estimators=n_estimators) + + + From 74ba68d87f4657e248429c12c429ec5359e4a0a0 Mon Sep 17 00:00:00 2001 From: emilleishida Date: Sat, 29 Jul 2023 00:20:52 +0200 Subject: [PATCH 08/12] fix loop for random sampling --- resspect/query_strategies.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/resspect/query_strategies.py b/resspect/query_strategies.py index 9801d7f6..9356736d 100644 --- a/resspect/query_strategies.py +++ b/resspect/query_strategies.py @@ -24,6 +24,7 @@ 'qbd_entropy'] import numpy as np +import pandas as pd def compute_entropy(ps: np.array): @@ -185,8 +186,9 @@ def random_sampling(test_ids: np.array, queryable_ids: np.array, replace=False) if queryable: + # flag only the queryable objects - flag = list(pd.Series(data=test_ids[order]).isin(queryable_ids)) + flag = list(pd.Series(data=test_ids[indx]).isin(queryable_ids)) ini_index = flag.index(True) From d38430aa356fa3b4ad10c480210835342d2b642d Mon Sep 17 00:00:00 2001 From: emilleishida Date: Sat, 29 Jul 2023 00:46:17 +0200 Subject: [PATCH 09/12] no idea why notebook works but script does not --- resspect/scripts/run_time_domain.py | 111 +++++++++++++++++++++------- 1 file changed, 86 insertions(+), 25 deletions(-) diff --git a/resspect/scripts/run_time_domain.py b/resspect/scripts/run_time_domain.py index e46d6375..e1b45038 100644 --- a/resspect/scripts/run_time_domain.py +++ b/resspect/scripts/run_time_domain.py @@ -22,6 +22,14 @@ from resspect.time_domain_loop import time_domain_loop +def str2none(v): + if v == None or v.isdigit() or v == 'original' or int(v).isdigit(): + return v + elif str(v) == 'None': + return None + elif isinstance(v, list): + return ([int(v[0]), int(v[1])]) + def run_time_domain(user_choice): """Command line interface to the Time Domain Active Learning scenario. @@ -38,24 +46,50 @@ def run_time_domain(user_choice): Complete path to directory holding features files for all days. -s: str Query strategy. Options are 'UncSampling' and 'RandomSampling'. - -b: int (optional) - Size of batch to be queried in each loop. Default is 1. + -b: int or None (optional) + If int, size of batch to be queried in each loop. + If None, use budget of available telescope time to select batches. + Default is 1. + -bd: tuple or list of tuples (optional) + Each element of the tuple represents available time in one spectroscopic telescope. + Only used if -b == None. Default is None. -c: str (optional) Machine Learning algorithm. Currently 'RandomForest', 'GradientBoostedTrees' 'K-NNclassifier' and 'MLPclassifier' are implemented. - -fm: str (optional) - Feature extraction method. Currently only 'Bazin' is implemented. 
+ -fl: file containing full light curve features to train on + if -t original is chosen. + -fp: list of str (optional) + List of strings containing pattern for file name with features for each day. + Default is ['day_', '.csv'] + -it: str (optional) + Path to initial training file. Only used if -sp == True. + Default is None. + -n: int (optional) + Number of estimators (trees in the forest). + Only used if classifier == 'RandomForest'. Default is 1000. + -pv: str (optional) + Path to validation file. Only used if -sp == True. + Default is False. + -pt: str (optional) + Path to test file. Only used if -sp == True. + Default is None. + -qb: bool (optional) + If True, consider the mag of the object at the moment of query. + Default is True. -sc: bool (optional) If True, display comment with size of samples on screen. + -sf: bool (optional) + If True, it assumes samples are stored in separate files. + Default is False. + -sv: str (optional) + Survey. Options are 'DES' or 'LSST'. Default is 'DES'. -t: str or int Choice of initial training sample. If 'original': begin from the train sample flagged in the file If int: choose the required number of samples at random, ensuring that at least half are SN Ia Default is 'original'. - -fl: file containing full light curve features to train on - if -t original is chosen. Returns ------- @@ -77,27 +111,44 @@ def run_time_domain(user_choice): """ # set parameters - days = user_choice.days + days = [int(user_choice.days[0]), int(user_choice.days[1])] output_metrics_file = user_choice.metrics output_query_file = user_choice.queried path_to_features_dir = user_choice.features_dir strategy = user_choice.strategy + training = str2none(user_choice.training) + queryable = user_choice.queryable - batch = user_choice.batch + batch = str2none(user_choice.batch) classifier = user_choice.classifier - feature_method = user_choice.feat_method - path_to_full_lc_features = user_choice.full_features screen = user_choice.screen - training = user_choice.training + fname_pattern = user_choice.fname_pattern + n_estimators = user_choice.n_estimators + + sep_files = user_choice.sep_files + budgets = str2none(user_choice.budgets) + survey = user_choice.survey + + path_to_ini_files = {} + path_to_ini_files['train'] = user_choice.full_features + path_to_ini_files['validation'] = user_choice.val + path_to_ini_files['test'] = user_choice.test + + print(user_choice) + print(path_to_ini_files) + print('sep_files = ', sep_files) + print('initial_training = ', training) # run time domain loop time_domain_loop(days=days, output_metrics_file=output_metrics_file, - output_queried_file=output_query_file, - path_to_features_dir=path_to_features_dir, - strategy=strategy, batch=batch, classifier=classifier, - features_method=feature_method, - path_to_full_lc_features=path_to_full_lc_features, - screen=screen, training=training) + output_queried_file=output_query_file, + path_to_ini_files=path_to_ini_files, + path_to_features_dir=path_to_features_dir, + strategy=strategy, fname_pattern=fname_pattern, + batch=batch, classifier=classifier, + sep_files=sep_files, budgets=budgets, + screen=screen, initial_training=training, + survey=survey, queryable=queryable, n_estimators=n_estimators) def str2bool(v): @@ -139,15 +190,9 @@ def main(): help='Classifier. Currently only accepts ' '"RandomForest", "GradientBoostedTrees",' ' "K-NNclassifier" and "MLPclassifier".') - parser.add_argument('-fm', '--feature-method', dest='feat_method', - required=False, default='Bazin', - help='Feature extraction method. 
Currently only accepts ' - '"Bazin".') parser.add_argument('-fl', '--full-light-curve', dest='full_features', required=False, - default=' ', help='Path to full light curve features.' - 'Only used if ' - '"training==original".') + default=' ', help='Path to full light curve features for initial training.') parser.add_argument('-sc', '--screen', dest='screen', required=False, default=True, type=str2bool, help='If True, display size info on training and ' @@ -156,7 +201,23 @@ def main(): default='original', help='Choice of initial training' 'sample. It can be "original"' 'or an integer.') - + parser.add_argument('-sf', '--sep-files', dest='sep_files', required=False, type=str2bool, + default=False, help='If True, assumes samples are given in separate files.' + \ + ' Default is False.') + parser.add_argument('-bd', '--budgets', dest='budgets', required=False, default=None, + help='List of available telescope resources for querying.', nargs='+') + parser.add_argument('-pv', '--path-validation', dest='val', required=False, default=None, + help='Path to validation sample. Only used if sep_files=True.') + parser.add_argument('-pt', '--path-test', dest='test', required=False, default=None, + help='Path to test sample. Only used if sep_files == True.') + parser.add_argument('-fp', '--fname-pattern', dest='fname_pattern', required=False, default=['day_', '.csv'], + help='Filename pattern for time domain.') + parser.add_argument('-sv', '--survey', dest='survey', required=False, default='DES', + help='Survey. Options are "DES" or "LSST". Default is "DES".') + parser.add_argument('-qb', '--queryable', dest='queryable', required=False, default=True, + help='If True consider mag of the object at the moment of query.') + parser.add_argument('-n', '--n-estimators', dest='n_estimators', required=False, default=1000, + help='Number of trees in Random Forest.') from_user = parser.parse_args() From 963634c723f6ae5651ff377423a07706fa7ea32b Mon Sep 17 00:00:00 2001 From: emilleishida Date: Sat, 29 Jul 2023 03:37:21 +0200 Subject: [PATCH 10/12] update version call --- resspect/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/resspect/__init__.py b/resspect/__init__.py index a068f197..07f60f01 100644 --- a/resspect/__init__.py +++ b/resspect/__init__.py @@ -44,6 +44,11 @@ from .query_budget_strategies import * from .bump import * +import importlib.metadata + +__version__ = importlib.metadata.version("resspect") + + __all__ = ['accuracy', 'assign_cosmo', 'bazin', From 40e4623e26792ad451d27320ce9c9a635ebda9c4 Mon Sep 17 00:00:00 2001 From: emilleishida Date: Sat, 29 Jul 2023 18:58:19 +0200 Subject: [PATCH 11/12] fix time domain --- docs/learn_loop.rst | 11 ++++- resspect/scripts/run_time_domain.py | 66 ++++++++++++++++++++--------- 2 files changed, 56 insertions(+), 21 deletions(-) diff --git a/docs/learn_loop.rst b/docs/learn_loop.rst index d7131701..0e44431f 100644 --- a/docs/learn_loop.rst +++ b/docs/learn_loop.rst @@ -238,8 +238,15 @@ This will save samples to individual files. From these, only the `query` sample screen=screen, initial_training=training, survey=survey, queryable=queryable, n_estimators=n_estimators) +The same result can be achieved using the command line using the `run_time_domain` script: - - +.. code-block:: bash + :linenos: + >>> run_time_domain -d + >>> -m -q -f + >>> -s -t + >>> -fl -pv -pt + +.. warning:: Make sure you check the values of the optional variables as well! 
\ No newline at end of file diff --git a/resspect/scripts/run_time_domain.py b/resspect/scripts/run_time_domain.py index e1b45038..5d509496 100644 --- a/resspect/scripts/run_time_domain.py +++ b/resspect/scripts/run_time_domain.py @@ -23,12 +23,25 @@ def str2none(v): - if v == None or v.isdigit() or v == 'original' or int(v).isdigit(): + """Transform a given variable into None or str. + + Parameters + ---------- + v: str or None or int + + Returns + ------- + None or str or int + """ + + if v == None or v == 'original': return v - elif str(v) == 'None': + elif str(v) == 'None' or str(v[0]) == 'None': return None elif isinstance(v, list): return ([int(v[0]), int(v[1])]) + elif int(str(v)) > 0: + return int(v) def run_time_domain(user_choice): """Command line interface to the Time Domain Active Learning scenario. @@ -57,6 +70,8 @@ def run_time_domain(user_choice): Machine Learning algorithm. Currently 'RandomForest', 'GradientBoostedTrees' 'K-NNclassifier' and 'MLPclassifier' are implemented. + -cn: bool (optional) + If True, this concerns the canonical sample. Default is False. -fl: file containing full light curve features to train on if -t original is chosen. -fp: list of str (optional) @@ -109,8 +124,8 @@ def run_time_domain(user_choice): Be aware to check the default options as well! """ - - # set parameters + """ + # set parameters""" days = [int(user_choice.days[0]), int(user_choice.days[1])] output_metrics_file = user_choice.metrics output_query_file = user_choice.queried @@ -119,26 +134,23 @@ def run_time_domain(user_choice): training = str2none(user_choice.training) queryable = user_choice.queryable + batch = str2none(user_choice.batch) classifier = user_choice.classifier screen = user_choice.screen fname_pattern = user_choice.fname_pattern n_estimators = user_choice.n_estimators - + sep_files = user_choice.sep_files budgets = str2none(user_choice.budgets) survey = user_choice.survey - + canonical = user_choice.canonical + path_to_ini_files = {} path_to_ini_files['train'] = user_choice.full_features path_to_ini_files['validation'] = user_choice.val - path_to_ini_files['test'] = user_choice.test - - print(user_choice) - print(path_to_ini_files) - print('sep_files = ', sep_files) - print('initial_training = ', training) - + path_to_ini_files['test'] = user_choice.test + # run time domain loop time_domain_loop(days=days, output_metrics_file=output_metrics_file, output_queried_file=output_query_file, @@ -148,10 +160,21 @@ def run_time_domain(user_choice): batch=batch, classifier=classifier, sep_files=sep_files, budgets=budgets, screen=screen, initial_training=training, - survey=survey, queryable=queryable, n_estimators=n_estimators) + survey=survey, queryable=queryable, n_estimators=n_estimators, + canonical=canonical) def str2bool(v): + """Convert str into bool. + + Parameters + ---------- + v: str or bool + + Returns + ------- + bool + """ if isinstance(v, bool): return v if v.lower() in ('yes', 'true', 't', 'y', '1', 'True', 'TRUE'): @@ -163,10 +186,10 @@ def str2bool(v): def main(): - # get input directory and output file name from user parser = argparse.ArgumentParser(description='resspect - ' 'Time Domain loop module') + parser.add_argument('-d', '--days', dest='days', required=True, help='First and last day of survey.', nargs='+') @@ -210,19 +233,24 @@ def main(): help='Path to validation sample. Only used if sep_files=True.') parser.add_argument('-pt', '--path-test', dest='test', required=False, default=None, help='Path to test sample. 
Only used if sep_files == True.') - parser.add_argument('-fp', '--fname-pattern', dest='fname_pattern', required=False, default=['day_', '.csv'], + parser.add_argument('-fp', '--fname-pattern', dest='fname_pattern', required=False, nargs='+', + default=['day_', '.csv'], help='Filename pattern for time domain.') parser.add_argument('-sv', '--survey', dest='survey', required=False, default='DES', help='Survey. Options are "DES" or "LSST". Default is "DES".') parser.add_argument('-qb', '--queryable', dest='queryable', required=False, default=True, + type=str2bool, help='If True consider mag of the object at the moment of query.') parser.add_argument('-n', '--n-estimators', dest='n_estimators', required=False, default=1000, - help='Number of trees in Random Forest.') - + help='Number of trees in Random Forest.', type=int) + parser.add_argument('-cn', '--canonical', dest='canonical', required=False, default=False, + type=str2bool, + help='If True this concerns the canonical sample. Default is False.') + from_user = parser.parse_args() run_time_domain(from_user) - + if __name__ == '__main__': main() From 6a0e8435067acfc08b32dffaee6468520220275b Mon Sep 17 00:00:00 2001 From: emilleishida Date: Sat, 29 Jul 2023 20:37:14 +0200 Subject: [PATCH 12/12] fix docs for time domain --- docs/learn_loop.rst | 68 ++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/docs/learn_loop.rst b/docs/learn_loop.rst index 0e44431f..55c26094 100644 --- a/docs/learn_loop.rst +++ b/docs/learn_loop.rst @@ -101,7 +101,7 @@ Alternatively you can also run everything from the command line: .. code-block:: bash - >>> run_loop.py -i -b -n + >>> run_loop -i -b -n >>> -m -q >>> -s -t @@ -119,37 +119,52 @@ following the same algorithm described in `Ishida et al., 2019 >> from resspect import time_domain_loop - >>> days = [20, 180] # first and last day of the survey to be considered - >>> training = 'original' # if int take int number of objects for initial training, 50% being Ia + >>> days = [20, 180] # first and last day of the survey + >>> training = 'original' # if int take int number of objs + # for initial training, 50% being Ia + >>> strategy = 'UncSampling' # learning strategy - >>> batch = 1 # if int, ignore cost per observation, if None find optimal batch size - >>> sep_files = False # if True, expects train, test and validation samples in separate filess + >>> batch = 1 # if int, ignore cost per observation, + # if None find optimal batch size + + >>> sep_files = False # if True, expects train, test and + # validation samples in separate filess >>> path_to_features_dir = 'results/time_domain/' # folder where the files for each day are stored - + + >>> # output results for metrics >>> output_metrics_file = 'results/metrics_' + strategy + '_' + str(training) + \ - '_batch' + str(batch) + '.csv' # output results for metrics + '_batch' + str(batch) + '.csv' + + >>> # output query sample >>> output_query_file = 'results/queried_' + strategy + '_' + str(training) + \ - '_batch' + str(batch) + '.csv' # output query sample + '_batch' + str(batch) + '.csv' >>> path_to_ini_files = {} - >>> path_to_ini_files['train'] = 'results/Bazin.csv' # features from full light curves for initial training sample + + >>> # features from full light curves for initial training sample + >>> path_to_ini_files['train'] = 'results/Bazin.csv' >>> survey='DES' - + >>> classifier = 'RandomForest' >>> n_estimators = 1000 # number of trees in the forest >>> feature_method = 'Bazin' - >>> screen = 
False # if True will print many intermediate steps for debuging - >>> fname_pattern = ['day_', '.csv'] # pattern on filename where different days of the survey are stored - >>> queryable= True # if True, check brightness before considering an object queryable + >>> screen = False # if True will print many things for debuging + >>> fname_pattern = ['day_', '.csv'] # pattern on filename where different days + # are stored + + >>> queryable= True # if True, check brightness before considering + # an object queryable >>> # run time domain loop >>> time_domain_loop(days=days, output_metrics_file=output_metrics_file, - >>> output_queried_file=output_query_file, path_to_ini_files=path_to_ini_files, + >>> output_queried_file=output_query_file, + >>> path_to_ini_files=path_to_ini_files, >>> path_to_features_dir=path_to_features_dir, - >>> strategy=strategy, fname_pattern=fname_pattern, batch=batch, classifier=classifier, + >>> strategy=strategy, fname_pattern=fname_pattern, batch=batch, + >>> classifier=classifier, >>> sep_files=sep_files, budgets=budgets, >>> screen=screen, initial_training=training, >>> survey=survey, queryable=queryable, n_estimators=n_estimators) @@ -215,10 +230,13 @@ This will save samples to individual files. From these, only the `query` sample :linenos: >>> sep_files = True - >>> batch = None # use telescope time budgets instead of fixed number of queries per loop + >>> batch = None # use telescope time budgets instead of + # fixed number of queries per loop + >>> budgets = (6. * 3600, 6. * 3600) # budget of 6 hours per night of observation - >>> path_to_features_dir = 'results/time_domain/' # this is the path to the directory where the query sample + >>> path_to_features_dir = 'results/time_domain/' # this is the path to the directory + # where the pool sample # processed for time domain is stored >>> path_to_ini_files = {} @@ -229,14 +247,14 @@ This will save samples to individual files. From these, only the `query` sample >>> # run time domain loop >>> time_domain_loop(days=days, output_metrics_file=output_diag_file, - output_queried_file=output_query_file, - path_to_ini_files=path_to_ini_files, - path_to_features_dir=path_to_features_dir, - strategy=strategy, fname_pattern=fname_pattern, - batch=batch, classifier=classifier, - sep_files=sep_files, budgets=budgets, - screen=screen, initial_training=training, - survey=survey, queryable=queryable, n_estimators=n_estimators) + >>> output_queried_file=output_query_file, + >>> path_to_ini_files=path_to_ini_files, + >>> path_to_features_dir=path_to_features_dir, + >>> strategy=strategy, fname_pattern=fname_pattern, + >>> batch=batch, classifier=classifier, + >>> sep_files=sep_files, budgets=budgets, + >>> screen=screen, initial_training=training, + >>> survey=survey, queryable=queryable, n_estimators=n_estimators) The same result can be achieved using the command line using the `run_time_domain` script:
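One possible call, assembled from the flags defined in `run_time_domain.py` above; all paths are placeholders and the 21600 s budgets correspond to the six hours per telescope used earlier in this section:

.. code-block:: bash

    >>> run_time_domain -d 20 180 -m results/metrics.csv -q results/queried.csv
    >>> -f results/time_domain/ -s UncSampling -b None -t original
    >>> -fl results/train_bazin.csv -pv results/val_bazin.csv
    >>> -pt results/test_bazin.csv -sf True -bd 21600 21600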