From 152875f3462ca597baf94c30384f5042db82d0f6 Mon Sep 17 00:00:00 2001
From: jsadler2
Date: Tue, 13 Dec 2022 13:24:37 -0600
Subject: [PATCH 1/6] [#179] mv it fxns to 2a_model/src

---
 .../src/do_it_functions.py                    |  0
 .../src/it_functions.py                       |  0
 .../Snakefile_functional_performance.smk      | 31 -------------------
 3 files changed, 31 deletions(-)
 rename {scratch/Functional_Performance => 2a_model}/src/do_it_functions.py (100%)
 rename {scratch/Functional_Performance => 2a_model}/src/it_functions.py (100%)
 delete mode 100644 scratch/Functional_Performance/Snakefile_functional_performance.smk

diff --git a/scratch/Functional_Performance/src/do_it_functions.py b/2a_model/src/do_it_functions.py
similarity index 100%
rename from scratch/Functional_Performance/src/do_it_functions.py
rename to 2a_model/src/do_it_functions.py
diff --git a/scratch/Functional_Performance/src/it_functions.py b/2a_model/src/it_functions.py
similarity index 100%
rename from scratch/Functional_Performance/src/it_functions.py
rename to 2a_model/src/it_functions.py
diff --git a/scratch/Functional_Performance/Snakefile_functional_performance.smk b/scratch/Functional_Performance/Snakefile_functional_performance.smk
deleted file mode 100644
index d4a9cad8..00000000
--- a/scratch/Functional_Performance/Snakefile_functional_performance.smk
+++ /dev/null
@@ -1,31 +0,0 @@
-import sys
-sys.path.insert(0, 'scratch/Functional_Performance/src')
-from do_it_functions import get_max_it_df
-
-
-models = ['0_baseline_LSTM','2_multitask_dense']
-replicate = 0
-sink = ['do_min', 'do_mean', 'do_max']
-source = 'srad'
-sites = 'all'
-
-rule calc_it_metrics_single_rep:
-    input:
-        #input data file
-        "../scratch/4_func_perf/in/results_tmmx_tmmn/med_obs_io.zarr",
-        #base file path to models
-        "../scratch/4_func_perf/in/results_tmmx_tmmn/models"
-    output:
-        "out/it_metrics_srad_all_sites_0.csv"
-    run:
-        get_max_it_df(input_file = input[0],
-                      models = models,
-                      base_file_path = input[1],
-                      output_file = output[0],
-                      replicate = replicate,
-                      sink = sink,
-                      source=source,
-                      sites = sites)
-
-

From 8da02e941104d9e4dc60b4b7f1b736698e94254e Mon Sep 17 00:00:00 2001
From: jsadler2
Date: Tue, 13 Dec 2022 14:51:00 -0600
Subject: [PATCH 2/6] [#179] functional perf for just one site/model/sink/src

---
 2a_model/src/do_it_functions.py | 319 ++++++++++++--------------------
 1 file changed, 118 insertions(+), 201 deletions(-)

diff --git a/2a_model/src/do_it_functions.py b/2a_model/src/do_it_functions.py
index 82d51dcc..097955ce 100644
--- a/2a_model/src/do_it_functions.py
+++ b/2a_model/src/do_it_functions.py
@@ -12,33 +12,49 @@ import math
import xarray as xr

-def calc_it_metrics_sites(inputs_df, source, sink, site, log_transform, models, replicate, base_file_path):
+def calc_it_metrics_site(inputs_zarr,
+                         predictions_file,
+                         source,
+                         sink,
+                         site,
+                         log_transform,
+                         model,
+                         replicate,
+                         outfile=None):
    '''
+    Calculate the transfer entropy (TE) and mutual information (MI) between
+    one input (source) and one output (sink) at one site for one replicate
+
    Parameters
    ----------
-    inputs_df : pandas dataframe
-        observed io data from zarr file
+    inputs_zarr : str
+        path to io zarr file
+    predictions_file : str
+        path to preds.feather file
    source : str
-        source for calculations (srad, tmmx, tmmn)
-    sink : list
-        sinks ['do_min', 'do_mean', 'do_max']
+        source for calculations (e.g., srad, tmmx, tmmn)
+    sink : str
+        sink for calculations (e.g., 'do_min', 'do_mean', 'do_max')
    site : str
        site number
    log_transform : boolean
        should the source variable be log10 transformed;
        only discharge should be log10 transformed
-    models: iterable (list or tuple)
-        the models for which you want do the calcs (e.g., ['0_baseline_LSTM','2_multitask_dense'])
+    model: str
+        the model for which you are doing the calcs (e.g., '0_baseline_LSTM', 'observed')
    replicate: int
-        which replicate you want to do the calcs for
-    base_file_path: str
-        filepath where the model results are
+        which replicate you are doing the calcs for
+    outfile: str
+        filepath to store the output (if desired)

    Returns
    -------
    Information theory metric results (TE and MI) as a dictionary
    '''
+    inputs = xr.open_zarr(inputs_zarr, consolidated=False)
+    inputs_df = inputs.to_dataframe()
+
+    # TODO: it'd be nice to read this in dynamically at some point
    inputs_site = inputs_df.loc[site][['CAT_BASIN_AREA', 'CAT_BASIN_SLOPE', 'CAT_CNPY11_BUFF100',
                                       'CAT_ELEV_MEAN', 'CAT_IMPV11', 'CAT_TWI', 'SLOPE',
                                       'day.length', 'depth', 'discharge', 'light_ratio',
                                       'model_confidence', 'pr', 'resolution', 'rmax', 'rmin', 'shortwave',
                                       'tmmx', 'velocity', 'vs']]

    targets_site = inputs_df.loc[site][['do_min','do_mean','do_max']]
-    if 'do_range' in sink:
+    if sink == 'do_range':
        targets_site['do_range'] = targets_site['do_max']-targets_site['do_min']

    tar_dict = {}
-    for m in models:
-        file_path = f'{base_file_path}/{m}/nstates_10/nep_100/rep_{replicate}/preds.feather'
-        model_preds = pd.read_feather(file_path)
-        model_preds = model_preds[model_preds['site_id'] == site].set_index('date')[['do_min','do_mean','do_max']]
-        model_preds['do_range'] = model_preds['do_max']-model_preds['do_min']
-        #create targets dictionary
-        tar_dict[m] = model_preds
+
+    model_preds = pd.read_feather(predictions_file)
+    model_preds = model_preds[model_preds['site_id'] == site].set_index('date')[['do_min','do_mean','do_max']]
+    model_preds['do_range'] = model_preds['do_max']-model_preds['do_min']
+    #create targets dictionary
+    if model != 'observed':
+        tar_dict[model] = model_preds

    tar_dict['observed'] = targets_site

    #create dictionary to store calculations in
-    #max_it = {'do_min': {}, 'do_mean': {}, 'do_max':{}, 'do_range':{}}
-    max_it = {s: {} for s in sink}
+    max_it = {}

    #create a nested dictionary for each DO variable to store it calcs
    #TE0 = Transfer Entropy at a time lag of 0, MI = mutual information,
    #TEmax is the maximum TE, TEmaxT is the time lag of the maximum TE,
    #TEmaxcrit is True/False for whether TEmax is significant; everything follows the same
    #convention for MI
-    for key in max_it.keys():
-        max_it[key] = {'model': [],'rmse':[],
-                       'TE0':[], 'TE1':[], 'TE2':[],
-                       'TE3':[], 'TE4':[], 'TE5':[],
-                       'TE6':[], 'TE7':[], 'TE8':[],
-                       'TEmax':[],'TEmaxt':[], 'TEmaxcrit':[],
-                       'MI0':[], 'MI1':[], 'MI2':[],
-                       'MI3':[], 'MI4':[], 'MI5':[],
-                       'MI6':[], 'MI7':[], 'MI8':[],
-                       'MImax':[],'MImaxt':[],'MImaxcrit':[]}
+    #join input and target to make sure they are aligned
+    site_inptar = inputs_site.join(tar_dict[model], rsuffix = '_pred')

-    for model in tar_dict.keys():
-        #join input and target to make sure they are aligned
-        site_inptar = inputs_site.join(tar_dict[model], rsuffix = '_pred')
-
-        #assign x = source, y = sink
-        x = site_inptar[source]
-        y = site_inptar[sink]
-
-        #for calculating rmse
-        obs_pred = tar_dict['observed'][sink].join(tar_dict[model][sink], rsuffix = '_pred')
-
-
-        #load the preprocessing fucntions from it_functions.py
-        ppf = it_functions.pre_proc_func()
-
-        if log_transform:
-            xl10 = ppf.log10(x)
-        else:
-            xl10 = x.copy()
-        x_rss = ppf.remove_seasonal_signal(xl10)
-        x_ss = ppf.standardize(x_rss)
-
-        y_prepped = {}
-        for snk in sink:
-            y_rss = ppf.remove_seasonal_signal(y[snk])
-            y_prepped[snk] = ppf.standardize(y_rss)
-            y_prepped[snk] = y_prepped[snk]
-
-
-        print('Calculating it metrics '+model+' '+site)
-        n_lags = 9
-        nbins = 11
-        it_dict = {}
-        for snk in sink:
-
-            #create an array of the prepped x and y variables
-            M = np.stack((x_ss,y_prepped[snk]), axis = 1)
-            #Mswap is for caclulating the TE from Y -> X, we don't really need to do that
-            #because DO doesn't affect solar radiation, but it is needed for function
-            Mswap = np.stack((y_prepped[snk], x_ss), axis = 1)
-            #x_bounds and y_bounds are for removing outliers
-            x_bounds = it_functions.find_bounds(M[:,0], 0.1, 99.9)
-            y_bounds = it_functions.find_bounds(M[:,1], 0.1, 99.9)
-            M_x_bound = np.delete(M, np.where((M[:,0] < x_bounds[0]*1.1) | (M[:,0] > x_bounds[1]*1.1)), axis = 0)
-            M_xy_bound = np.delete(M_x_bound, np.where((M_x_bound[:,1] < y_bounds[0]*1.1) | (M_x_bound[:,1] > y_bounds[1]*1.1)), axis = 0)
-
-            #calc it metrics and store in the dictionary it_dict
-            it_dict[snk] = it_functions.calc_it_metrics(M_xy_bound, Mswap, n_lags, nbins, calc_swap = False, alpha = 0.05, ncores = 7)
-
-
-        print('Storing it metrics '+model+' '+site)
-        #find the max TE and MI and the time lag at which the max occurs
-        #and store that in a dictionary as well
-        for snk in sink:
-
-            TEmax = max(it_dict[snk]['TE'])
-            TEmaxt = int(np.where(it_dict[snk]['TE'] == TEmax)[0])
-
-            if TEmax > it_dict[snk]['TEcrit'][TEmaxt]:
-                TEmaxcrit = True
-            else:
-                TEmaxcrit = False
-
-            MImax = max(it_dict[snk]['MI'])
-            MImaxt = int(np.where(it_dict[snk]['MI'] == MImax)[0])
-
-            if MImax > it_dict[snk]['MIcrit'][MImaxt]:
-                MImaxcrit = True
-            else:
-                MImaxcrit = False
-            #do min
-            max_it[snk]['model'].append(model)
-            mse = np.square(np.subtract(obs_pred[snk+'_pred'],obs_pred[snk])).mean()
-            math.sqrt(mse)
-            max_it[snk]['rmse'].append(math.sqrt(mse))
-
-            max_it[snk]['TEmax'].append(TEmax)
-            for i in range(9):
-                max_it[snk][f'TE{i}'].append(it_dict[snk]['TE'][i])
-            max_it[snk]['TEmaxt'].append(TEmaxt)
-            max_it[snk]['TEmaxcrit'].append(TEmaxcrit)
-
-            max_it[snk]['MImax'].append(MImax)
-            for i in range(9):
-                max_it[snk][f'MI{i}'].append(it_dict[snk]['MI'][i])
-            max_it[snk]['MImaxt'].append(MImaxt)
-            max_it[snk]['MImaxcrit'].append(MImaxcrit)
-
-    return max_it
-
-
-def diff_from_obs(df):
-    #function for calculating the difference in TE from modeled to observation
-    diff_df = df.iloc[:,1:11].sub(df.iloc[0,1:11], axis = 1)
-    diff_df['metric'] = df['metric']
-    diff_df['model'] = df['model']
-    return diff_df.iloc[1:5,:]
-
-def site_it_metrics(inputs_df, source, sink, sites, models, replicate, base_file_path):
-    #wrapper function for calculating the it metrics for each site
-    max_it_site = {}
-    for i,site in enumerate(sites):
-
-        print('----------------\n', site,' ', i+1, ' of ', len(sites),' sites')
-
-        max_it = calc_it_metrics_sites(inputs_df, source, sink, site, log_transform=False,
-                                       models = models, replicate = replicate, base_file_path = base_file_path)
-
-        max_it_site[site] = max_it
-
-    return max_it_site
-
-
-def get_max_it_df(input_file, models, base_file_path, output_file, replicate, sink,
-                  source='srad', sites = "all"):
-    '''
-    This function returns the functional performance (Transfer Entropy (TE) and
-    Mutual Information (MI)) for all models specified, all sinks specified, for
-    the specified source, and for _one_ replicate.
-
-    Writes a Pandas DataFrame to the output_file with columns:
-    `model,rmse,TE{0-8,max,maxt,maxcrit},MI{0-8,max,maxt,maxcrit},metric,site,rep_id`
-
-    Parameters
-    ----------
-    inputs_file : str
-        path to input zarr file
-    models : iterable (list or tuple)
-        the models for which you want to do the calcs (e.g., ['0_baseline_LSTM', '2_multitask_dense'])
-    base_file_path: str
-        filepath where the model results are (e.g., "2a_model/out/models/")
-    output_file: str
-        filepath where the results should be written, should be .csv
-    replicate : int
-        which replicate you want to do the calcs for
-    sink : list
-        sinks ['do_min', 'do_mean', 'do_max']
-    source : str
-        source for calculations (srad, tmmx, tmmn)
-    sites : chr or list
-        if "all" then it metrics are calculated for all sites, if a list is given then
-        calcs are made only for those sites
-
-    Returns
-    -------
-    nothing
-    '''
-    inputs = xr.open_zarr(input_file,consolidated=False)
-    inputs_df = inputs.to_dataframe()
+    #assign x = source, y = sink
+    x = site_inptar[source]
+    y = site_inptar[sink]
+
+    #for calculating rmse
+    obs_pred = tar_dict['observed'][[sink]].join(tar_dict[model][[sink]], rsuffix = '_pred')

-    if isinstance(sites, str):
-        assert sites=='all','sites can either be "all" for all sites or a list'
-        sites = inputs_df.index.unique('site_id')
-        sites = sites.drop(['014721254', '014721259'])
+
+    #load the preprocessing functions from it_functions.py
+    ppf = it_functions.pre_proc_func()
+
+    if log_transform:
+        xl10 = ppf.log10(x)
    else:
-        assert isinstance(sites,list),'sites can either be "all" for all sites or a list'
-
+        xl10 = x.copy()
+    x_rss = ppf.remove_seasonal_signal(xl10)
+    x_ss = ppf.standardize(x_rss)

-    max_it_site = site_it_metrics(inputs_df, source, sink, sites, models,
-                                  replicate, base_file_path)
-
-    do_all_sites_list = []
+    y_prepped = {}
+    y_rss = ppf.remove_seasonal_signal(y)
+    y_prepped = ppf.standardize(y_rss)

-    for site in sites:
-        max_it = max_it_site[site]
-        sink_dfs = []
-        for s in sink:
-            do_sink_df = pd.DataFrame(max_it[s])
-            do_sink_df['metric'] = s
-            sink_dfs.append(do_sink_df)
-        do_df = pd.concat(sink_dfs)
-        do_df['site'] = site
+    print('Calculating it metrics '+model+' '+site)
+    n_lags = 9
+    nbins = 11
+    it_dict = {}
+
+    #create an array of the prepped x and y variables
+    M = np.stack((x_ss,y_prepped), axis = 1)
+    #Mswap is for calculating the TE from Y -> X, we don't really need to do that
+    #because DO doesn't affect solar radiation, but it is needed by the function
+    Mswap = np.stack((y_prepped, x_ss), axis = 1)
+    #x_bounds and y_bounds are for removing outliers
+    x_bounds = it_functions.find_bounds(M[:,0], 0.1, 99.9)
+    y_bounds = it_functions.find_bounds(M[:,1], 0.1, 99.9)
+    M_x_bound = np.delete(M, np.where((M[:,0] < x_bounds[0]*1.1) | (M[:,0] > x_bounds[1]*1.1)), axis = 0)
+    M_xy_bound = np.delete(M_x_bound, np.where((M_x_bound[:,1] < y_bounds[0]*1.1) | (M_x_bound[:,1] > y_bounds[1]*1.1)), axis = 0)

-        do_all_sites_list.append(do_df)
+
+    #calc it metrics and store in the dictionary it_dict
+    it_dict = it_functions.calc_it_metrics(M_xy_bound, Mswap, n_lags, nbins, calc_swap = False, alpha = 0.05, ncores = 7)

-    do_all_sites = pd.concat(do_all_sites_list)
+
+    print('Storing it metrics '+model+' '+site)
+    #find the max TE and MI and the time lag at which the max occurs
+    #and store that in a dictionary as well

-    do_all_sites.to_csv(output_file)
+    TEmax = max(it_dict['TE'])
+    TEmaxt = int(np.where(it_dict['TE'] == TEmax)[0])

-
+    if TEmax > it_dict['TEcrit'][TEmaxt]:
+        TEmaxcrit = True
+    else:
+        TEmaxcrit = False
+
+    MImax = max(it_dict['MI'])
+    MImaxt = int(np.where(it_dict['MI'] == MImax)[0])
+
+    if MImax > it_dict['MIcrit'][MImaxt]:
+        MImaxcrit = True
+    else:
+        MImaxcrit = False
+
+    #store the summary metrics and metadata
+    max_it['model'] = model
+    mse = np.square(np.subtract(obs_pred[sink+'_pred'],obs_pred[sink])).mean()
+    max_it['rmse'] = math.sqrt(mse)
+
+    max_it['TEmax'] = TEmax
+    for i in range(9):
+        max_it[f'TE{i}'] = it_dict['TE'][i]
+    max_it['TEmaxt'] = TEmaxt
+    max_it['TEmaxcrit'] = TEmaxcrit
+
+    max_it['MImax'] = MImax
+    for i in range(9):
+        max_it[f'MI{i}'] = it_dict['MI'][i]
+    max_it['MImaxt'] = MImaxt
+    max_it['MImaxcrit'] = MImaxcrit
+    max_it['replicate'] = replicate
+    max_it['sink'] = sink
+    max_it['source'] = source
+    max_it['site'] = site
+
+    if outfile:
+        df = pd.DataFrame(max_it, index=[0])
+        df.to_csv(outfile, index=False)
+
+    return max_it
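For reference, a minimal sketch of a direct call to the refactored function. This is a hedged example: the zarr/feather paths and the site, source, and sink values below are illustrative, assembled from patterns that appear elsewhere in this series, not taken verbatim from the repo.

    from do_it_functions import calc_it_metrics_site

    # TE/MI between shortwave radiation ('srad') and minimum DO ('do_min')
    # for one site and one replicate of the baseline LSTM
    max_it = calc_it_metrics_site(
        inputs_zarr="2a_model/out/well_obs_io.zarr",  # illustrative path
        predictions_file="2a_model/out/models/0_baseline_LSTM/"
                         "nstates_10/nep_100/rep_0/preds.feather",  # illustrative path
        source="srad",
        sink="do_min",
        site="01480870",
        log_transform=False,   # only discharge should be log10 transformed
        model="0_baseline_LSTM",
        replicate=0,
        outfile=None,          # pass a .csv path to also write the row to disk
    )
    print(max_it["TEmax"], max_it["TEmaxt"], max_it["TEmaxcrit"])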
From e116862ce250b8ddf504869362521932912b1b88 Mon Sep 17 00:00:00 2001
From: jsadler2
Date: Tue, 13 Dec 2022 16:08:34 -0600
Subject: [PATCH 3/6] [#179] func perf files to snakemake

---
 2a_model/src/do_it_functions.py               |  2 +-
 2a_model/src/models/0_baseline_LSTM/Snakefile |  5 +-
 .../src/models/2_multitask_dense/Snakefile    |  3 +-
 2a_model/src/models/Snakefile_base.smk        | 62 +++++++++++++++++--
 4 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/2a_model/src/do_it_functions.py b/2a_model/src/do_it_functions.py
index 097955ce..b16ba341 100644
--- a/2a_model/src/do_it_functions.py
+++ b/2a_model/src/do_it_functions.py
@@ -55,7 +55,7 @@ def calc_it_metrics_site(inputs_zarr,
    inputs_df = inputs.to_dataframe()

    # TODO: it'd be nice to read this in dynamically at some point
-    inputs_site = inputs_df.loc[site][['CAT_BASIN_AREA', 'CAT_BASIN_SLOPE', 'CAT_CNPY11_BUFF100',
+    inputs_site = inputs_df.loc[site][['CAT_BASIN_SLOPE', 'CAT_CNPY11_BUFF100',
                                       'CAT_ELEV_MEAN', 'CAT_IMPV11', 'CAT_TWI', 'SLOPE',
                                       'day.length', 'depth', 'discharge', 'light_ratio',
                                       'model_confidence', 'pr', 'resolution', 'rmax', 'rmin', 'shortwave',
                                       'tmmx', 'velocity', 'vs']]
diff --git a/2a_model/src/models/0_baseline_LSTM/Snakefile b/2a_model/src/models/0_baseline_LSTM/Snakefile
index 954aae03..77e613cf 100644
--- a/2a_model/src/models/0_baseline_LSTM/Snakefile
+++ b/2a_model/src/models/0_baseline_LSTM/Snakefile
@@ -16,7 +16,10 @@ rule all:
               epochs=config['epochs'],
               rep=list(range(config['num_replicates'])),
               site_id=['01480870'],
-               year=[2012])
+               year=[2012]),
+        f"{out_dir}/{config['exp_name']}_func_perf.csv",
+        f"{out_dir}/observed_func_perf.csv"
+
+

module base_workflow:
diff --git a/2a_model/src/models/2_multitask_dense/Snakefile b/2a_model/src/models/2_multitask_dense/Snakefile
index 249f81a3..1280e885 100644
--- a/2a_model/src/models/2_multitask_dense/Snakefile
+++ b/2a_model/src/models/2_multitask_dense/Snakefile
@@ -16,7 +16,8 @@ rule all:
               epochs=config['epochs'],
               rep=list(range(config['num_replicates'])),
               site_id=['01480870'],
-               year=[2012])
+               year=[2012]),
+        f"{out_dir}/{config['exp_name']}_func_perf.csv"

module base_workflow:
diff --git a/2a_model/src/models/Snakefile_base.smk b/2a_model/src/models/Snakefile_base.smk
index 8836c4d2..d83741ab 100644
--- a/2a_model/src/models/Snakefile_base.smk
+++ b/2a_model/src/models/Snakefile_base.smk
@@ -1,11 +1,15 @@
import os
+import xarray as xr
import tensorflow as tf
import numpy as np
import pandas as pd
import sys

-code_dir = "../river-dl"
-sys.path.append(code_dir)
+river_dl_dir = "../river-dl"
+sys.path.append(river_dl_dir)
+
+src_dir = "../.."
+sys.path.append(src_dir)

from river_dl.preproc_utils import asRunConfig
from river_dl.preproc_utils import prep_all_data
@@ -14,6 +18,7 @@ from river_dl.postproc_utils import plot_obs, plot_ts, prepped_array_to_df
from river_dl.predict import predict_from_arbitrary_data
from river_dl.train import train_model
from river_dl import loss_functions as lf
+from do_it_functions import calc_it_metrics_site

out_dir = os.path.join(config['out_dir'], config['exp_name'])
loss_function = lf.multitask_rmse(config['lambdas'])
@@ -29,7 +34,7 @@ rule as_run_config:

rule prep_io_data:
    input:
-        f"../../../out/well_obs_io.zarr",
+        "../../../out/well_obs_io.zarr",
    output:
        "{outdir}/prepped.npz"
    run:
@@ -100,7 +105,7 @@ rule make_predictions:
    input:
        "{outdir}/prepped.npz",
        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/train_weights/",
-        f"../../../out/well_obs_io.zarr",
+        "../../../out/well_obs_io.zarr",
    output:
        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/preds.feather",
    run:
@@ -174,7 +179,7 @@ def get_grp_arg(wildcards):

rule combine_metrics:
    input:
-        f"../../../out/well_obs_io.zarr",
+        "../../../out/well_obs_io.zarr",
        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/trn_preds.feather",
        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/val_preds.feather",
        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/val_times_preds.feather"
@@ -226,3 +231,50 @@ rule plot_prepped_data:
                          partition=wildcards.partition)


+rule calc_functional_performance_one:
+    input:
+        "../../../out/well_obs_io.zarr",
+        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/preds.feather"
+    output:
+        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/func_perf/{site}-{src}-{snk}-{model}.csv"
+    run:
+        calc_it_metrics_site(input[0],
+                             input[1],
+                             wildcards.src,
+                             wildcards.snk,
+                             wildcards.site,
+                             log_transform=False,
+                             model=wildcards.model,
+                             replicate=wildcards.rep,
+                             outfile=output[0])
+
+
+def get_func_perf_sites():
+    input_file = "../../../out/well_obs_io.zarr"
+    inputs = xr.open_zarr(input_file, consolidated=False)
+    inputs_df = inputs.to_dataframe()
+
+    sites = inputs_df.index.unique('site_id')
+    sites = sites.drop(['014721254', '014721259'])
+    return sites
+
+
+rule gather_func_performances:
+    input:
+        expand("{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/func_perf/{site}-{src}-{snk}-{{model}}.csv",
+               outdir=out_dir,
+               nstates=config['hidden_size'],
+               epochs=config['epochs'],
+               rep=list(range(config['num_replicates'])),
+               site=get_func_perf_sites(),
+               src=['tmmx'],
+               snk=['do_min', 'do_mean', 'do_max'])
+    output:
+        "{outdir}/{model}_func_perf.csv"
+    run:
+        df_list = []
+        for in_file in input:
+            df = pd.read_csv(in_file, dtype={"site": str})
+            df_list.append(df)
+        df_comb = pd.concat(df_list)
+        df_comb.to_csv(output[0], index=False)
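A note on how the two new rules compose: calc_functional_performance_one produces one CSV per (site, source, sink, model, replicate) combination, and gather_func_performances requests all of them through expand() before concatenating. The same machinery produces observed_func_perf.csv, since 'observed' flows through the {model} wildcard. Below is a pure-Python sketch of the file list that expand() generates; all config values are illustrative stand-ins.

    from itertools import product

    # stand-ins for the Snakemake config (values illustrative)
    outdir = "2a_model/out/models/0_baseline_LSTM"
    nstates, epochs = 10, 100          # config['hidden_size'], config['epochs']
    reps = range(2)                    # range(config['num_replicates'])
    sites = ["01480870"]               # really comes from get_func_perf_sites()
    srcs = ["tmmx"]
    snks = ["do_min", "do_mean", "do_max"]
    model = "0_baseline_LSTM"

    paths = [
        f"{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}"
        f"/func_perf/{site}-{src}-{snk}-{model}.csv"
        for rep, site, src, snk in product(reps, sites, srcs, snks)
    ]
    # e.g., .../rep_0/func_perf/01480870-tmmx-do_min-0_baseline_LSTM.csv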
From 5a3a7c9cd3eb08c5b3cfaae4395d9c39fea93b7c Mon Sep 17 00:00:00 2001
From: jsadler2
Date: Wed, 21 Dec 2022 09:47:00 -0600
Subject: [PATCH 4/6] Snakemake only checks modified time

w/o this, snakemake retrains models unnecessarily
---
 2a_model.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/2a_model.R b/2a_model.R
index 50829fee..77970608 100644
--- a/2a_model.R
+++ b/2a_model.R
@@ -230,7 +230,7 @@ p2a_targets_list <- list(
      system(stringr::str_glue("snakemake -s {snakefile_path} --configfile {config_path} -j --touch --rerun-incomplete"))

      # then run the snakemake pipeline to produce the predictions and metric files
-      system(stringr::str_glue("snakemake -s {snakefile_path} --configfile {config_path} -j --rerun-incomplete "))
+      system(stringr::str_glue("snakemake -s {snakefile_path} --configfile {config_path} -j --rerun-incomplete --rerun-trigger mtime"))

      # print out the metrics file name for the target
      file.path("2a_model/out/models", p2a_model_ids$model_id, "exp_overall_metrics.csv")
From c21d80b353aaf5850932cf0b9ce2275bf6b9c22c Mon Sep 17 00:00:00 2001
From: jsadler2
Date: Wed, 21 Dec 2022 11:33:06 -0600
Subject: [PATCH 5/6] [#179] incorporate FP files in 2a_metrics_files target

---
 2a_model.R | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/2a_model.R b/2a_model.R
index 77970608..4b6b26de 100644
--- a/2a_model.R
+++ b/2a_model.R
@@ -191,12 +191,12 @@ p2a_targets_list <- list(
           config_path = stringr::str_remove(p2a_config_baseline_LSTM_yml, "2a_model/src/models/")),
      #the 1_ models use the same model and therefore the same Snakefile
      #as the 0_baseline_LSTM run
-      #list(model_id = "1_metab_multitask",
-      #snakefile_dir = "0_baseline_LSTM",
-      #config_path = stringr::str_remove(p2a_config_metab_multitask_yml, "2a_model/src/models/")),
-      #list(model_id = "1a_multitask_do_gpp_er",
-      #snakefile_dir = "0_baseline_LSTM",
-      #config_path = stringr::str_remove(p2a_config_1a_metab_multitask_yml, "2a_model/src/models/")),
+      list(model_id = "1_metab_multitask",
+           snakefile_dir = "0_baseline_LSTM",
+           config_path = stringr::str_remove(p2a_config_metab_multitask_yml, "2a_model/src/models/")),
+      list(model_id = "1a_multitask_do_gpp_er",
+           snakefile_dir = "0_baseline_LSTM",
+           config_path = stringr::str_remove(p2a_config_1a_metab_multitask_yml, "2a_model/src/models/")),
      list(model_id = "2_multitask_dense",
           snakefile_dir = "2_multitask_dense",
           config_path = stringr::str_remove(p2a_config_multitask_dense_yml, "2a_model/src/models/"))
@@ -232,8 +232,11 @@ p2a_targets_list <- list(
      # then run the snakemake pipeline to produce the predictions and metric files
      system(stringr::str_glue("snakemake -s {snakefile_path} --configfile {config_path} -j --rerun-incomplete --rerun-trigger mtime"))

-      # print out the metrics file name for the target
-      file.path("2a_model/out/models", p2a_model_ids$model_id, "exp_overall_metrics.csv")
+      # print out the FP and PP metrics file names for the target
+      c(
+        file.path("2a_model/out/models", p2a_model_ids$model_id, "exp_overall_metrics.csv"),
+        file.path("2a_model/out/models", p2a_model_ids$model_id, paste0(p2a_model_ids$model_id, "_func_perf.csv"))
+      )
    },
    format="file",
    pattern = map(p2a_model_ids)

From 24c4b9cfdaf59cf5f4062f1eba212a121630756c Mon Sep 17 00:00:00 2001
From: jsadler2
Date: Thu, 22 Dec 2022 14:45:45 -0600
Subject: [PATCH 6/6] [#179] add 'corr', 'TEcrit', 'MIcrit' to FP csv

---
 2a_model/src/do_it_functions.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/2a_model/src/do_it_functions.py b/2a_model/src/do_it_functions.py
index b16ba341..67c992da 100644
--- a/2a_model/src/do_it_functions.py
+++ b/2a_model/src/do_it_functions.py
@@ -159,14 +159,13 @@ def calc_it_metrics_site(inputs_zarr,
    max_it['rmse'] = math.sqrt(mse)

    max_it['TEmax'] = TEmax
-    for i in range(9):
-        max_it[f'TE{i}'] = it_dict['TE'][i]
    max_it['TEmaxt'] = TEmaxt
    max_it['TEmaxcrit'] = TEmaxcrit

    max_it['MImax'] = MImax
-    for i in range(9):
-        max_it[f'MI{i}'] = it_dict['MI'][i]
+    for variable in ['TE', 'TEcrit', 'MI', 'MIcrit', 'corr']:
+        for i in range(9):
+            max_it[f'{variable}{i}'] = it_dict[variable][i]
    max_it['MImaxt'] = MImaxt
    max_it['MImaxcrit'] = MImaxcrit
    max_it['replicate'] = replicate
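After PATCH 6, each row of a *_func_perf.csv file carries the lag-wise TE, TEcrit, MI, MIcrit, and corr values (lags 0 through 8) alongside the summary columns (model, rmse, TEmax, TEmaxt, TEmaxcrit, MImax, MImaxt, MImaxcrit, replicate, sink, source, site). A short sketch of reading a gathered file back; the path is illustrative and follows the {model}_func_perf.csv naming used by gather_func_performances.

    import pandas as pd

    df = pd.read_csv(
        "2a_model/out/models/0_baseline_LSTM/0_baseline_LSTM_func_perf.csv",
        dtype={"site": str},   # same dtype handling as gather_func_performances
    )
    row = df[(df["sink"] == "do_min") & (df["site"] == "01480870")].iloc[0]
    te = [row[f"TE{i}"] for i in range(9)]           # TE at lags 0-8
    te_crit = [row[f"TEcrit{i}"] for i in range(9)]  # lag-wise significance thresholds
    print(row["rmse"], row["TEmax"], row["TEmaxt"], row["TEmaxcrit"])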