From 152875f3462ca597baf94c30384f5042db82d0f6 Mon Sep 17 00:00:00 2001
From: jsadler2
Date: Tue, 13 Dec 2022 13:24:37 -0600
Subject: [PATCH 1/6] [#179] mv it fxns to 2a_model/src

---
 .../src/do_it_functions.py                    |  0
 .../src/it_functions.py                       |  0
 .../Snakefile_functional_performance.smk      | 31 -------------------
 3 files changed, 31 deletions(-)
 rename {scratch/Functional_Performance => 2a_model}/src/do_it_functions.py (100%)
 rename {scratch/Functional_Performance => 2a_model}/src/it_functions.py (100%)
 delete mode 100644 scratch/Functional_Performance/Snakefile_functional_performance.smk

diff --git a/scratch/Functional_Performance/src/do_it_functions.py b/2a_model/src/do_it_functions.py
similarity index 100%
rename from scratch/Functional_Performance/src/do_it_functions.py
rename to 2a_model/src/do_it_functions.py
diff --git a/scratch/Functional_Performance/src/it_functions.py b/2a_model/src/it_functions.py
similarity index 100%
rename from scratch/Functional_Performance/src/it_functions.py
rename to 2a_model/src/it_functions.py
diff --git a/scratch/Functional_Performance/Snakefile_functional_performance.smk b/scratch/Functional_Performance/Snakefile_functional_performance.smk
deleted file mode 100644
index d4a9cad8..00000000
--- a/scratch/Functional_Performance/Snakefile_functional_performance.smk
+++ /dev/null
@@ -1,31 +0,0 @@
-import sys
-sys.path.insert(0, 'scratch/Functional_Performance/src')
-from do_it_functions import get_max_it_df
-
-
-models = ['0_baseline_LSTM','2_multitask_dense']
-replicate = 0
-sink = ['do_min', 'do_mean', 'do_max']
-source = 'srad'
-sites = 'all'
-
-rule calc_it_metrics_single_rep:
-    input:
-        #input data file
-        "../scratch/4_func_perf/in/results_tmmx_tmmn/med_obs_io.zarr",
-        #base file path to models
-        "../scratch/4_func_perf/in/results_tmmx_tmmn/models"
-    output:
-        "out/it_metrics_srad_all_sites_0.csv"
-    run:
-        get_max_it_df(input_file = input[0],
-                      models = models,
-                      base_file_path = input[1],
-                      output_file = output[0],
-                      replicate = replicate,
-                      sink = sink,
-                      source=source,
-                      sites = sites)
-
-

From 8da02e941104d9e4dc60b4b7f1b736698e94254e Mon Sep 17 00:00:00 2001
From: jsadler2
Date: Tue, 13 Dec 2022 14:51:00 -0600
Subject: [PATCH 2/6] [#179] functional perf for just one site/model/sink/src

---
 2a_model/src/do_it_functions.py | 319 ++++++++++++--------------------
 1 file changed, 118 insertions(+), 201 deletions(-)

diff --git a/2a_model/src/do_it_functions.py b/2a_model/src/do_it_functions.py
index 82d51dcc..097955ce 100644
--- a/2a_model/src/do_it_functions.py
+++ b/2a_model/src/do_it_functions.py
@@ -12,33 +12,49 @@ import math
import xarray as xr

-def calc_it_metrics_sites(inputs_df, source, sink, site, log_transform, models, replicate, base_file_path):
+def calc_it_metrics_site(inputs_zarr,
+                         predictions_file,
+                         source,
+                         sink,
+                         site,
+                         log_transform,
+                         model,
+                         replicate,
+                         outfile=None):
    '''
+    Calculate the transfer entropy (TE) and mutual information (MI) between
+    one input (source) and one output (sink) at one site for one replicate
+
    Parameters
    ----------
-    inputs_df : pandas dataframe
-        observed io data from zarr file
+    inputs_zarr : str
+        path to io zarr file
+    predictions_file : str
+        path to preds.feather file
    source : str
-        source for calculations (srad, tmmx, tmmn)
-    sink : list
-        sinks ['do_min', 'do_mean', 'do_max']
+        source for calculations (e.g., srad, tmmx, tmmn)
+    sink : str
+        sink for calculations (e.g., 'do_min', 'do_mean', 'do_max')
    site : str
        site number
    log_transform : boolean
        should the source variable be log10 transformed;
        only discharge should be log10 transformed
-    models: iterable (list or tuple)
-        the models for which you want do the calcs (e.g., ['0_baseline_LSTM','2_multitask_dense'])
+    model: str
+        the model for which you are doing the calcs (e.g., '0_baseline_LSTM', 'observed')
    replicate: int
-        which replicate you want to do the calcs for
-    base_file_path: str
-        filepath where the model results are
+        which replicate you are doing the calcs for
+    outfile: str
+        filepath to store the output (if desired)

    Returns
    -------
    Information theory metric results (TE and MI) as a dictionary
    '''
+    inputs = xr.open_zarr(inputs_zarr, consolidated=False)
+    inputs_df = inputs.to_dataframe()
+
+    # TODO: it'd be nice to read this in dynamically at some point
    inputs_site = inputs_df.loc[site][['CAT_BASIN_AREA', 'CAT_BASIN_SLOPE', 'CAT_CNPY11_BUFF100',
                                       'CAT_ELEV_MEAN', 'CAT_IMPV11', 'CAT_TWI', 'SLOPE',
                                       'day.length', 'depth', 'discharge', 'light_ratio',
                                       'model_confidence', 'pr', 'resolution', 'rmax', 'rmin', 'shortwave',
                                       'tmmx', 'velocity', 'vs']]

    targets_site = inputs_df.loc[site][['do_min','do_mean','do_max']]
-    if 'do_range' in sink:
+    if sink == 'do_range':
        targets_site['do_range'] = targets_site['do_max']-targets_site['do_min']

    tar_dict = {}
-    for m in models:
-        file_path = f'{base_file_path}/{m}/nstates_10/nep_100/rep_{replicate}/preds.feather'
-        model_preds = pd.read_feather(file_path)
-        model_preds = model_preds[model_preds['site_id'] == site].set_index('date')[['do_min','do_mean','do_max']]
-        model_preds['do_range'] = model_preds['do_max']-model_preds['do_min']
-        #create targets dictionary
-        tar_dict[m] = model_preds
+
+    model_preds = pd.read_feather(predictions_file)
+    model_preds = model_preds[model_preds['site_id'] == site].set_index('date')[['do_min','do_mean','do_max']]
+    model_preds['do_range'] = model_preds['do_max']-model_preds['do_min']
+    #create targets dictionary
+    if model != 'observed':
+        tar_dict[model] = model_preds

    tar_dict['observed'] = targets_site

    #create dictionary to store calculations in
-    #max_it = {'do_min': {}, 'do_mean': {}, 'do_max':{}, 'do_range':{}}
-    max_it = {s: {} for s in sink}
+    max_it = {}

    #create a nested dictionary for each DO variable to store it calcs
    #TE0 = Transfer Entropy at a time lag of 0, MI = mutual information,
    #TEmax is the maximum TE, TEmaxT is the time lag of the maximum TE,
    #TEmaxcrit is True/False for whether TEmax is significant; everything follows the same
    #convention for MI
-    for key in max_it.keys():
-        max_it[key] = {'model': [],'rmse':[],
-                       'TE0':[], 'TE1':[], 'TE2':[],
-                       'TE3':[], 'TE4':[], 'TE5':[],
-                       'TE6':[], 'TE7':[], 'TE8':[],
-                       'TEmax':[],'TEmaxt':[], 'TEmaxcrit':[],
-                       'MI0':[], 'MI1':[], 'MI2':[],
-                       'MI3':[], 'MI4':[], 'MI5':[],
-                       'MI6':[], 'MI7':[], 'MI8':[],
-                       'MImax':[],'MImaxt':[],'MImaxcrit':[]}
+    #join input and target to make sure they are aligned
+    site_inptar = inputs_site.join(tar_dict[model], rsuffix = '_pred')

-    for model in tar_dict.keys():
-        #join input and target to make sure they are aligned
-        site_inptar = inputs_site.join(tar_dict[model], rsuffix = '_pred')
-
-        #assign x = source, y = sink
-        x = site_inptar[source]
-        y = site_inptar[sink]
-
-        #for calculating rmse
-        obs_pred = tar_dict['observed'][sink].join(tar_dict[model][sink], rsuffix = '_pred')
-
-
-        #load the preprocessing fucntions from it_functions.py
-        ppf = it_functions.pre_proc_func()
-
-        if log_transform:
-            xl10 = ppf.log10(x)
-        else:
-            xl10 = x.copy()
-        x_rss = ppf.remove_seasonal_signal(xl10)
-        x_ss = ppf.standardize(x_rss)
-
-        y_prepped = {}
-        for snk in sink:
-            y_rss = ppf.remove_seasonal_signal(y[snk])
-            y_prepped[snk] = ppf.standardize(y_rss)
-            y_prepped[snk] = y_prepped[snk]
-
-
-        print('Calculating it metrics '+model+' '+site)
-        n_lags = 9
-        nbins = 11
-        it_dict = {}
-        for snk in sink:
-
-            #create an array of the prepped x and y variables
-            M = np.stack((x_ss,y_prepped[snk]), axis = 1)
-            #Mswap is for caclulating the TE from Y -> X, we don't really need to do that
-            #because DO doesn't affect solar radiation, but it is needed for function
-            Mswap = np.stack((y_prepped[snk], x_ss), axis = 1)
-            #x_bounds and y_bounds are for removing outliers
-            x_bounds = it_functions.find_bounds(M[:,0], 0.1, 99.9)
-            y_bounds = it_functions.find_bounds(M[:,1], 0.1, 99.9)
-            M_x_bound = np.delete(M, np.where((M[:,0] < x_bounds[0]*1.1) | (M[:,0] > x_bounds[1]*1.1)), axis = 0)
-            M_xy_bound = np.delete(M_x_bound, np.where((M_x_bound[:,1] < y_bounds[0]*1.1) | (M_x_bound[:,1] > y_bounds[1]*1.1)), axis = 0)
-
-            #calc it metrics and store in the dictionary it_dict
-            it_dict[snk] = it_functions.calc_it_metrics(M_xy_bound, Mswap, n_lags, nbins, calc_swap = False, alpha = 0.05, ncores = 7)
-
-
-        print('Storing it metrics '+model+' '+site)
-        #find the max TE and MI and the time lag at which the max occurs
-        #and store that in a dictionary as well
-        for snk in sink:
-
-            TEmax = max(it_dict[snk]['TE'])
-            TEmaxt = int(np.where(it_dict[snk]['TE'] == TEmax)[0])
-
-            if TEmax > it_dict[snk]['TEcrit'][TEmaxt]:
-                TEmaxcrit = True
-            else:
-                TEmaxcrit = False
-
-            MImax = max(it_dict[snk]['MI'])
-            MImaxt = int(np.where(it_dict[snk]['MI'] == MImax)[0])
-
-            if MImax > it_dict[snk]['MIcrit'][MImaxt]:
-                MImaxcrit = True
-            else:
-                MImaxcrit = False
-            #do min
-            max_it[snk]['model'].append(model)
-            mse = np.square(np.subtract(obs_pred[snk+'_pred'],obs_pred[snk])).mean()
-            math.sqrt(mse)
-            max_it[snk]['rmse'].append(math.sqrt(mse))
-
-            max_it[snk]['TEmax'].append(TEmax)
-            for i in range(9):
-                max_it[snk][f'TE{i}'].append(it_dict[snk]['TE'][i])
-            max_it[snk]['TEmaxt'].append(TEmaxt)
-            max_it[snk]['TEmaxcrit'].append(TEmaxcrit)
-
-            max_it[snk]['MImax'].append(MImax)
-            for i in range(9):
-                max_it[snk][f'MI{i}'].append(it_dict[snk]['MI'][i])
-            max_it[snk]['MImaxt'].append(MImaxt)
-            max_it[snk]['MImaxcrit'].append(MImaxcrit)
-
-    return max_it
-
-
-def diff_from_obs(df):
-    #function for calculating the difference in TE from modeled to observation
-    diff_df = df.iloc[:,1:11].sub(df.iloc[0,1:11], axis = 1)
-    diff_df['metric'] = df['metric']
-    diff_df['model'] = df['model']
-    return diff_df.iloc[1:5,:]
-
-def site_it_metrics(inputs_df, source, sink, sites, models, replicate, base_file_path):
-    #wrapper function for calculating the it metrics for each site
-    max_it_site = {}
-    for i,site in enumerate(sites):
-
-        print('----------------\n', site,' ', i+1, ' of ', len(sites),' sites')
-
-        max_it = calc_it_metrics_sites(inputs_df, source, sink, site, log_transform=False,
-                                       models = models, replicate = replicate, base_file_path = base_file_path)
-
-        max_it_site[site] = max_it
-
-    return max_it_site
-
-
-def get_max_it_df(input_file, models, base_file_path, output_file, replicate, sink,
-                  source='srad', sites = "all"):
-    '''
-    This function returns the functional performance (Transfer Entropy (TE) and
-    Mutual Information (MI)) for all models specified, all sinks specified, for
-    the specified source, and for _one_ replicate.
-
-    Writes a Pandas DataFrame to the output_file with columns:
-    `model,rmse,TE{0-8,max,maxt,maxcrit},MI{0-8,max,maxt,maxcrit},metric,site,rep_id`
-
-    Parameters
-    ----------
-    inputs_file : str
-        path to input zarr file
-    models : iterable (list or tuple)
-        the models for which you want to do the calcs (e.g., ['0_baseline_LSTM', '2_multitask_dense'])
-    base_file_path: str
-        filepath where the model results are (e.g., "2a_model/out/models/")
-    output_file: str
-        filepath where the results should be written, should be .csv
-    replicate : int
-        which replicate you want to do the calcs for
-    sink : list
-        sinks ['do_min', 'do_mean', 'do_max']
-    source : str
-        source for calculations (srad, tmmx, tmmn)
-    sites : chr or list
-        if "all" then it metrics are calculated for all sites, if a list is given then
-        calcs are made only for those sites
-
-    Returns
-    -------
-    nothing
-    '''
-    inputs = xr.open_zarr(input_file,consolidated=False)
-    inputs_df = inputs.to_dataframe()
+    #assign x = source, y = sink
+    x = site_inptar[source]
+    y = site_inptar[sink]
+
+    #for calculating rmse
+    obs_pred = tar_dict['observed'][[sink]].join(tar_dict[model][[sink]], rsuffix = '_pred')

-    if isinstance(sites, str):
-        assert sites=='all','sites can either be "all" for all sites or a list'
-        sites = inputs_df.index.unique('site_id')
-        sites = sites.drop(['014721254', '014721259'])
+
+    #load the preprocessing functions from it_functions.py
+    ppf = it_functions.pre_proc_func()
+
+    if log_transform:
+        xl10 = ppf.log10(x)
    else:
-        assert isinstance(sites,list),'sites can either be "all" for all sites or a list'
-
+        xl10 = x.copy()
+    x_rss = ppf.remove_seasonal_signal(xl10)
+    x_ss = ppf.standardize(x_rss)

-    max_it_site = site_it_metrics(inputs_df, source, sink, sites, models,
-                                  replicate, base_file_path)
-
-    do_all_sites_list = []
+    y_prepped = {}
+    y_rss = ppf.remove_seasonal_signal(y)
+    y_prepped = ppf.standardize(y_rss)

-    for site in sites:
-        max_it = max_it_site[site]
-        sink_dfs = []
-        for s in sink:
-            do_sink_df = pd.DataFrame(max_it[s])
-            do_sink_df['metric'] = s
-            sink_dfs.append(do_sink_df)
-        do_df = pd.concat(sink_dfs)
-        do_df['site'] = site
+    print('Calculating it metrics '+model+' '+site)
+    n_lags = 9
+    nbins = 11
+    it_dict = {}
+
+    #create an array of the prepped x and y variables
+    M = np.stack((x_ss,y_prepped), axis = 1)
+    #Mswap is for calculating the TE from Y -> X, we don't really need to do that
+    #because DO doesn't affect solar radiation, but it is needed by the function
+    Mswap = np.stack((y_prepped, x_ss), axis = 1)
+    #x_bounds and y_bounds are for removing outliers
+    x_bounds = it_functions.find_bounds(M[:,0], 0.1, 99.9)
+    y_bounds = it_functions.find_bounds(M[:,1], 0.1, 99.9)
+    M_x_bound = np.delete(M, np.where((M[:,0] < x_bounds[0]*1.1) | (M[:,0] > x_bounds[1]*1.1)), axis = 0)
+    M_xy_bound = np.delete(M_x_bound, np.where((M_x_bound[:,1] < y_bounds[0]*1.1) | (M_x_bound[:,1] > y_bounds[1]*1.1)), axis = 0)

-        do_all_sites_list.append(do_df)
+
+    #calc it metrics and store in the dictionary it_dict
+    it_dict = it_functions.calc_it_metrics(M_xy_bound, Mswap, n_lags, nbins, calc_swap = False, alpha = 0.05, ncores = 7)

-    do_all_sites = pd.concat(do_all_sites_list)
+
+    print('Storing it metrics '+model+' '+site)
+    #find the max TE and MI and the time lag at which the max occurs
+    #and store that in a dictionary as well

-    do_all_sites.to_csv(output_file)
+    TEmax = max(it_dict['TE'])
+    TEmaxt = int(np.where(it_dict['TE'] == TEmax)[0])

-
+    if TEmax > it_dict['TEcrit'][TEmaxt]:
+        TEmaxcrit = True
+    else:
+        TEmaxcrit = False
+
+    MImax = max(it_dict['MI'])
+    MImaxt = int(np.where(it_dict['MI'] == MImax)[0])
+
+    if MImax > it_dict['MIcrit'][MImaxt]:
+        MImaxcrit = True
+    else:
+        MImaxcrit = False
+
+    #store the summary metrics and metadata
+    max_it['model'] = model
+    mse = np.square(np.subtract(obs_pred[sink+'_pred'],obs_pred[sink])).mean()
+    max_it['rmse'] = math.sqrt(mse)
+
+    max_it['TEmax'] = TEmax
+    for i in range(9):
+        max_it[f'TE{i}'] = it_dict['TE'][i]
+    max_it['TEmaxt'] = TEmaxt
+    max_it['TEmaxcrit'] = TEmaxcrit
+
+    max_it['MImax'] = MImax
+    for i in range(9):
+        max_it[f'MI{i}'] = it_dict['MI'][i]
+    max_it['MImaxt'] = MImaxt
+    max_it['MImaxcrit'] = MImaxcrit
+    max_it['replicate'] = replicate
+    max_it['sink'] = sink
+    max_it['source'] = source
+    max_it['site'] = site
+
+    if outfile:
+        df = pd.DataFrame(max_it, index=[0])
+        df.to_csv(outfile, index=False)
+
+    return max_it
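For reference, a minimal sketch of a direct call to the refactored function. This is a hedged example: the zarr/feather paths and the site, source, and sink values below are illustrative, assembled from patterns that appear elsewhere in this series, not taken verbatim from the repo.

    from do_it_functions import calc_it_metrics_site

    # TE/MI between shortwave radiation ('srad') and minimum DO ('do_min')
    # for one site and one replicate of the baseline LSTM
    max_it = calc_it_metrics_site(
        inputs_zarr="2a_model/out/well_obs_io.zarr",  # illustrative path
        predictions_file="2a_model/out/models/0_baseline_LSTM/"
                         "nstates_10/nep_100/rep_0/preds.feather",  # illustrative path
        source="srad",
        sink="do_min",
        site="01480870",
        log_transform=False,   # only discharge should be log10 transformed
        model="0_baseline_LSTM",
        replicate=0,
        outfile=None,          # pass a .csv path to also write the row to disk
    )
    print(max_it["TEmax"], max_it["TEmaxt"], max_it["TEmaxcrit"])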
From e116862ce250b8ddf504869362521932912b1b88 Mon Sep 17 00:00:00 2001
From: jsadler2
Date: Tue, 13 Dec 2022 16:08:34 -0600
Subject: [PATCH 3/6] [#179] func perf files to snakemake

---
 2a_model/src/do_it_functions.py               |  2 +-
 2a_model/src/models/0_baseline_LSTM/Snakefile |  5 +-
 .../src/models/2_multitask_dense/Snakefile    |  3 +-
 2a_model/src/models/Snakefile_base.smk        | 62 +++++++++++++++++--
 4 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/2a_model/src/do_it_functions.py b/2a_model/src/do_it_functions.py
index 097955ce..b16ba341 100644
--- a/2a_model/src/do_it_functions.py
+++ b/2a_model/src/do_it_functions.py
@@ -55,7 +55,7 @@ def calc_it_metrics_site(inputs_zarr,
    inputs_df = inputs.to_dataframe()

    # TODO: it'd be nice to read this in dynamically at some point
-    inputs_site = inputs_df.loc[site][['CAT_BASIN_AREA', 'CAT_BASIN_SLOPE', 'CAT_CNPY11_BUFF100',
+    inputs_site = inputs_df.loc[site][['CAT_BASIN_SLOPE', 'CAT_CNPY11_BUFF100',
                                       'CAT_ELEV_MEAN', 'CAT_IMPV11', 'CAT_TWI', 'SLOPE',
                                       'day.length', 'depth', 'discharge', 'light_ratio',
                                       'model_confidence', 'pr', 'resolution', 'rmax', 'rmin', 'shortwave',
                                       'tmmx', 'velocity', 'vs']]
diff --git a/2a_model/src/models/0_baseline_LSTM/Snakefile b/2a_model/src/models/0_baseline_LSTM/Snakefile
index 954aae03..77e613cf 100644
--- a/2a_model/src/models/0_baseline_LSTM/Snakefile
+++ b/2a_model/src/models/0_baseline_LSTM/Snakefile
@@ -16,7 +16,10 @@ rule all:
               epochs=config['epochs'],
               rep=list(range(config['num_replicates'])),
               site_id=['01480870'],
-               year=[2012])
+               year=[2012]),
+        f"{out_dir}/{config['exp_name']}_func_perf.csv",
+        f"{out_dir}/observed_func_perf.csv"
+
+

module base_workflow:
diff --git a/2a_model/src/models/2_multitask_dense/Snakefile b/2a_model/src/models/2_multitask_dense/Snakefile
index 249f81a3..1280e885 100644
--- a/2a_model/src/models/2_multitask_dense/Snakefile
+++ b/2a_model/src/models/2_multitask_dense/Snakefile
@@ -16,7 +16,8 @@ rule all:
               epochs=config['epochs'],
               rep=list(range(config['num_replicates'])),
               site_id=['01480870'],
-               year=[2012])
+               year=[2012]),
+        f"{out_dir}/{config['exp_name']}_func_perf.csv"

module base_workflow:
diff --git a/2a_model/src/models/Snakefile_base.smk b/2a_model/src/models/Snakefile_base.smk
index 8836c4d2..d83741ab 100644
--- a/2a_model/src/models/Snakefile_base.smk
+++ b/2a_model/src/models/Snakefile_base.smk
@@ -1,11 +1,15 @@
import os
+import xarray as xr
import tensorflow as tf
import numpy as np
import pandas as pd
import sys

-code_dir = "../river-dl"
-sys.path.append(code_dir)
+river_dl_dir = "../river-dl"
+sys.path.append(river_dl_dir)
+
+src_dir = "../.."
+sys.path.append(src_dir)

from river_dl.preproc_utils import asRunConfig
from river_dl.preproc_utils import prep_all_data
@@ -14,6 +18,7 @@ from river_dl.postproc_utils import plot_obs, plot_ts, prepped_array_to_df
from river_dl.predict import predict_from_arbitrary_data
from river_dl.train import train_model
from river_dl import loss_functions as lf
+from do_it_functions import calc_it_metrics_site

out_dir = os.path.join(config['out_dir'], config['exp_name'])
loss_function = lf.multitask_rmse(config['lambdas'])
@@ -29,7 +34,7 @@ rule as_run_config:

rule prep_io_data:
    input:
-        f"../../../out/well_obs_io.zarr",
+        "../../../out/well_obs_io.zarr",
    output:
        "{outdir}/prepped.npz"
    run:
@@ -100,7 +105,7 @@ rule make_predictions:
    input:
        "{outdir}/prepped.npz",
        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/train_weights/",
-        f"../../../out/well_obs_io.zarr",
+        "../../../out/well_obs_io.zarr",
    output:
        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/preds.feather",
    run:
@@ -174,7 +179,7 @@ def get_grp_arg(wildcards):

rule combine_metrics:
    input:
-        f"../../../out/well_obs_io.zarr",
+        "../../../out/well_obs_io.zarr",
        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/trn_preds.feather",
        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/val_preds.feather",
        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/val_times_preds.feather"
@@ -226,3 +231,50 @@ rule plot_prepped_data:
                          partition=wildcards.partition)


+rule calc_functional_performance_one:
+    input:
+        "../../../out/well_obs_io.zarr",
+        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/preds.feather"
+    output:
+        "{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/func_perf/{site}-{src}-{snk}-{model}.csv"
+    run:
+        calc_it_metrics_site(input[0],
+                             input[1],
+                             wildcards.src,
+                             wildcards.snk,
+                             wildcards.site,
+                             log_transform=False,
+                             model=wildcards.model,
+                             replicate=wildcards.rep,
+                             outfile=output[0])
+
+
+def get_func_perf_sites():
+    input_file = "../../../out/well_obs_io.zarr"
+    inputs = xr.open_zarr(input_file, consolidated=False)
+    inputs_df = inputs.to_dataframe()
+
+    sites = inputs_df.index.unique('site_id')
+    sites = sites.drop(['014721254', '014721259'])
+    return sites
+
+
+rule gather_func_performances:
+    input:
+        expand("{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}/func_perf/{site}-{src}-{snk}-{{model}}.csv",
+               outdir=out_dir,
+               nstates=config['hidden_size'],
+               epochs=config['epochs'],
+               rep=list(range(config['num_replicates'])),
+               site=get_func_perf_sites(),
+               src=['tmmx'],
+               snk=['do_min', 'do_mean', 'do_max'])
+    output:
+        "{outdir}/{model}_func_perf.csv"
+    run:
+        df_list = []
+        for in_file in input:
+            df = pd.read_csv(in_file, dtype={"site": str})
+            df_list.append(df)
+        df_comb = pd.concat(df_list)
+        df_comb.to_csv(output[0], index=False)
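A note on how the two new rules compose: calc_functional_performance_one produces one CSV per (site, source, sink, model, replicate) combination, and gather_func_performances requests all of them through expand() before concatenating. The same machinery produces observed_func_perf.csv, since 'observed' flows through the {model} wildcard. Below is a pure-Python sketch of the file list that expand() generates; all config values are illustrative stand-ins.

    from itertools import product

    # stand-ins for the Snakemake config (values illustrative)
    outdir = "2a_model/out/models/0_baseline_LSTM"
    nstates, epochs = 10, 100          # config['hidden_size'], config['epochs']
    reps = range(2)                    # range(config['num_replicates'])
    sites = ["01480870"]               # really comes from get_func_perf_sites()
    srcs = ["tmmx"]
    snks = ["do_min", "do_mean", "do_max"]
    model = "0_baseline_LSTM"

    paths = [
        f"{outdir}/nstates_{nstates}/nep_{epochs}/rep_{rep}"
        f"/func_perf/{site}-{src}-{snk}-{model}.csv"
        for rep, site, src, snk in product(reps, sites, srcs, snks)
    ]
    # e.g., .../rep_0/func_perf/01480870-tmmx-do_min-0_baseline_LSTM.csv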
From 5a3a7c9cd3eb08c5b3cfaae4395d9c39fea93b7c Mon Sep 17 00:00:00 2001
From: jsadler2
Date: Wed, 21 Dec 2022 09:47:00 -0600
Subject: [PATCH 4/6] Snakemake only checks modified time

w/o this, snakemake retrains models unnecessarily
---
 2a_model.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/2a_model.R b/2a_model.R
index 50829fee..77970608 100644
--- a/2a_model.R
+++ b/2a_model.R
@@ -230,7 +230,7 @@ p2a_targets_list <- list(
      system(stringr::str_glue("snakemake -s {snakefile_path} --configfile {config_path} -j --touch --rerun-incomplete"))

      # then run the snakemake pipeline to produce the predictions and metric files
-      system(stringr::str_glue("snakemake -s {snakefile_path} --configfile {config_path} -j --rerun-incomplete "))
+      system(stringr::str_glue("snakemake -s {snakefile_path} --configfile {config_path} -j --rerun-incomplete --rerun-trigger mtime"))

      # print out the metrics file name for the target
      file.path("2a_model/out/models", p2a_model_ids$model_id, "exp_overall_metrics.csv")
From c21d80b353aaf5850932cf0b9ce2275bf6b9c22c Mon Sep 17 00:00:00 2001
From: jsadler2
Date: Wed, 21 Dec 2022 11:33:06 -0600
Subject: [PATCH 5/6] [#179] incorporate FP files in 2a_metrics_files target

---
 2a_model.R | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/2a_model.R b/2a_model.R
index 77970608..4b6b26de 100644
--- a/2a_model.R
+++ b/2a_model.R
@@ -191,12 +191,12 @@ p2a_targets_list <- list(
           config_path = stringr::str_remove(p2a_config_baseline_LSTM_yml, "2a_model/src/models/")),
      #the 1_ models use the same model and therefore the same Snakefile
      #as the 0_baseline_LSTM run
-      #list(model_id = "1_metab_multitask",
-      #snakefile_dir = "0_baseline_LSTM",
-      #config_path = stringr::str_remove(p2a_config_metab_multitask_yml, "2a_model/src/models/")),
-      #list(model_id = "1a_multitask_do_gpp_er",
-      #snakefile_dir = "0_baseline_LSTM",
-      #config_path = stringr::str_remove(p2a_config_1a_metab_multitask_yml, "2a_model/src/models/")),
+      list(model_id = "1_metab_multitask",
+           snakefile_dir = "0_baseline_LSTM",
+           config_path = stringr::str_remove(p2a_config_metab_multitask_yml, "2a_model/src/models/")),
+      list(model_id = "1a_multitask_do_gpp_er",
+           snakefile_dir = "0_baseline_LSTM",
+           config_path = stringr::str_remove(p2a_config_1a_metab_multitask_yml, "2a_model/src/models/")),
      list(model_id = "2_multitask_dense",
           snakefile_dir = "2_multitask_dense",
           config_path = stringr::str_remove(p2a_config_multitask_dense_yml, "2a_model/src/models/"))
@@ -232,8 +232,11 @@ p2a_targets_list <- list(
      # then run the snakemake pipeline to produce the predictions and metric files
      system(stringr::str_glue("snakemake -s {snakefile_path} --configfile {config_path} -j --rerun-incomplete --rerun-trigger mtime"))

-      # print out the metrics file name for the target
-      file.path("2a_model/out/models", p2a_model_ids$model_id, "exp_overall_metrics.csv")
+      # print out the FP and PP metrics file names for the target
+      c(
+        file.path("2a_model/out/models", p2a_model_ids$model_id, "exp_overall_metrics.csv"),
+        file.path("2a_model/out/models", p2a_model_ids$model_id, paste0(p2a_model_ids$model_id, "_func_perf.csv"))
+      )
    },
    format="file",
    pattern = map(p2a_model_ids)

From 24c4b9cfdaf59cf5f4062f1eba212a121630756c Mon Sep 17 00:00:00 2001
From: jsadler2
Date: Thu, 22 Dec 2022 14:45:45 -0600
Subject: [PATCH 6/6] [#179] add 'corr', 'TEcrit', 'MIcrit' to FP csv

---
 2a_model/src/do_it_functions.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/2a_model/src/do_it_functions.py b/2a_model/src/do_it_functions.py
index b16ba341..67c992da 100644
--- a/2a_model/src/do_it_functions.py
+++ b/2a_model/src/do_it_functions.py
@@ -159,14 +159,13 @@ def calc_it_metrics_site(inputs_zarr,
    max_it['rmse'] = math.sqrt(mse)

    max_it['TEmax'] = TEmax
-    for i in range(9):
-        max_it[f'TE{i}'] = it_dict['TE'][i]
    max_it['TEmaxt'] = TEmaxt
    max_it['TEmaxcrit'] = TEmaxcrit

    max_it['MImax'] = MImax
-    for i in range(9):
-        max_it[f'MI{i}'] = it_dict['MI'][i]
+    for variable in ['TE', 'TEcrit', 'MI', 'MIcrit', 'corr']:
+        for i in range(9):
+            max_it[f'{variable}{i}'] = it_dict[variable][i]
    max_it['MImaxt'] = MImaxt
    max_it['MImaxcrit'] = MImaxcrit
    max_it['replicate'] = replicate
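After PATCH 6, each row of a *_func_perf.csv file carries the lag-wise TE, TEcrit, MI, MIcrit, and corr values (lags 0 through 8) alongside the summary columns (model, rmse, TEmax, TEmaxt, TEmaxcrit, MImax, MImaxt, MImaxcrit, replicate, sink, source, site). A short sketch of reading a gathered file back; the path is illustrative and follows the {model}_func_perf.csv naming used by gather_func_performances.

    import pandas as pd

    df = pd.read_csv(
        "2a_model/out/models/0_baseline_LSTM/0_baseline_LSTM_func_perf.csv",
        dtype={"site": str},   # same dtype handling as gather_func_performances
    )
    row = df[(df["sink"] == "do_min") & (df["site"] == "01480870")].iloc[0]
    te = [row[f"TE{i}"] for i in range(9)]           # TE at lags 0-8
    te_crit = [row[f"TEcrit{i}"] for i in range(9)]  # lag-wise significance thresholds
    print(row["rmse"], row["TEmax"], row["TEmaxt"], row["TEmaxcrit"])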