API updates to specify components #88

Merged · 14 commits · May 3, 2024
README.md (13 changes: 12 additions & 1 deletion)

@@ -82,9 +82,16 @@ Usage: cupid-run [OPTIONS] CONFIG_PATH
Options:
  -s, --serial          Do not use LocalCluster objects
  -ts, --time-series    Run time series generation scripts prior to diagnostics
+  -atm, --atmosphere    Run atmosphere component diagnostics
+  -ocn, --ocean         Run ocean component diagnostics
+  -lnd, --land          Run land component diagnostics
+  -ice, --seaice        Run sea ice component diagnostics
+  -glc, --landice       Run land ice component diagnostics
  -h, --help            Show this message and exit.
```

+#### Running in serial
+
By default, several of the example notebooks provided use a dask `LocalCluster` object to run in parallel.
However, the `--serial` option will pass a logical flag to each notebook that can be used to skip starting the cluster.

@@ -98,6 +105,10 @@ if not serial:
client
```
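The hunk above collapses the body of this snippet. A minimal sketch of the full pattern is shown below, assuming the runner injects a boolean `serial` parameter into each notebook and that `dask.distributed` is installed; the cluster arguments are illustrative, not taken from the PR:

```
from dask.distributed import Client, LocalCluster

serial = False  # injected into each notebook by cupid-run; True when -s/--serial is passed

if not serial:
    # Start a local Dask cluster only when running in parallel mode
    cluster = LocalCluster()
    client = Client(cluster)
    client  # in a notebook cell, this displays the cluster details
```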

+#### Specifying components
+
+If no component flags are provided, all component diagnostics listed in `config.yml` will be executed by default. Multiple flags can be used together to select a group of components, for example: `cupid-run -ocn -ice config.yml`.
+
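As a rough illustration of how the new flags compose (flag spellings come from the options table above; `config.yml` stands in for whatever configuration file you point at):

```
# Run every component listed in config.yml (no component flags given)
cupid-run config.yml

# Run only the ocean and sea ice diagnostics
cupid-run -ocn -ice config.yml

# Run atmosphere, land, and land ice diagnostics
cupid-run -atm -lnd -glc config.yml
```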

### Timeseries File Generation
CUPiD can also generate single-variable time series files from history files for all components. To generate time series, edit the `timeseries` section of `config.yml` to fit your preferences, and then run `cupid-run config.yml -ts`.
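For reference, here is a sketch of the `timeseries` block that `cupid/run.py` (diffed below) reads. The key names are taken from the code; every value is illustrative only, and `your_case_name` is a placeholder:

```
timeseries:
  num_procs: 8              # processes used when generating time series
  case_name: 'your_case_name'
  ts_done: [False]          # whether time series already exist (type assumed)
  overwrite_ts: [False]     # whether to regenerate existing time series files
  atm:
    vars: ['PSL']           # history-file variables to convert (illustrative)
    derive_vars: []         # variables derived from other variables
    hist_str: 'h0'          # history-file identifier (assumed)
    start_years: [1]
    end_years: [100]
    level: 'lev'            # vertical-level name (assumed)
```

The `ocn`, `lnd`, `ice`, and `glc` entries follow the same per-component shape, since the code indexes `timeseries_params[component]` for each selected component.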

cupid/run.py (180 changes: 100 additions & 80 deletions)

@@ -11,21 +11,24 @@
import dask
import time
import ploomber
import yaml
import warnings

CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


@click.command(context_settings=CONTEXT_SETTINGS)
@click.option("--serial", "-s", is_flag=True, help="Do not use LocalCluster objects")
-@click.option(
-    "--time-series",
-    "-ts",
-    is_flag=True,
-    help="Run time series generation scripts prior to diagnostics",
-)
+@click.option("--time-series", "-ts", is_flag=True,
+              help="Run time series generation scripts prior to diagnostics")
+# Options to turn components on or off
+@click.option("--atmosphere", "-atm", is_flag=True, help="Run atmosphere component diagnostics")
+@click.option("--ocean", "-ocn", is_flag=True, help="Run ocean component diagnostics")
+@click.option("--land", "-lnd", is_flag=True, help="Run land component diagnostics")
+@click.option("--seaice", "-ice", is_flag=True, help="Run sea ice component diagnostics")
+@click.option("--landice", "-glc", is_flag=True, help="Run land ice component diagnostics")
@click.argument("config_path")
-def run(config_path, serial=False, time_series=False):
+def run(config_path, serial=False, time_series=False,
+        all=False, atmosphere=False, ocean=False, land=False, seaice=False, landice=False):
"""
Main engine to set up running all the notebooks.
"""
@@ -34,41 +37,56 @@ def run(config_path, serial=False, time_series=False):
    control = cupid.util.get_control_dict(config_path)
    cupid.util.setup_book(config_path)

+    #####################################################################
+    component_options = {"atm": atmosphere,
+                         "ocn": ocean,
+                         "lnd": land,
+                         "ice": seaice,
+                         "glc": landice}
+
+    # Automatically run all if no components specified
+
+    if True not in [atmosphere, ocean, land, seaice, landice]:
+        all = True
+        for key in component_options.keys():
+            component_options[key] = True
+
    #####################################################################
    # Managing global parameters

    global_params = dict()

    if "global_params" in control:
        global_params = control["global_params"]

    global_params['serial'] = serial

    ####################################################################

    if time_series:
        timeseries_params = control["timeseries"]

        # general timeseries arguments for all components
        num_procs = timeseries_params["num_procs"]

-        for component in ['atm', 'ocn', 'lnd', 'ice', 'glc']:
-            cupid.timeseries.create_time_series(
-                component,
-                timeseries_params[component]["vars"],
-                timeseries_params[component]["derive_vars"],
-                [timeseries_params["case_name"]],  # could also grab from compute_notebooks section of config file
-                timeseries_params[component]["hist_str"],
-                [global_params["CESM_output_dir"] + "/" + timeseries_params["case_name"] + f"/{component}/hist/"],  # could also grab from compute_notebooks section of config file
-                [global_params["CESM_output_dir"]+'/'+timeseries_params['case_name']+f'/{component}/proc/tseries/'],
-                # Note that timeseries output will eventually go in /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/
-                timeseries_params["ts_done"],
-                timeseries_params["overwrite_ts"],
-                timeseries_params[component]["start_years"],  # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_groups.none.mom6_tools_config.start_date
-                timeseries_params[component]["end_years"],  # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_groups.none.mom6_tools_config.end_date
-                timeseries_params[component]["level"],
-                num_procs,
-                serial,
-            )
+        for component, comp_bool in component_options.items():
+            if comp_bool:
+                cupid.timeseries.create_time_series(
+                    component,
+                    timeseries_params[component]["vars"],
+                    timeseries_params[component]["derive_vars"],
+                    [timeseries_params["case_name"]],  # could also grab from compute_notebooks section of config file
+                    timeseries_params[component]["hist_str"],
+                    [global_params["CESM_output_dir"] + "/" + timeseries_params["case_name"] + f"/{component}/hist/"],  # could also grab from compute_notebooks section of config file
+                    [global_params["CESM_output_dir"]+'/'+timeseries_params['case_name']+f'/{component}/proc/tseries/'],
+                    # Note that timeseries output will eventually go in /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/
+                    timeseries_params["ts_done"],
+                    timeseries_params["overwrite_ts"],
+                    timeseries_params[component]["start_years"],  # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_groups.none.mom6_tools_config.start_date
+                    timeseries_params[component]["end_years"],  # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_groups.none.mom6_tools_config.end_date
+                    timeseries_params[component]["level"],
+                    num_procs,
+                    serial,
+                )

    # Grab paths

@@ -113,35 +131,39 @@ def run(config_path, serial=False, time_series=False):
    dag = ploomber.DAG(executor=ploomber.executors.Serial())

    #####################################################################
-    # Organizing notebooks - holdover from manually managing dependencies before
-
-    all_nbs = dict()
-
-    for nb, info in control["compute_notebooks"].items():
-
-        all_nbs[nb] = info
-
-    # Setting up notebook tasks
-
-    for nb, info in all_nbs.items():
-
-        global_params["serial"] = serial
-        if "dependency" in info:
-            cupid.util.create_ploomber_nb_task(
-                nb,
-                info,
-                cat_path,
-                nb_path_root,
-                output_dir,
-                global_params,
-                dag,
-                dependency=info["dependency"],
-            )
-
-        else:
-            cupid.util.create_ploomber_nb_task(
-                nb, info, cat_path, nb_path_root, output_dir, global_params, dag
-            )
+    # Organizing notebooks to run
+
+    if 'compute_notebooks' in control:
+
+        all_nbs = dict()
+
+        for nb, info in control['compute_notebooks']['infrastructure'].items():
+            all_nbs[nb] = info
+            all_nbs[nb]['nb_path_root'] = nb_path_root + '/infrastructure'
+            all_nbs[nb]['output_dir'] = output_dir + '/infrastructure'
+
+        for comp_name, comp_bool in component_options.items():
+            if comp_name in control['compute_notebooks'] and comp_bool:
+                for nb, info in control['compute_notebooks'][comp_name].items():
+                    all_nbs[nb] = info
+                    all_nbs[nb]['nb_path_root'] = nb_path_root + '/' + comp_name
+                    all_nbs[nb]['output_dir'] = output_dir + '/' + comp_name
+            elif comp_bool and not all:
+                warnings.warn(f"No notebooks for {comp_name} component specified in config file.")
+
+        # Checking for existence of environments
+
+        for nb, info in all_nbs.copy().items():
+            if not control["env_check"][info["kernel_name"]]:
+                bad_env = info["kernel_name"]
+                warnings.warn(f"Environment {bad_env} specified for {nb}.ipynb could not be found; {nb}.ipynb will not be run. See README.md for environment installation instructions.")
+                all_nbs.pop(nb)
+
+        # Setting up notebook tasks
+
+        for nb, info in all_nbs.items():
+            cupid.util.create_ploomber_nb_task(nb, info, cat_path, info["nb_path_root"],
+                                               info["output_dir"], global_params, dag,
+                                               dependency=info.get("dependency"))

    #####################################################################
    # Organizing scripts
@@ -150,29 +172,27 @@

    all_scripts = dict()

-    for script, info in control["compute_scripts"].items():
-
-        all_scripts[script] = info
+    for comp_name, comp_bool in component_options.items():
+        if comp_name in control['compute_scripts'] and comp_bool:
+            for script, info in control['compute_scripts'][comp_name].items():
+                all_scripts[script] = info
+                all_scripts[script]['nb_path_root'] = nb_path_root + '/' + comp_name
+        elif comp_bool and not all:
+            warnings.warn(f"No scripts for {comp_name} component specified in config file.")

+    # Checking for existence of environments
+
+    for script, info in all_scripts.copy().items():
+        if not control["env_check"][info["kernel_name"]]:
+            bad_env = info["kernel_name"]
+            warnings.warn(f"Environment {bad_env} specified for {script}.py could not be found; {script}.py will not be run.")
+            all_scripts.pop(script)

    # Setting up script tasks

    for script, info in all_scripts.items():

-        if "dependency" in info:
-            cupid.util.create_ploomber_script_task(
-                script,
-                info,
-                cat_path,
-                nb_path_root,
-                global_params,
-                dag,
-                dependency=info["dependency"],
-            )
-
-        else:
-            cupid.util.create_ploomber_script_task(
-                script, info, cat_path, nb_path_root, global_params, dag
-            )
+        cupid.util.create_ploomber_script_task(script, info, cat_path, info['nb_path_root'],
+                                               global_params, dag,
+                                               dependency=info.get("dependency"))

    # Run the full DAG
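The remainder of the function is collapsed in this view; in ploomber, executing the assembled DAG comes down to a single call, so the elided tail is presumably equivalent to this sketch:

```
    # Execute all registered notebook and script tasks in dependency order
    dag.build()
```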
