diff --git a/README.md b/README.md index 963a340..945d681 100644 --- a/README.md +++ b/README.md @@ -54,8 +54,8 @@ To test the package out, try to run `examples/coupled-model`: $ conda activate cupid-dev $ cd examples/coupled_model $ # machine-dependent: request multiple compute cores -$ cupid-run config.yml -$ cupid-build config.yml # Will build HTML from Jupyter Book +$ cupid-run +$ cupid-build # Will build HTML from Jupyter Book ``` After the last step is finished, you can use Jupyter to view generated notebooks in `${CUPID_ROOT}/examples/coupled-model/computed_notebooks/quick-run` @@ -64,7 +64,7 @@ or you can view `${CUPID_ROOT}/examples/coupled-model/computed_notebooks/quick-r Furthermore, to clear the `computed_notebooks` folder which was generated by the `cupid-run` and `cupid-build` commands, you can run the following command: ``` bash -$ cupid-clear config.yml +$ cupid-clear ``` This will clear the `computed_notebooks` folder which is at the location pointed to by the `run_dir` variable in the `config.yml` file. @@ -87,6 +87,7 @@ Options: -lnd, --land Run land component diagnostics -ice, --seaice Run sea ice component diagnostics -glc, --landice Run land ice component diagnostics + --config_path Path to the YAML configuration file containing specifications for notebooks (default config.yml) -h, --help Show this message and exit. ``` @@ -107,8 +108,8 @@ client #### Specifying components -If no component flags are provided, all component diagnostics listed in `config.yml` will be executed by default. Multiple flags can be used together to select a group of components, for example: `cupid-run -ocn -ice config.yml`. +If no component flags are provided, all component diagnostics listed in `config.yml` will be executed by default. Multiple flags can be used together to select a group of components, for example: `cupid-run -ocn -ice`. ### Timeseries File Generation -CUPiD also has the capability to generate single variable timeseries files from history files for all components. To run timeseries, edit the `config.yml` file's timeseries section to fit your preferences, and then run `cupid-run config.yml -ts`. +CUPiD also has the capability to generate single variable timeseries files from history files for all components. To run timeseries, edit the `config.yml` file's timeseries section to fit your preferences, and then run `cupid-run -ts`. diff --git a/cupid/build.py b/cupid/build.py index 7c67d09..d5778b8 100755 --- a/cupid/build.py +++ b/cupid/build.py @@ -1,41 +1,60 @@ #!/usr/bin/env python +""" +This script provides functionality to build a Jupyter book based on +the configuration specified in a YAML file. +The main function `build()` reads the configuration file (default config.yml), +extracts the necessary information such as the name of the book and the +directory containing computed notebooks, and then proceeds to clean and build the +Jupyter book using the `jupyter-book` command-line tool. + +Args: + CONFIG_PATH: str, path to configuration file (default config.yml) + +Returns: + None +""" + +import click import subprocess import sys -import os import yaml -def build(): + +@click.command() +@click.argument("config_path", default="config.yml") +def build(config_path): """ - Build a Jupyter book based on the TOC in config.yml. Called by `cupid-build`. - + Build a Jupyter book based on the TOC in CONFIG_PATH. Called by `cupid-build`. 
+ Args: - none + CONFIG_PATH: str, path to configuration file (default config.yml) + Returns: None """ - - config_path = str(sys.argv[1]) - + with open(config_path, "r") as fid: control = yaml.safe_load(fid) - + sname = control["data_sources"]["sname"] run_dir = control["data_sources"]["run_dir"] - subprocess.run(["jupyter-book", "clean" , f"{run_dir}/computed_notebooks/{sname}"]) - subprocess.run(["jupyter-book", "build" , f"{run_dir}/computed_notebooks/{sname}", "--all"]) + subprocess.run(["jupyter-book", "clean", f"{run_dir}/computed_notebooks/{sname}"]) + subprocess.run( + ["jupyter-book", "build", f"{run_dir}/computed_notebooks/{sname}", "--all"] + ) -### Originally used this code to copy jupyter book HTML to a location to host it online + # Originally used this code to copy jupyter book HTML to a location to host it online -# if 'publish_location' in control: - -# user = os.environ.get('USER') -# remote_mach = control["publish_location"]["remote_mach"] -# remote_dir = control["publish_location"]["remote_dir"] -# this seems more complicated than expected...people have mentioned paramiko library? - # subprocess.run(["mkdir", "-p", remote_dir]) - # subprocess.run(["scp", "-r", f"{run_dir}/computed_notebooks/{sname}/_build/html/*", f"{user}@{remote_mach}:{remote_dir}"]) - - return None + # if 'publish_location' in control: + # user = os.environ.get('USER') + # remote_mach = control["publish_location"]["remote_mach"] + # remote_dir = control["publish_location"]["remote_dir"] + # this seems more complicated than expected...people have mentioned paramiko library? + # subprocess.run(["mkdir", "-p", remote_dir]) + # subprocess.run(["scp", "-r", f"{run_dir}/computed_notebooks/{sname}/_build/html/*", + # f"{user}@{remote_mach}:{remote_dir}"]) + + return None diff --git a/cupid/clear.py b/cupid/clear.py index 869980c..ea886fd 100755 --- a/cupid/clear.py +++ b/cupid/clear.py @@ -1,36 +1,56 @@ #!/usr/bin/env python +""" +This script provides functionality to clear the contents of the 'computed_notebooks' folder +at the location specified by the 'run_dir' variable in the CONFIG_PATH. + +The main function `clear()` takes the path to the configuration file as input, reads the config file +to obtain the 'run_dir' variable, and then deletes the contents of the 'computed_notebooks' folder +at that location. 
+
+"""
+
 import os
+import shutil
 import click
 import cupid.util
-import shutil

-def readConfigFile(config_path):
-    #Given the file path to config.yml, this function reads the config file content and
-    #returns the val of the run_dir string with '/computed_notebooks' appended to it
-
-    #Obtain the contents of the config.yml file and extract the run_dir variable
+
+def read_config_file(config_path):
+    """
+    Given the file path to the configuration file, this function reads the config file content and
+    returns the value of the run_dir string with '/computed_notebooks' appended to it
+
+    Args:
+        CONFIG_PATH: str, path to configuration file (default config.yml)
+
+    Returns:
+        str, the 'run_dir' path with '/computed_notebooks' appended to it
+    """
+    # Obtain the contents of the configuration file and extract the run_dir variable
     control = cupid.util.get_control_dict(config_path)
-    run_dir = control['data_sources'].get('run_dir', None)
-
+    run_dir = control["data_sources"].get("run_dir", None)
+
     if run_dir:
-        #Append '/computed_notebooks' to the run_dir value if it is not empty
-        fullPath = os.path.join(run_dir, 'computed_notebooks')
-        return fullPath
-
-    else: #run_dir is empty/wasn't found in config file so return error
-        raise ValueError("'run_dir' was empty/not found in the config file.")
+        # Append '/computed_notebooks' to the run_dir value if it is not empty
+        full_path = os.path.join(run_dir, "computed_notebooks")
+        return full_path
+
+    # else run_dir is empty/wasn't found in config file so return error
+    raise ValueError("'run_dir' was empty/not found in the config file.")
+

 @click.command()
-@click.argument('config_path')
-#Entry point to this script
+@click.argument("config_path", default="config.yml")
+# Entry point to this script
 def clear(config_path):
-    """Clears the contents of the 'computed_notebooks' folder at the location specified by the 'run_dir' variable in the 'config.yml' file.
-
-    Args: config_path - The path to the config.yml file.
+    """Clears the contents of the 'computed_notebooks' folder at the location
+    specified by the 'run_dir' variable in the CONFIG_PATH.
+
+    Args: CONFIG_PATH - The path to the configuration file.
     """
-
-    run_dir = readConfigFile(config_path)
-    #Delete the 'computed_notebooks' folder and all the contents inside of it
+
+    run_dir = read_config_file(config_path)
+    # Delete the 'computed_notebooks' folder and all the contents inside of it
     shutil.rmtree(run_dir)
-    print(f"All contents in {run_dir} have been cleared.")
\ No newline at end of file
+    print(f"All contents in {run_dir} have been cleared.")
diff --git a/cupid/quickstart.py b/cupid/quickstart.py
index 39ee180..8c17864 100644
--- a/cupid/quickstart.py
+++ b/cupid/quickstart.py
@@ -1,2 +1,3 @@
-### To be created: a script (maybe called through a command line entry point) that sets up a directory with a config.yml file and
-### basics necessary to set up a notebook collection
\ No newline at end of file
+### To be created: a script, maybe called through a command line entry point,
+### that sets up a directory with a config.yml file and
+### basics necessary to set up a notebook collection
diff --git a/cupid/read.py b/cupid/read.py
index 4164308..03ec029 100644
--- a/cupid/read.py
+++ b/cupid/read.py
@@ -1,32 +1,44 @@
+"""
+This module provides functions for reading YAML files and working with intake catalogs.
+
+Functions:
+    - read_yaml(path_to_yaml): Read a YAML file and return its content as a dictionary.
+    - get_collection(path_to_catalog, **kwargs): Get a collection of datasets from an
+      intake catalog based on specified criteria.
+""" + import intake import yaml + def read_yaml(path_to_yaml): - with open(path_to_yaml) as f: - data = yaml.load(f, Loader=yaml.FullLoader) + """Read yaml file and return data from loaded yaml file""" + with open(path_to_yaml) as file: + data = yaml.load(file, Loader=yaml.FullLoader) return data def get_collection(path_to_catalog, **kwargs): + """Get collection of datasets from intake catalog""" cat = intake.open_esm_datastore(path_to_catalog) ### note that the json file points to the csv, so the path that the ### yaml file contains doesn't actually get used. this can cause issues - + cat_subset = cat.search(**kwargs) - + if "variable" in kwargs.keys(): - + # pylint: disable=invalid-name def preprocess(ds): ## the double brackets return a Dataset rather than a DataArray - ## this is fragile and could cause issues, i'm not totally sure what subsetting on time_bound does - return ds[[kwargs["variable"], 'time_bound']] - + ## this is fragile and could cause issues, not sure what subsetting on time_bound does + return ds[[kwargs["variable"], "time_bound"]] + ## not sure what the chunking kwarg is doing here either - dsets = cat_subset.to_dataset_dict(xarray_open_kwargs={'chunks': {'time': -1}}, preprocess=preprocess) - + dsets = cat_subset.to_dataset_dict( + xarray_open_kwargs={"chunks": {"time": -1}}, preprocess=preprocess + ) + else: dsets = cat_subset.to_dataset_dict() - - return dsets - + return dsets diff --git a/cupid/run.py b/cupid/run.py index 458ebf9..536ccc5 100755 --- a/cupid/run.py +++ b/cupid/run.py @@ -1,55 +1,91 @@ #!/usr/bin/env python -import click +""" +Main script for running all notebooks and scripts specified in the configuration file. + +This script sets up and runs all the specified notebooks and scripts according to the configurations +provided in the specified YAML configuration file. + +Usage: cupid-run [OPTIONS] + + Main engine to set up running all the notebooks. + +Options: + -s, --serial Do not use LocalCluster objects + -ts, --time-series Run time series generation scripts prior to diagnostics + -atm, --atmosphere Run atmosphere component diagnostics + -ocn, --ocean Run ocean component diagnostics + -lnd, --land Run land component diagnostics + -ice, --seaice Run sea ice component diagnostics + -glc, --landice Run land ice component diagnostics + -config_path Path to the YAML configuration file containing specifications for notebooks (default: config.yml) + -h, --help Show this message and exit. 
+""" + import os -from glob import glob -import papermill as pm +import warnings +import click import intake +import ploomber import cupid.util import cupid.timeseries -from dask.distributed import Client -import dask -import time -import ploomber -import warnings CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) +# fmt: off +# pylint: disable=line-too-long @click.command(context_settings=CONTEXT_SETTINGS) @click.option("--serial", "-s", is_flag=True, help="Do not use LocalCluster objects") -@click.option("--time-series", "-ts", is_flag=True, - help="Run time series generation scripts prior to diagnostics") +@click.option("--time-series", "-ts", is_flag=True, help="Run time series generation scripts prior to diagnostics") # Options to turn components on or off @click.option("--atmosphere", "-atm", is_flag=True, help="Run atmosphere component diagnostics") @click.option("--ocean", "-ocn", is_flag=True, help="Run ocean component diagnostics") @click.option("--land", "-lnd", is_flag=True, help="Run land component diagnostics") @click.option("--seaice", "-ice", is_flag=True, help="Run sea ice component diagnostics") @click.option("--landice", "-glc", is_flag=True, help="Run land ice component diagnostics") -@click.argument("config_path") - -def run(config_path, serial=False, time_series=False, - all=False, atmosphere=False, ocean=False, land=False, seaice=False, landice=False): +@click.argument("config_path", default="config.yml") +def run( + config_path, + serial=False, + time_series=False, + all=False, + atmosphere=False, + ocean=False, + land=False, + seaice=False, + landice=False, +): """ Main engine to set up running all the notebooks. - """ + Args: + CONFIG_PATH: str, path to configuration file (default config.yml) + + Returns: + None + + """ + # fmt: on + # pylint: enable=line-too-long # Get control structure control = cupid.util.get_control_dict(config_path) cupid.util.setup_book(config_path) - component_options = {"atm": atmosphere, - "ocn": ocean, - "lnd": land, - "ice": seaice, - "glc": landice} + component_options = { + "atm": atmosphere, + "ocn": ocean, + "lnd": land, + "ice": seaice, + "glc": landice, + } # Automatically run all if no components specified - + if True not in [atmosphere, ocean, land, seaice, landice]: all = True for key in component_options.keys(): component_options[key] = True - + ##################################################################### # Managing global parameters @@ -57,11 +93,11 @@ def run(config_path, serial=False, time_series=False, if "global_params" in control: global_params = control["global_params"] - - global_params['serial'] = serial - + + global_params["serial"] = serial + #################################################################### - + if time_series: timeseries_params = control["timeseries"] @@ -70,23 +106,28 @@ def run(config_path, serial=False, time_series=False, for component, comp_bool in component_options.items(): if comp_bool: + # fmt: off + # pylint: disable=line-too-long cupid.timeseries.create_time_series( component, timeseries_params[component]["vars"], timeseries_params[component]["derive_vars"], - [timeseries_params["case_name"]], # could also grab from compute_notebooks section of config file + [timeseries_params["case_name"]], timeseries_params[component]["hist_str"], - [global_params["CESM_output_dir"] + "/" + timeseries_params["case_name"] + f"/{component}/hist/"], # could also grab from compute_notebooks section of config file - 
[global_params["CESM_output_dir"]+'/'+timeseries_params['case_name']+f'/{component}/proc/tseries/'], - # Note that timeseries output will eventually go in /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/ + [global_params["CESM_output_dir"]+"/"+timeseries_params["case_name"]+f"/{component}/hist/"], + [global_params["CESM_output_dir"]+"/"+timeseries_params["case_name"]+f"/{component}/proc/tseries/"], + # Note that timeseries output will eventually go in + # /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/ timeseries_params["ts_done"], timeseries_params["overwrite_ts"], - timeseries_params[component]["start_years"], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.start_date - timeseries_params[component]["end_years"], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.end_date + timeseries_params[component]["start_years"], + timeseries_params[component]["end_years"], timeseries_params[component]["level"], num_procs, serial, - ) + ) + # fmt: on + # pylint: enable=line-too-long # Grab paths @@ -105,7 +146,6 @@ def run(config_path, serial=False, time_series=False, cat_path = None if "path_to_cat_json" in control["data_sources"]: - use_catalog = True full_cat_path = os.path.realpath( os.path.expanduser(control["data_sources"]["path_to_cat_json"]) ) @@ -132,38 +172,53 @@ def run(config_path, serial=False, time_series=False, ##################################################################### # Organizing notebooks to run - - if 'compute_notebooks' in control: - + + if "compute_notebooks" in control: + all_nbs = dict() - - for nb, info in control['compute_notebooks']['infrastructure'].items(): + + # pylint: disable=invalid-name + for nb, info in control["compute_notebooks"]["infrastructure"].items(): all_nbs[nb] = info - all_nbs[nb]['nb_path_root'] = nb_path_root + '/infrastructure' - all_nbs[nb]['output_dir'] = output_dir + '/infrastructure' - + all_nbs[nb]["nb_path_root"] = nb_path_root + "/infrastructure" + all_nbs[nb]["output_dir"] = output_dir + "/infrastructure" + for comp_name, comp_bool in component_options.items(): - if comp_name in control['compute_notebooks'] and comp_bool: - for nb, info in control['compute_notebooks'][comp_name].items(): + if comp_name in control["compute_notebooks"] and comp_bool: + for nb, info in control["compute_notebooks"][comp_name].items(): all_nbs[nb] = info - all_nbs[nb]['nb_path_root'] = nb_path_root + '/' + comp_name - all_nbs[nb]['output_dir'] = output_dir + '/' + comp_name + all_nbs[nb]["nb_path_root"] = nb_path_root + "/" + comp_name + all_nbs[nb]["output_dir"] = output_dir + "/" + comp_name elif comp_bool and not all: - warnings.warn(f"No notebooks for {comp_name} component specified in config file.") - + warnings.warn( + f"No notebooks for {comp_name} component specified in config file." + ) + # Checking for existence of environments - + for nb, info in all_nbs.copy().items(): if not control["env_check"][info["kernel_name"]]: bad_env = info["kernel_name"] - warnings.warn(f"Environment {bad_env} specified for {nb}.ipynb could not be found; {nb}.ipynb will not be run. 
See README.md for environment installation instructions.")
+                warnings.warn(
+                    f"Environment {bad_env} specified for {nb}.ipynb could not be found;"+
+                    f" {nb}.ipynb will not be run. "+
+                    "See README.md for environment installation instructions."
+                )
                 all_nbs.pop(nb)
-
+
        # Setting up notebook tasks
-
-        for nb, info in all_nbs.items():
-            cupid.util.create_ploomber_nb_task(nb, info, cat_path, info["nb_path_root"],
-                                               info["output_dir"], global_params, dag, dependency=info.get("dependency"))
+
+        for nb, info in all_nbs.items():
+            cupid.util.create_ploomber_nb_task(
+                nb,
+                info,
+                cat_path,
+                info["nb_path_root"],
+                info["output_dir"],
+                global_params,
+                dag,
+                dependency=info.get("dependency"),
+            )

     #####################################################################
     # Organizing scripts

@@ -173,26 +228,38 @@ def run(config_path, serial=False, time_series=False,
         all_scripts = dict()

         for comp_name, comp_bool in component_options.items():
-            if comp_name in control['compute_scripts'] and comp_bool:
-                for script, info in control['compute_scripts'][comp_name].items():
+            if comp_name in control["compute_scripts"] and comp_bool:
+                for script, info in control["compute_scripts"][comp_name].items():
                     all_scripts[script] = info
-                    all_scripts[script]['nb_path_root'] = nb_path_root + '/' + comp_name
+                    all_scripts[script]["nb_path_root"] = nb_path_root + "/" + comp_name
             elif comp_bool and not all:
-                warnings.warn(f"No scripts for {comp_name} component specified in config file.")
+                warnings.warn(
+                    f"No scripts for {comp_name} component specified in config file."
+                )

        # Checking for existence of environments
-
+
        for script, info in all_scripts.copy().items():
            if not control["env_check"][info["kernel_name"]]:
                bad_env = info["kernel_name"]
-                warnings.warn(f"Environment {bad_env} specified for {script}.py could not be found; {script}.py will not be run.")
+                warnings.warn(
+                    f"Environment {bad_env} specified for {script}.py could not be found; "+
+                    f"{script}.py will not be run."
+ ) all_scripts.pop(script) - + # Setting up script tasks for script, info in all_scripts.items(): - cupid.util.create_ploomber_script_task(script, info, cat_path, info['nb_path_root'], - global_params, dag, dependency=info.get("dependency")) + cupid.util.create_ploomber_script_task( + script, + info, + cat_path, + info["nb_path_root"], + global_params, + dag, + dependency=info.get("dependency"), + ) # Run the full DAG diff --git a/cupid/timeseries.py b/cupid/timeseries.py index 86bec80..6315c16 100644 --- a/cupid/timeseries.py +++ b/cupid/timeseries.py @@ -6,16 +6,12 @@ # Import standard python modules # ++++++++++++++++++++++++++++++ -import sys import glob import multiprocessing as mp import os import subprocess -import xarray as xr - -import importlib - from pathlib import Path +import xarray as xr def call_ncrcat(cmd): @@ -50,7 +46,7 @@ def create_time_series( ---- - component: str name of component, eg 'cam' - # This could alternatively be made into a dictionary and encorporate values such as height_dim + # This could also be made into a dict and encorporate values such as height_dim - derive_vars: dict information on derivable variables eg, {'PRECT': ['PRECL','PRECC'], @@ -62,7 +58,7 @@ def create_time_series( - hist_locs: list, str location of CESM history files - ts_dir: list, str - location where time series files will be saved, or where pre-made time series files exist + location where time series files will be saved, or pre-made time series files exist - ts_done: list, boolean check if time series files already exist - overwrite_ts: list, boolean @@ -90,7 +86,7 @@ def create_time_series( # Check if particular case should be processed: if ts_done[case_idx]: emsg = ( - " Configuration file indicates time series files have been pre-computed" + "Configuration file indicates time series files have been pre-computed" ) emsg += f" for case '{case_name}'. Will rely on those files directly." print(emsg) @@ -173,7 +169,7 @@ def create_time_series( # Print a warning, and assume that no vertical # level information is needed. wmsg = "WARNING! Unable to determine the vertical coordinate" - wmsg += f" type from the {height_dim} long name, which is:\n'{lev_long_name}'." + wmsg += f" type from the {height_dim} long name, \n'{lev_long_name}'." wmsg += ( "\nNo additional vertical coordinate information will be" ) @@ -229,8 +225,10 @@ def create_time_series( diag_var_list = hist_file_var_list for var in diag_var_list: if var not in hist_file_var_list: - if component == 'ocn': - print('ocean vars seem to not be present in all files and thus cause errors') + if component == "ocn": + print( + "ocean vars seem to not be present in all files and thus cause errors" + ) continue if ( var in derive_vars.keys() @@ -241,11 +239,10 @@ def create_time_series( diag_var_list.append(constit) vars_to_derive.append(var) continue - else: - msg = f"WARNING: {var} is not in the file {hist_files[0]}." - msg += " No time series will be generated." - print(msg) - continue + msg = f"WARNING: {var} is not in the file {hist_files[0]}." + msg += " No time series will be generated." 
+                print(msg)
+                continue

             # Check if variable has a height_dim (eg, 'lev') dimension according to first file:
             has_lev = bool(height_dim in hist_file_ds[var].dims)
@@ -325,7 +322,7 @@ def create_time_series(

         # End variable loop

-        if vars_to_derive != []:
+        if vars_to_derive:
             if component == "atm":
                 derive_cam_variables(
                     vars_to_derive=vars_to_derive, ts_dir=ts_dir[case_idx]
@@ -376,7 +373,8 @@ def derive_cam_variables(vars_to_derive=None, ts_dir=None, overwrite=None):
                     Path(prect_file).unlink()
                 else:
                     print(
-                        f"[{__name__}] Warning: PRECT file was found and overwrite is False. Will use existing file."
+                        f"[{__name__}] Warning: PRECT file was found and overwrite is False."
+                        + " Will use existing file."
                     )
                     continue
             # append PRECC to the file containing PRECL
@@ -409,7 +407,8 @@ def derive_cam_variables(vars_to_derive=None, ts_dir=None, overwrite=None):
                     Path(derived_file).unlink()
                 else:
                     print(
-                        f"[{__name__}] Warning: RESTOM file was found and overwrite is False. Will use existing file."
+                        f"[{__name__}] Warning: RESTOM file was found and overwrite is False."
+                        + " Will use existing file."
                     )
                     continue
             # append FSNT to the file containing FLNT
diff --git a/cupid/util.py b/cupid/util.py
index 670e094..c8d8991 100644
--- a/cupid/util.py
+++ b/cupid/util.py
@@ -1,24 +1,37 @@
+"""
+This module provides functions and classes for managing conda kernels,
+executing notebooks with custom engines, and creating tasks for Ploomber DAGs.
+
+Functions:
+    - get_control_dict(): Get the control dictionary from a configuration file.
+    - setup_book(): Set up the run directory and output Jupyter book based on the configuration file.
+    - get_toc_files(): Return a list of files in the '_toc.yml'.
+    - create_ploomber_nb_task(): Create a Ploomber task for running a notebook.
+    - create_ploomber_script_task(): Create a Ploomber task for running a script.
+
+Classes:
+    - ManageCondaKernel: Class for managing conda kernels.
+    - MarkdownJinjaEngine: Class for using the Jinja Engine to run notebooks.
+""" + import os -import shutil -from glob import glob -import pathlib -import subprocess -import json import sys -import yaml +from pathlib import Path +import warnings import jupyter_client import papermill as pm import ploomber from papermill.engines import NBClientEngine from jinja2 import Template -import dask -from pathlib import Path -import warnings +import yaml + +class MarkdownJinjaEngine(NBClientEngine): + """Class for using the Jinja Engine to run notebooks""" -class md_jinja_engine(NBClientEngine): @classmethod def execute_managed_notebook(cls, nb_man, kernel_name, **kwargs): + """Execute notebooks with papermill execution engine""" jinja_data = {} if "jinja_data" not in kwargs else kwargs["jinja_data"] # call the papermill execution engine: @@ -30,6 +43,7 @@ def execute_managed_notebook(cls, nb_man, kernel_name, **kwargs): def get_control_dict(config_path): + """Get control dictionary from configuration file""" try: with open(config_path, "r") as fid: control = yaml.safe_load(fid) @@ -40,27 +54,38 @@ def get_control_dict(config_path): default_kernel_name = control["computation_config"].pop("default_kernel_name", None) control["env_check"] = dict() - + if "compute_notebooks" in control: for nb_category in control["compute_notebooks"].values(): + # pylint: disable=invalid-name for nb, info in nb_category.items(): info["kernel_name"] = info.get("kernel_name", default_kernel_name) if info["kernel_name"] is None: info["kernel_name"] = "cupid-analysis" - warnings.warn(f"No conda environment specified for {nb}.ipynb and no default kernel set, will use cupid-analysis environment.") + warnings.warn( + f"No conda environment specified for {nb}.ipynb and no default kernel set, will use cupid-analysis environment." + ) if info["kernel_name"] not in control["env_check"]: - control["env_check"][info["kernel_name"]] = info["kernel_name"] in jupyter_client.kernelspec.find_kernel_specs() - + control["env_check"][info["kernel_name"]] = ( + info["kernel_name"] + in jupyter_client.kernelspec.find_kernel_specs() + ) + if "compute_scripts" in control: for script_category in control["compute_scripts"].values(): for script, info in script_category.items(): info["kernel_name"] = info.get("kernel_name", default_kernel_name) if info["kernel_name"] is None: info["kernel_name"] = "cupid-analysis" - warnings.warn(f"No environment specified for {script}.py and no default kernel set, will use cupid-analysis environment.") + warnings.warn( + f"No environment specified for {script}.py and no default kernel set, will use cupid-analysis environment." 
+ ) if info["kernel_name"] not in control["env_check"]: - control["env_check"][info["kernel_name"]] = info["kernel_name"] in jupyter_client.kernelspec.find_kernel_specs() - + control["env_check"][info["kernel_name"]] = ( + info["kernel_name"] + in jupyter_client.kernelspec.find_kernel_specs() + ) + return control @@ -70,20 +95,19 @@ def setup_book(config_path): control = get_control_dict(config_path) # ensure directory - run_dir = os.path.expanduser(control['data_sources']["run_dir"]) + run_dir = os.path.expanduser(control["data_sources"]["run_dir"]) output_root = run_dir + "/computed_notebooks" - + os.makedirs(output_root, exist_ok=True) - + output_dir = f'{output_root}/{control["data_sources"]["sname"]}' - + os.makedirs(output_dir, exist_ok=True) - + # create temp catalog directory temp_data_path = run_dir + "/temp_data" - + os.makedirs(temp_data_path, exist_ok=True) - # write table of contents file toc = control["book_toc"] @@ -91,9 +115,9 @@ def setup_book(config_path): yaml.dump(toc, fid, sort_keys=False) # read config defaults - + path_to_here = os.path.dirname(os.path.realpath(__file__)) - + with open(f"{path_to_here}/_jupyter-book-config-defaults.yml", "r") as fid: config = yaml.safe_load(fid) @@ -105,11 +129,14 @@ def setup_book(config_path): yaml.dump(config, fid, sort_keys=False) return None - -def create_ploomber_nb_task(nb, info, cat_path, nb_path_root, output_dir, global_params, dag, dependency=None): + + +def create_ploomber_nb_task( + nb, info, cat_path, nb_path_root, output_dir, global_params, dag, dependency=None +): """ Creates a ploomber task for running a notebook, including necessary parameters. - + Args: nb: key from dict of notebooks info: various specifications for the notebook, originally from config.yml @@ -119,118 +146,126 @@ def create_ploomber_nb_task(nb, info, cat_path, nb_path_root, output_dir, global global_params: global parameters from config.yml dag: ploomber DAG to add the task to dependency: what the upstream task is - + Returns: task: ploomber task object """ - parameter_groups = info['parameter_groups'] + parameter_groups = info["parameter_groups"] ### passing in subset kwargs if they're provided - if 'subset' in info: - subset_kwargs = info['subset'] + if "subset" in info: + subset_kwargs = info["subset"] else: subset_kwargs = {} default_params = {} - if 'default_params' in info: - default_params = info['default_params'] + if "default_params" in info: + default_params = info["default_params"] for key, parms in parameter_groups.items(): + input_path = f"{nb_path_root}/{nb}.ipynb" + output_name = f"{nb}-{key}" if key != "none" else f"{nb}" - input_path = f'{nb_path_root}/{nb}.ipynb' - output_name = ( - f'{nb}-{key}' - if key != 'none' else f'{nb}' - ) + output_path = f"{output_dir}/{output_name}" - output_path = f'{output_dir}/{output_name}' - ### all of these things should be optional parms_in = dict(**default_params) parms_in.update(**global_params) parms_in.update(dict(**parms)) - - parms_in['subset_kwargs'] = subset_kwargs - - if cat_path != None: - parms_in['path_to_cat'] = cat_path - - + + parms_in["subset_kwargs"] = subset_kwargs + + if cat_path is not None: + parms_in["path_to_cat"] = cat_path + pm_params = { - 'engine_name': 'md_jinja', - 'jinja_data': parms, - 'cwd': nb_path_root} - - pm.engines.papermill_engines._engines["md_jinja"] = md_jinja_engine - - task = ploomber.tasks.NotebookRunner(Path(input_path), ploomber.products.File(output_path + '.ipynb'), dag, params=parms_in, papermill_params=pm_params, kernelspec_name=info['kernel_name'], 
name=output_name) - - if dependency != None: + "engine_name": "md_jinja", + "jinja_data": parms, + "cwd": nb_path_root, + } + + pm.engines.papermill_engines._engines["md_jinja"] = MarkdownJinjaEngine + + task = ploomber.tasks.NotebookRunner( + Path(input_path), + ploomber.products.File(output_path + ".ipynb"), + dag, + params=parms_in, + papermill_params=pm_params, + kernelspec_name=info["kernel_name"], + name=output_name, + ) + + if dependency: raise NotImplementedError - # set DAG dependency here + # set DAG dependency here # something with task.set_upstream(other_task?) - + return task -def create_ploomber_script_task(script, info, cat_path, nb_path_root, global_params, dag, dependency=None): + +def create_ploomber_script_task( + script, info, cat_path, nb_path_root, global_params, dag, dependency=None +): """ - Creates a ploomber task for running a script, including necessary parameters. - - UPDATE THIS DOCSTRING - + Creates a Ploomber task for running a script, including necessary parameters. + Args: - script: key from dict of scripts - info: various specifications for the notebook, originally from config.yml - use_catalog: bool specified earlier, specifying if whole collection uses a catalog or not - nb_path_root: from config.yml, path to folder containing template notebooks - global_params: global parameters from config.yml - dag: ploomber DAG to add the task to - dependency: what the upstream task is - + script (str): The key from the dictionary of scripts. + info (dict): Various specifications for the notebook, originally from config.yml. + cat_path (str or None): Path to the catalog file if using a catalog, otherwise None. + nb_path_root (str): Path to the folder containing template notebooks from config.yml. + global_params (dict): Global parameters from config.yml. + dag (ploomber.DAG): Ploomber DAG to add the task to. + dependency (ploomber.Task, optional): The upstream task. Defaults to None. + Returns: - task: ploomber task object + ploomber.Task: The Ploomber task object. + + Raises: + NotImplementedError: Raised if dependency is not None (setting DAG dependency is not implemented yet). 
""" - parameter_groups = info['parameter_groups'] + parameter_groups = info["parameter_groups"] ### passing in subset kwargs if they're provided - if 'subset' in info: - subset_kwargs = info['subset'] + if "subset" in info: + subset_kwargs = info["subset"] else: subset_kwargs = {} default_params = {} - if 'default_params' in info: - default_params = info['default_params'] + if "default_params" in info: + default_params = info["default_params"] for key, parms in parameter_groups.items(): + input_path = f"{nb_path_root}/{script}.py" + output_name = f"{script}-{key}" if key != "none" else f"{script}" - input_path = f'{nb_path_root}/{script}.py' - output_name = ( - f'{script}-{key}' - if key != 'none' else f'{script}' - ) + # output_path = f"{output_dir}/{output_name}" - #output_path = f'{output_dir}/{output_name}' - ### all of these things should be optional parms_in = dict(**default_params) parms_in.update(**global_params) parms_in.update(dict(**parms)) - - parms_in['subset_kwargs'] = subset_kwargs - - if cat_path != None: - parms_in['path_to_cat'] = cat_path - - - - task = ploomber.tasks.ScriptRunner(Path(input_path), ploomber.products.File(info['product']), dag, params=parms_in, name=output_name) - - if dependency != None: + + parms_in["subset_kwargs"] = subset_kwargs + + if cat_path is not None: + parms_in["path_to_cat"] = cat_path + + task = ploomber.tasks.ScriptRunner( + Path(input_path), + ploomber.products.File(info["product"]), + dag, + params=parms_in, + name=output_name, + ) + + if dependency is not None: raise NotImplementedError - # set DAG dependency here + # set DAG dependency here # something with task.set_upstream(other_task?) - - return task \ No newline at end of file + + return task diff --git a/docs/addingnotebookstocollection.md b/docs/addingnotebookstocollection.md index 0034efa..bf54ddd 100644 --- a/docs/addingnotebookstocollection.md +++ b/docs/addingnotebookstocollection.md @@ -40,4 +40,4 @@ Generally, a good fit for a diagnostic notebook is one that reads in CESM output 7. Update your parameters. Parameters that are specific to just this notebook should go under `parameter_groups` in the notebook's entry under `compute_notebooks`. Global parameters that you want passed in to every notebook in the collection should go under `global_params`. When `CUPiD` executes your notebook, all of these parameters will get put in a new cell below the cell tagged `parameters` that you added in step 3. This means they will supercede the values of the parameters that you put in the cell above---the names, notation, etc. should match to make sure your notebook is able to find the variables it needs. -8. All set! Your collection can now be run and built with `cupid-run config.yml` and `cupid-build config.yml` as usual. +8. All set! Your collection can now be run and built with `cupid-run` and `cupid-build` as usual. diff --git a/docs/conf.py b/docs/conf.py index 9b028eb..39514e0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,7 +14,7 @@ import datetime import re -sys.path.insert(0, os.path.abspath('../..')) +sys.path.insert(0, os.path.abspath("../..")) print("sys.path:", sys.path) @@ -22,30 +22,29 @@ # This block allows us to remove the header image from any md files # without affecting the original version, but still pull the source # into the docs build fresh each time. 
-for file in ['README.md', 'NCAR_tips.md']: - os.system(f'cp ../{file} ./') +for file in ["README.md", "NCAR_tips.md"]: + os.system(f"cp ../{file} ./") - # Remove any images from the first line of the file - with open(file, 'r') as f: - file1 = f.readline() - file1 = re.sub(' ', '', file1) - file_rest = f.read() + # Remove any images from the first line of the file + with open(file, "r") as f: + file1 = f.readline() + file1 = re.sub(" ", "", file1) + file_rest = f.read() - with open(file, 'w') as f: - f.write(file1+file_rest) + with open(file, "w") as f: + f.write(file1 + file_rest) # -- Project information ----------------------------------------------------- -project = 'CUPiD' +project = "CUPiD" current_year = datetime.datetime.now().year -copyright = u'{}, University Corporation for Atmospheric Research'.format( - current_year) +copyright = "{}, University Corporation for Atmospheric Research".format(current_year) -author = 'NSF NCAR' +author = "NSF NCAR" # The master toctree document. -master_doc = 'index' +master_doc = "index" # -- General configuration --------------------------------------------------- @@ -55,41 +54,41 @@ # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.napoleon', - 'sphinx.ext.autosummary', - 'sphinx.ext.intersphinx', - 'myst_nb', + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", + "myst_nb", "sphinx_design", "nbsphinx", ] intersphinx_mapping = { - 'dask': ('https://docs.dask.org/en/latest/', None), - 'python': ('https://docs.python.org/3/', None), - 'numpy': ("https://numpy.org/doc/stable", None), - 'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None), - 'xarray': ('http://xarray.pydata.org/en/stable/', None), - 'pint': ('https://pint.readthedocs.io/en/stable/', None), - 'cftime': ('https://unidata.github.io/cftime/', None), + "dask": ("https://docs.dask.org/en/latest/", None), + "python": ("https://docs.python.org/3/", None), + "numpy": ("https://numpy.org/doc/stable", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), + "xarray": ("http://xarray.pydata.org/en/stable/", None), + "pint": ("https://pint.readthedocs.io/en/stable/", None), + "cftime": ("https://unidata.github.io/cftime/", None), } autosummary_generate = True # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['**.ipynb_checkpoints'] +exclude_patterns = ["**.ipynb_checkpoints"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] source_suffix = { - '.rst': 'restructuredtext', - '.ipynb': 'myst-nb', + ".rst": "restructuredtext", + ".ipynb": "myst-nb", } @@ -116,16 +115,15 @@ use_repository_button=True, use_issues_button=True, home_page_in_toc=True, - extra_footer= - "The National Center for Atmospheric Research is sponsored by the National Science Foundation. Any opinions, findings and conclusions or recommendations expressed in this material do not necessarily reflect the views of the National Science Foundation.", + extra_footer="The National Center for Atmospheric Research is sponsored by the National Science Foundation. 
Any opinions, findings and conclusions or recommendations expressed in this material do not necessarily reflect the views of the National Science Foundation.", ) # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] -html_logo = '_static/images/logos/logo.png' -html_favicon = '_static/images/logos/logo.png' +html_logo = "_static/images/logos/logo.png" +html_favicon = "_static/images/logos/logo.png" -autoclass_content = 'both' +autoclass_content = "both"
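
For quick reference, the command-line usage described in the README changes above reduces to calling the entry points with their default configuration path. A minimal sketch, assuming the `cupid-dev` environment and the CUPiD entry points are installed, that a `config.yml` exists in the working directory, and that `my_config.yml` is a hypothetical alternate configuration file:

``` bash
# Run diagnostics, build the Jupyter Book, and clean up, all using the
# default config.yml in the current directory.
conda activate cupid-dev
cd examples/coupled_model
cupid-run        # run all notebooks/scripts listed in config.yml
cupid-build      # build HTML from the computed notebooks
cupid-clear      # delete the computed_notebooks folder under run_dir

# The config path is a positional click argument, so an alternate file
# can still be passed explicitly (my_config.yml is hypothetical):
cupid-run my_config.yml
```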