From b5403962f6ea9cf7e8df5e052c817d3b52dfeee5 Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Tue, 22 Nov 2022 02:06:39 +0000 Subject: [PATCH 01/18] Increase precision of degs_per_radian to 15 digits. --- ush/constants.yaml | 2 +- ush/python_utils/config_parser.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/ush/constants.yaml b/ush/constants.yaml index c4a3fa9d67..0ac61f5318 100644 --- a/ush/constants.yaml +++ b/ush/constants.yaml @@ -7,7 +7,7 @@ constants: #----------------------------------------------------------------------- # PI_GEOM: 3.14159265358979323846264338327 - DEGS_PER_RADIAN: 57.2957795131 + DEGS_PER_RADIAN: 57.29577951308232087679 RADIUS_EARTH: 6371200.0 # #----------------------------------------------------------------------- diff --git a/ush/python_utils/config_parser.py b/ush/python_utils/config_parser.py index aa9d04aed3..c09ff8c9c5 100644 --- a/ush/python_utils/config_parser.py +++ b/ush/python_utils/config_parser.py @@ -26,7 +26,7 @@ except ModuleNotFoundError: pass # The rest of the formats: JSON/SHELL/INI/XML do not need -# external pakcages +# external packages import json import os import re @@ -398,8 +398,6 @@ def cfg_to_xml_str(cfg): ################## # CONFIG utils ################## - - def flatten_dict(dictionary, keys=None): """Flatten a recursive dictionary (e.g.yaml/json) to be one level deep From fc9f57ab2a9682bb42b2e5b38665d29b81646dbd Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Tue, 22 Nov 2022 02:50:22 +0000 Subject: [PATCH 02/18] Use generic date util. 
--- .../config.get_from_NOMADS_ics_FV3GFS_lbcs_FV3GFS.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/WE2E/test_configs/wflow_features/config.get_from_NOMADS_ics_FV3GFS_lbcs_FV3GFS.yaml b/tests/WE2E/test_configs/wflow_features/config.get_from_NOMADS_ics_FV3GFS_lbcs_FV3GFS.yaml index 7b30698ef2..38fbbe5af6 100644 --- a/tests/WE2E/test_configs/wflow_features/config.get_from_NOMADS_ics_FV3GFS_lbcs_FV3GFS.yaml +++ b/tests/WE2E/test_configs/wflow_features/config.get_from_NOMADS_ics_FV3GFS_lbcs_FV3GFS.yaml @@ -10,8 +10,8 @@ platform: workflow: CCPP_PHYS_SUITE: FV3_GFS_2017_gfdlmp PREDEF_GRID_NAME: RRFS_CONUS_25km - DATE_FIRST_CYCL: date --utc --date="2 days ago" +%Y%m%d00 - DATE_LAST_CYCL: date --utc --date="2 days ago" +%Y%m%d00 + DATE_FIRST_CYCL: $DATE_UTIL --utc --date="2 days ago" +%Y%m%d00 + DATE_LAST_CYCL: $DATE_UTIL --utc --date="2 days ago" +%Y%m%d00 FCST_LEN_HRS: 6 PREEXISTING_DIR_METHOD: rename task_get_extrn_ics: From 381856c95a61cc8def514dd80899bb438f380fcb Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Tue, 6 Dec 2022 15:32:46 -0700 Subject: [PATCH 03/18] Add fake slurm commands for rocoto usage on linux. 
--- ush/rocoto_fake_slurm/sacct | 33 +++++++++++++++++++++++++++ ush/rocoto_fake_slurm/sbatch | 43 +++++++++++++++++++++++++++++++++++ ush/rocoto_fake_slurm/scancel | 4 ++++ ush/rocoto_fake_slurm/sinfo | 4 ++++ ush/rocoto_fake_slurm/squeue | 34 +++++++++++++++++++++++++++ ush/rocoto_fake_slurm/srun | 26 +++++++++++++++++++++ 6 files changed, 144 insertions(+) create mode 100755 ush/rocoto_fake_slurm/sacct create mode 100755 ush/rocoto_fake_slurm/sbatch create mode 100755 ush/rocoto_fake_slurm/scancel create mode 100755 ush/rocoto_fake_slurm/sinfo create mode 100755 ush/rocoto_fake_slurm/squeue create mode 100755 ush/rocoto_fake_slurm/srun diff --git a/ush/rocoto_fake_slurm/sacct b/ush/rocoto_fake_slurm/sacct new file mode 100755 index 0000000000..a9fe0a4107 --- /dev/null +++ b/ush/rocoto_fake_slurm/sacct @@ -0,0 +1,33 @@ +#!/bin/bash + +# Emulates slurm's sacct +if [[ "$1" = *"--jobs="* ]]; then + PIDS="${1:7}" +elif [[ -f .job_database ]]; then + PIDS=$(cat .job_database | grep submitted | sort -u -k1,1 | awk '{print $3}') +fi +PIDS="${PIDS//,/' '}" + +# Output info the way rocoto calls sacct +echo "JobID|User|JobName|Partition|Priority|Submit|Start|End|NCPUS|ExitCode|State" +for pid in ${PIDS}; do + t_sub=$(date --utc +%Y-%m-%d:%H:%M:%S) + t_start=$t_sub + t_end=$t_sub + name=$pid + + v=$(cat .job_database | grep "pid $pid submitted" | awk '{print $1" "$5}') + if [ ! -z "$v" ]; then + read name t_sub <<< "$v" + v=$(cat .job_database | grep "pid $pid exitcode" | awk '{print $5" "$7" "$9}') + if [ ! 
-z "$v" ]; then + read v t_start t_end <<< "$v" + fi + fi + + if [ -z "$v" ]; then + echo "$pid|$USER|$name|linux|$t_sub|$t_start|$t_end|0.1|0|RUNNING" + else + echo "$pid|$USER|$name|linux|$t_sub|$t_start|$t_end|0.1|$v|COMPLETED" + fi +done diff --git a/ush/rocoto_fake_slurm/sbatch b/ush/rocoto_fake_slurm/sbatch new file mode 100755 index 0000000000..dcf43e45ee --- /dev/null +++ b/ush/rocoto_fake_slurm/sbatch @@ -0,0 +1,43 @@ +#!/bin/bash + +# Emulates slurm's sbatch + +FD=${1:-/dev/stdin} + +#parse log file +LOG=`grep "#SBATCH -o" $FD | awk '{ print $3 }'` +if [ -z "$LOG" ]; then + LOG=/dev/null +fi + +#parse time +TIM=`grep "#SBATCH -t" $FD | awk '{ print $3 }'` +if [ -z "$TIM" ]; then + CTIM= +else + SECS=`echo $TIM | awk 'BEGIN { FS = ":" } ; { secs = $1 * 3600 + $2 * 60 + $3; print secs };'` + CTIM="timeout ${SECS}s" +fi + +#parse job name +JOBNAME=`grep "#SBATCH --job-name" $FD | awk 'BEGIN { FS = "=" }; { print $2 }'` +if [ -z "$JOBNAME" ]; then + JOBNAME="default" +fi + +#command +CMD="`cat $FD`" + +#execute job in background +bash -c "\ + ds=\$(date --utc +%Y-%m-%d:%H:%M:%S); \ + ${CTIM} ${CMD} &>$LOG; \ + excode=\$(tail -n 1 $LOG | awk '{print \$14}'); \ + de=\$(date --utc +%Y-%m-%d:%H:%M:%S); \ + echo $JOBNAME pid \$$ exitcode \$excode started \$ds ended \$de >>.job_database;" & + +#submission info +pid=$! 
+dsub=$(date --utc +%Y-%m-%d:%H:%M:%S) +echo $JOBNAME pid $pid submitted $dsub >>.job_database +echo "Submitted batch job "$pid diff --git a/ush/rocoto_fake_slurm/scancel b/ush/rocoto_fake_slurm/scancel new file mode 100755 index 0000000000..94575f48c3 --- /dev/null +++ b/ush/rocoto_fake_slurm/scancel @@ -0,0 +1,4 @@ +#!/bin/bash + +# Emulates slurm's scancel +exec kill -9 -$1 diff --git a/ush/rocoto_fake_slurm/sinfo b/ush/rocoto_fake_slurm/sinfo new file mode 100755 index 0000000000..cde2d07bc7 --- /dev/null +++ b/ush/rocoto_fake_slurm/sinfo @@ -0,0 +1,4 @@ +#!/bin/bash + +# Emulates slurm's sinfo +exec lscpu diff --git a/ush/rocoto_fake_slurm/squeue b/ush/rocoto_fake_slurm/squeue new file mode 100755 index 0000000000..90c1d64065 --- /dev/null +++ b/ush/rocoto_fake_slurm/squeue @@ -0,0 +1,34 @@ +#!/bin/bash + +# Emulates slurm's squeue +if [[ "$1" = *"--jobs="* ]]; then + PIDS="${1:7}" +elif [[ -f .job_database ]]; then + PIDS=$(cat .job_database | grep submitted | sort -u -k1,1 | awk '{print $3}') +fi +PIDS="${PIDS//,/' '}" + +# Output info the way rocoto calls squeue +FMT="%-40s%-40s%-10s%-20s%-30s%-30s%-30s%-30s%-10s%-30s%-200s\n" +printf "$FMT" JOBID USER CPUS PARTITION SUBMIT_TIME START_TIME END_TIME PRIORITY EXIT_CODE STATE NAME +for pid in ${PIDS}; do + t_sub=$(date --utc +%Y-%m-%d:%H:%M:%S) + t_start=$t_sub + t_end=$t_sub + name=$pid + + v=$(cat .job_database | grep "pid $pid submitted" | awk '{print $1" "$5}') + if [ ! -z "$v" ]; then + read name t_sub <<< "$v" + v=$(cat .job_database | grep "pid $pid exitcode" | awk '{print $5" "$7" "$9}') + if [ ! 
-z "$v" ]; then + read v t_start t_end <<< "$v" + fi + fi + + if [ -z "$v" ]; then + printf "$FMT" $pid $USER 1 linux $t_sub $t_start $t_end 0.1 0 RUNNING $name + else + printf "$FMT" $pid $USER 1 linux $t_sub $t_start $t_end 0.1 $v COMPLETED $name + fi +done diff --git a/ush/rocoto_fake_slurm/srun b/ush/rocoto_fake_slurm/srun new file mode 100755 index 0000000000..b5387a3d2d --- /dev/null +++ b/ush/rocoto_fake_slurm/srun @@ -0,0 +1,26 @@ +#!/bin/bash + +# Emulates slurm's srun +OPTS="" +CMDS="" + +# Extract only --ntasks from options +while (( "$#" )); do + case "$1" in + -n|--ntasks) + OPTS="$OPTS -n $2" + shift 2 + ;; + --ntasks=?*) + OPTS="$OPTS -n ${1:9}" + shift 1 + ;; + *) + CMDS="$CMDS $1" + shift + ;; + esac +done + +# Run with mpirun +exec mpirun ${OPTS} ${CMDS} From 032c68ace92e2f1ec3b8ee10b88799b3e11237a8 Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Tue, 6 Dec 2022 15:50:43 -0700 Subject: [PATCH 04/18] Modify machine files for linux and mac. --- ush/machine/linux.yaml | 16 ++++++++++++++-- ush/machine/macos.yaml | 15 +++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/ush/machine/linux.yaml b/ush/machine/linux.yaml index db9749aa73..d0b2b2cbc5 100644 --- a/ush/machine/linux.yaml +++ b/ush/machine/linux.yaml @@ -1,18 +1,30 @@ platform: - WORKFLOW_MANAGER: none + WORKFLOW_MANAGER: rocoto NCORES_PER_NODE: 8 - SCHED: none + SCHED: slurm + CCPA_OBS_DIR: /home/username/DATA/UFS/obs_data/ccpa/proc + MRMS_OBS_DIR: /home/username/DATA/UFS/obs_data/mrms/proc + NDAS_OBS_DIR: /home/username/DATA/UFS/obs_data/ndas/proc + METPLUS_PATH: "" + MET_BIN_EXEC: bin + MET_INSTALL_DIR: "" + DOMAIN_PREGEN_BASEDIR: /home/username/DATA/UFS/FV3LAM_pregen RUN_CMD_FCST: 'mpirun -n ${PE_MEMBER01} ' RUN_CMD_POST: 'mpirun -n 4 ' RUN_CMD_SERIAL: time RUN_CMD_UTILS: mpirun -n 4 PRE_TASK_CMDS: '{ ulimit -a; }' + TEST_EXTRN_MDL_SOURCE_BASEDIR: /home/username/DATA/UFS/input_model_data + TEST_PREGEN_BASEDIR: /home/username/DATA/UFS/FV3LAM_pregen + 
TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS: /home/username/DATA/UFS/dummy_FV3GFS_sys_dir + TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS: /home/username/DATA/UFS/dummy_FV3GFS_sys_dir FIXaer: /home/username/DATA/UFS/fix/fix_aer FIXgsm: /home/username/DATA/UFS/fix/fix_am FIXlut: /home/username/DATA/UFS/fix/fix_lut FIXorg: /home/username/DATA/UFS/fix/fix_orog FIXsfc: /home/username/DATA/UFS/fix/fix_sfc_climo FIXshp: /home/username/DATA/UFS/NaturalEarth + EXTRN_MDL_DATA_STORES: aws nomads disk data: ics_lbcs: FV3GFS: /home/username/DATA/UFS/FV3GFS diff --git a/ush/machine/macos.yaml b/ush/machine/macos.yaml index 7d16746174..3151741a75 100644 --- a/ush/machine/macos.yaml +++ b/ush/machine/macos.yaml @@ -1,12 +1,23 @@ platform: - WORKFLOW_MANAGER: none + WORKFLOW_MANAGER: rocoto NCORES_PER_NODE: 8 - SCHED: none + SCHED: slurm + CCPA_OBS_DIR: /Users/username/DATA/UFS/obs_data/ccpa/proc + MRMS_OBS_DIR: /Users/username/DATA/UFS/obs_data/mrms/proc + NDAS_OBS_DIR: /Users/username/DATA/UFS/obs_data/ndas/proc + DOMAIN_PREGEN_BASEDIR: /Users/username/DATA/UFS/FV3LAM_pregen + METPLUS_PATH: "" + MET_BIN_EXEC: bin + MET_INSTALL_DIR: "" RUN_CMD_FCST: 'mpirun -n ${PE_MEMBER01} ' RUN_CMD_POST: 'mpirun -n 4 ' RUN_CMD_SERIAL: time RUN_CMD_UTILS: mpirun -n 4 PRE_TASK_CMDS: '{ ulimit -a; }' + TEST_EXTRN_MDL_SOURCE_BASEDIR: /Users/username/DATA/UFS/input_model_data + TEST_PREGEN_BASEDIR: /Users/username/DATA/UFS/FV3LAM_pregen + TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS: /Users/username/DATA/UFS/dummy_FV3GFS_sys_dir + TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS: /Users/username/DATA/UFS/dummy_FV3GFS_sys_dir FIXaer: /Users/username/DATA/UFS/fix/fix_aer FIXgsm: /Users/username/DATA/UFS/fix/fix_am FIXlut: /Users/username/DATA/UFS/fix/fix_lut From 6d99a872eb0da30b3d49c613a61d7bc377a69eba Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Tue, 6 Dec 2022 19:10:12 -0700 Subject: [PATCH 05/18] Modify linux and macos wflow modules. 
--- modulefiles/wflow_linux.lua | 33 ++++++++++++++++++++++++++------- modulefiles/wflow_macos.lua | 33 ++++++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/modulefiles/wflow_linux.lua b/modulefiles/wflow_linux.lua index dd9aec7afa..6c4cc6949d 100644 --- a/modulefiles/wflow_linux.lua +++ b/modulefiles/wflow_linux.lua @@ -5,16 +5,35 @@ This module sets a path to activate conda environment needed for running the UFS whatis([===[This module sets a path for conda environment needed for running the UFS SRW App on Linux]===]) setenv("CMAKE_Platform", "linux") -setenv("VENV", pathJoin(os.getenv("HOME"), "condaenv/envs/regional_workflow")) ---[[ -local ROCOTOmod="/Users/username/modules" -prepend_path("MODULEPATH", ROCOTOmod) -load(rocoto) ---]] +-- Conda initialization function +function init_conda(conda_path) + local shell=myShellType() + local conda_file + if shell == "csh" then + conda_file=pathJoin(conda_path,"etc/profile.d/conda.csh") + else + conda_file=pathJoin(conda_path,"etc/profile.d/conda.sh") + end + local mcmd="source " .. 
conda_file + execute{cmd=mcmd, modeA={"load"}} +end + +-- initialize conda +local conda_path="/home/username/miniconda3" +init_conda(conda_path) + +-- add rocoto to path +local rocoto_path="/home/username/rocoto" +prepend_path("PATH", pathJoin(rocoto_path,"bin")) + +-- add fake slurm commands +local srw_path="/home/username/ufs-srweather-app" +prepend_path("PATH", pathJoin(srw_path, "ush/rocoto_fake_slurm")) +-- display conda activation message if mode() == "load" then LmodMsgRaw([===[Please do the following to activate conda: - > conda activate $VENV + > conda activate regional_workflow ]===]) end diff --git a/modulefiles/wflow_macos.lua b/modulefiles/wflow_macos.lua index 769f1bc05e..d7cf30e0a3 100644 --- a/modulefiles/wflow_macos.lua +++ b/modulefiles/wflow_macos.lua @@ -5,17 +5,36 @@ This module set a path needed to activate conda environement for running UFS SRW whatis([===[This module activates conda environment for running the UFS SRW App on macOS]===]) setenv("CMAKE_Platform", "macos") -setenv("VENV", pathJoin(os.getenv("HOME"), "condaenv/envs/regional_workflow")) ---[[ -local ROCOTOmod="/Users/username/modules" -prepend_path("MODULEPATH", ROCOTOmod) -load(rocoto) ---]] +-- Conda initialization function +function init_conda(conda_path) + local shell=myShellType() + local conda_file + if shell == "csh" then + conda_file=pathJoin(conda_path,"etc/profile.d/conda.csh") + else + conda_file=pathJoin(conda_path,"etc/profile.d/conda.sh") + end + local mcmd="source " .. 
conda_file + execute{cmd=mcmd, modeA={"load"}} +end + +-- initialize conda +local conda_path="/Users/username/miniconda3" +init_conda(conda_path) + +-- add rocoto to path +local rocoto_path="/Users/username/rocoto" +prepend_path("PATH", pathJoin(rocoto_path,"bin")) + +-- add fake slurm commands +local srw_path="/Users/username/ufs-srweather-app" +prepend_path("PATH", pathJoin(srw_path, "ush/rocoto_fake_slurm")) +-- display conda activation message if mode() == "load" then LmodMsgRaw([===[Please do the following to activate conda virtual environment: - > conda activate $VENV " + > conda activate regional_workflow" ]===]) end From 87e66dc4d738a85feb0686188e9430a2f7d995cc Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Tue, 6 Dec 2022 22:22:18 -0700 Subject: [PATCH 06/18] Fix unittest. --- ush/set_gridparams_ESGgrid.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ush/set_gridparams_ESGgrid.py b/ush/set_gridparams_ESGgrid.py index 3346fe0bfd..6f696b04d4 100644 --- a/ush/set_gridparams_ESGgrid.py +++ b/ush/set_gridparams_ESGgrid.py @@ -104,8 +104,8 @@ def test_set_gridparams_ESGgrid(self): 0.0, 6, 0.999, - 0.013489400626200717, - 0.013489400626200717, + 0.013489400626196555, + 0.013489400626196555, -1760, -1050, ], From 09ffb7c57ed3948721ce724542bf3dc1ed44e6a7 Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Wed, 7 Dec 2022 10:05:28 -0700 Subject: [PATCH 07/18] Remove openmpi module loading in linux/mac build modulefile. 
--- modulefiles/build_linux_gnu.lua | 1 - modulefiles/build_macos_gnu.lua | 1 - 2 files changed, 2 deletions(-) diff --git a/modulefiles/build_linux_gnu.lua b/modulefiles/build_linux_gnu.lua index 7f40d763de..cc5f6831f1 100644 --- a/modulefiles/build_linux_gnu.lua +++ b/modulefiles/build_linux_gnu.lua @@ -14,7 +14,6 @@ load("hpc") load("hpc-python") load("hpc-gnu") -load("openmpi") load("hpc-openmpi") load("srw_common") diff --git a/modulefiles/build_macos_gnu.lua b/modulefiles/build_macos_gnu.lua index d92de9a4da..36c8b80cdb 100644 --- a/modulefiles/build_macos_gnu.lua +++ b/modulefiles/build_macos_gnu.lua @@ -18,7 +18,6 @@ load("hpc") load("hpc-python") load("hpc-gnu") -load("openmpi") load("hpc-openmpi") load("srw_common") From 5c1a601d309e0670309805318eb0e771846358ea Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Wed, 7 Dec 2022 11:47:36 -0700 Subject: [PATCH 08/18] Fix sacct. --- ush/rocoto_fake_slurm/sacct | 7 ++++--- ush/rocoto_fake_slurm/squeue | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ush/rocoto_fake_slurm/sacct b/ush/rocoto_fake_slurm/sacct index a9fe0a4107..818765d990 100755 --- a/ush/rocoto_fake_slurm/sacct +++ b/ush/rocoto_fake_slurm/sacct @@ -1,7 +1,7 @@ #!/bin/bash # Emulates slurm's sacct -if [[ "$1" = *"--jobs="* ]]; then +if [[ "$1" = "--jobs="* ]]; then PIDS="${1:7}" elif [[ -f .job_database ]]; then PIDS=$(cat .job_database | grep submitted | sort -u -k1,1 | awk '{print $3}') @@ -9,6 +9,7 @@ fi PIDS="${PIDS//,/' '}" # Output info the way rocoto calls sacct +FMT="%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n" echo "JobID|User|JobName|Partition|Priority|Submit|Start|End|NCPUS|ExitCode|State" for pid in ${PIDS}; do t_sub=$(date --utc +%Y-%m-%d:%H:%M:%S) @@ -26,8 +27,8 @@ for pid in ${PIDS}; do fi if [ -z "$v" ]; then - echo "$pid|$USER|$name|linux|$t_sub|$t_start|$t_end|0.1|0|RUNNING" + printf "$FMT" $pid ${USER:0:30} ${name:0:30} linux 0.1 $t_sub $t_start $t_end 1 0 RUNNING else - echo 
"$pid|$USER|$name|linux|$t_sub|$t_start|$t_end|0.1|$v|COMPLETED" + printf "$FMT" $pid ${USRE:0:30} ${name:0:30} linux 0.1 $t_sub $t_start $t_end 1 $v COMPLETED fi done diff --git a/ush/rocoto_fake_slurm/squeue b/ush/rocoto_fake_slurm/squeue index 90c1d64065..ccd37ce7f1 100755 --- a/ush/rocoto_fake_slurm/squeue +++ b/ush/rocoto_fake_slurm/squeue @@ -1,7 +1,7 @@ #!/bin/bash # Emulates slurm's squeue -if [[ "$1" = *"--jobs="* ]]; then +if [[ "$1" = "--jobs="* ]]; then PIDS="${1:7}" elif [[ -f .job_database ]]; then PIDS=$(cat .job_database | grep submitted | sort -u -k1,1 | awk '{print $3}') From f884c239d6ffac5e378d97d24aea26fa379fbeca Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Wed, 7 Dec 2022 12:50:53 -0700 Subject: [PATCH 09/18] Fix crontab unspecified USER issue. --- ush/rocoto_fake_slurm/sacct | 5 +++-- ush/rocoto_fake_slurm/squeue | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ush/rocoto_fake_slurm/sacct b/ush/rocoto_fake_slurm/sacct index 818765d990..fff885b03f 100755 --- a/ush/rocoto_fake_slurm/sacct +++ b/ush/rocoto_fake_slurm/sacct @@ -16,6 +16,7 @@ for pid in ${PIDS}; do t_start=$t_sub t_end=$t_sub name=$pid + user=${USER:-user} v=$(cat .job_database | grep "pid $pid submitted" | awk '{print $1" "$5}') if [ ! 
-z "$v" ]; then @@ -27,8 +28,8 @@ for pid in ${PIDS}; do fi if [ -z "$v" ]; then - printf "$FMT" $pid ${USER:0:30} ${name:0:30} linux 0.1 $t_sub $t_start $t_end 1 0 RUNNING + printf "$FMT" $pid ${user:0:30} ${name:0:30} linux 0.1 $t_sub $t_start $t_end 1 0 RUNNING else - printf "$FMT" $pid ${USRE:0:30} ${name:0:30} linux 0.1 $t_sub $t_start $t_end 1 $v COMPLETED + printf "$FMT" $pid ${user:0:30} ${name:0:30} linux 0.1 $t_sub $t_start $t_end 1 $v COMPLETED fi done diff --git a/ush/rocoto_fake_slurm/squeue b/ush/rocoto_fake_slurm/squeue index ccd37ce7f1..5c18b582b2 100755 --- a/ush/rocoto_fake_slurm/squeue +++ b/ush/rocoto_fake_slurm/squeue @@ -16,6 +16,7 @@ for pid in ${PIDS}; do t_start=$t_sub t_end=$t_sub name=$pid + user=${USER:-user} v=$(cat .job_database | grep "pid $pid submitted" | awk '{print $1" "$5}') if [ ! -z "$v" ]; then @@ -27,8 +28,8 @@ for pid in ${PIDS}; do fi if [ -z "$v" ]; then - printf "$FMT" $pid $USER 1 linux $t_sub $t_start $t_end 0.1 0 RUNNING $name + printf "$FMT" $pid $user 1 linux $t_sub $t_start $t_end 0.1 0 RUNNING $name else - printf "$FMT" $pid $USER 1 linux $t_sub $t_start $t_end 0.1 $v COMPLETED $name + printf "$FMT" $pid $user 1 linux $t_sub $t_start $t_end 0.1 $v COMPLETED $name fi done From 61a26223c43b9e7e719d5e72f9342f7be1956a07 Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Wed, 7 Dec 2022 21:29:59 -0700 Subject: [PATCH 10/18] Add EXTRN_MDL_DATA_STORES to macos. 
--- ush/machine/linux.yaml | 2 +- ush/machine/macos.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ush/machine/linux.yaml b/ush/machine/linux.yaml index d0b2b2cbc5..ec63af445a 100644 --- a/ush/machine/linux.yaml +++ b/ush/machine/linux.yaml @@ -24,7 +24,7 @@ platform: FIXorg: /home/username/DATA/UFS/fix/fix_orog FIXsfc: /home/username/DATA/UFS/fix/fix_sfc_climo FIXshp: /home/username/DATA/UFS/NaturalEarth - EXTRN_MDL_DATA_STORES: aws nomads disk + EXTRN_MDL_DATA_STORES: aws nomads data: ics_lbcs: FV3GFS: /home/username/DATA/UFS/FV3GFS diff --git a/ush/machine/macos.yaml b/ush/machine/macos.yaml index 3151741a75..3a89650d8c 100644 --- a/ush/machine/macos.yaml +++ b/ush/machine/macos.yaml @@ -24,6 +24,7 @@ platform: FIXorg: /Users/username/DATA/UFS/fix/fix_orog FIXsfc: /Users/username/DATA/UFS/fix/fix_sfc_climo FIXshp: /Users/username/DATA/UFS/NaturalEarth + EXTRN_MDL_DATA_STORES: aws nomads data: ics_lbcs: FV3GFS: /Users/username/DATA/UFS/FV3GFS From 658ea2aa4a9f7fec01f6e85806c401546c4d2d35 Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Wed, 7 Dec 2022 21:23:32 -0700 Subject: [PATCH 11/18] Add more states to squeue/sacct. 
--- ush/rocoto_fake_slurm/sacct | 25 ++++++++++++++++--------- ush/rocoto_fake_slurm/sbatch | 7 ++++++- ush/rocoto_fake_slurm/squeue | 25 ++++++++++++++++--------- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/ush/rocoto_fake_slurm/sacct b/ush/rocoto_fake_slurm/sacct index fff885b03f..7b07f84720 100755 --- a/ush/rocoto_fake_slurm/sacct +++ b/ush/rocoto_fake_slurm/sacct @@ -3,33 +3,40 @@ # Emulates slurm's sacct if [[ "$1" = "--jobs="* ]]; then PIDS="${1:7}" + PIDS="${PIDS//,/' '}" elif [[ -f .job_database ]]; then PIDS=$(cat .job_database | grep submitted | sort -u -k1,1 | awk '{print $3}') fi -PIDS="${PIDS//,/' '}" # Output info the way rocoto calls sacct FMT="%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n" echo "JobID|User|JobName|Partition|Priority|Submit|Start|End|NCPUS|ExitCode|State" + for pid in ${PIDS}; do - t_sub=$(date --utc +%Y-%m-%d:%H:%M:%S) + + t_sub="N/A" t_start=$t_sub t_end=$t_sub name=$pid user=${USER:-user} + exitc=0 + state="UNKNOWN" v=$(cat .job_database | grep "pid $pid submitted" | awk '{print $1" "$5}') if [ ! -z "$v" ]; then + state="PENDING" read name t_sub <<< "$v" - v=$(cat .job_database | grep "pid $pid exitcode" | awk '{print $5" "$7" "$9}') + v=$(cat .job_database | grep "pid $pid started" | awk '{print $5" "$7}') if [ ! -z "$v" ]; then - read v t_start t_end <<< "$v" + state="RUNNING" + read t_start t_end<<< "$v" + fi + v=$(cat .job_database | grep "pid $pid ended" | awk '{print $5" "$7}') + if [ ! 
-z "$v" ]; then + state="COMPLETED" + read t_end exitc <<< "$v" fi fi - if [ -z "$v" ]; then - printf "$FMT" $pid ${user:0:30} ${name:0:30} linux 0.1 $t_sub $t_start $t_end 1 0 RUNNING - else - printf "$FMT" $pid ${user:0:30} ${name:0:30} linux 0.1 $t_sub $t_start $t_end 1 $v COMPLETED - fi + printf "$FMT" $pid ${user:0:30} ${name:0:30} linux 0.1 $t_sub $t_start $t_end 1 $exitc $state done diff --git a/ush/rocoto_fake_slurm/sbatch b/ush/rocoto_fake_slurm/sbatch index dcf43e45ee..5972e8d41e 100755 --- a/ush/rocoto_fake_slurm/sbatch +++ b/ush/rocoto_fake_slurm/sbatch @@ -13,6 +13,7 @@ fi #parse time TIM=`grep "#SBATCH -t" $FD | awk '{ print $3 }'` if [ -z "$TIM" ]; then + SECS= CTIM= else SECS=`echo $TIM | awk 'BEGIN { FS = ":" } ; { secs = $1 * 3600 + $2 * 60 + $3; print secs };'` @@ -31,10 +32,14 @@ CMD="`cat $FD`" #execute job in background bash -c "\ ds=\$(date --utc +%Y-%m-%d:%H:%M:%S); \ + de=\$(date --utc -d '$SECS sec' +%Y-%m-%d:%H:%M:%S); \ + echo $JOBNAME pid \$$ started \$ds ends \$de >>.job_database; \ + \ ${CTIM} ${CMD} &>$LOG; \ + \ excode=\$(tail -n 1 $LOG | awk '{print \$14}'); \ de=\$(date --utc +%Y-%m-%d:%H:%M:%S); \ - echo $JOBNAME pid \$$ exitcode \$excode started \$ds ended \$de >>.job_database;" & + echo $JOBNAME pid \$$ ended \$de exitcode \$excode >>.job_database;" & #submission info pid=$! 
diff --git a/ush/rocoto_fake_slurm/squeue b/ush/rocoto_fake_slurm/squeue index 5c18b582b2..915ca58506 100755 --- a/ush/rocoto_fake_slurm/squeue +++ b/ush/rocoto_fake_slurm/squeue @@ -3,33 +3,40 @@ # Emulates slurm's squeue if [[ "$1" = "--jobs="* ]]; then PIDS="${1:7}" + PIDS="${PIDS//,/' '}" elif [[ -f .job_database ]]; then PIDS=$(cat .job_database | grep submitted | sort -u -k1,1 | awk '{print $3}') fi -PIDS="${PIDS//,/' '}" # Output info the way rocoto calls squeue FMT="%-40s%-40s%-10s%-20s%-30s%-30s%-30s%-30s%-10s%-30s%-200s\n" printf "$FMT" JOBID USER CPUS PARTITION SUBMIT_TIME START_TIME END_TIME PRIORITY EXIT_CODE STATE NAME + for pid in ${PIDS}; do - t_sub=$(date --utc +%Y-%m-%d:%H:%M:%S) + + t_sub="N/A" t_start=$t_sub t_end=$t_sub name=$pid user=${USER:-user} + exitc=0 + state="UNKNOWN" v=$(cat .job_database | grep "pid $pid submitted" | awk '{print $1" "$5}') if [ ! -z "$v" ]; then + state="PENDING" read name t_sub <<< "$v" - v=$(cat .job_database | grep "pid $pid exitcode" | awk '{print $5" "$7" "$9}') + v=$(cat .job_database | grep "pid $pid started" | awk '{print $5" "$7}') if [ ! -z "$v" ]; then - read v t_start t_end <<< "$v" + state="RUNNING" + read t_start t_end<<< "$v" + fi + v=$(cat .job_database | grep "pid $pid ended" | awk '{print $5" "$7}') + if [ ! -z "$v" ]; then + state="COMPLETED" + read t_end exitc <<< "$v" fi fi - if [ -z "$v" ]; then - printf "$FMT" $pid $user 1 linux $t_sub $t_start $t_end 0.1 0 RUNNING $name - else - printf "$FMT" $pid $user 1 linux $t_sub $t_start $t_end 0.1 $v COMPLETED $name - fi + printf "$FMT" $pid $user 1 linux $t_sub $t_start $t_end 0.1 $exitc $state $name done From 94fba8484323c9faef5c1186646015b59d8dc91f Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Thu, 8 Dec 2022 06:59:21 -0700 Subject: [PATCH 12/18] Add a taskthrottle=1 option for linux/mac. 
--- parm/FV3LAM_wflow.xml | 7 ++++++- ush/config_defaults.yaml | 5 +++++ ush/machine/linux.yaml | 1 + ush/machine/macos.yaml | 1 + 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/parm/FV3LAM_wflow.xml b/parm/FV3LAM_wflow.xml index d46e70dee1..d7cbd7b8df 100644 --- a/parm/FV3LAM_wflow.xml +++ b/parm/FV3LAM_wflow.xml @@ -27,6 +27,11 @@ Workflow task names. {%- else %} {%- endif %} +{%- if taskthrottle %} + +{%- else %} + +{%- endif %} @@ -153,7 +158,7 @@ tasks; and the "FCST" type is used for the RUN_FCST_TN task. ]> - + {# Double quotes are required inside the strftime! Expect an error from reading the template if using single quotes. #} {{ cdate_first_cycl.strftime("%M %H %d %m %Y *") }} diff --git a/ush/config_defaults.yaml b/ush/config_defaults.yaml index 5baa934f43..cd7d1daafa 100644 --- a/ush/config_defaults.yaml +++ b/ush/config_defaults.yaml @@ -87,6 +87,10 @@ platform: # The number of cores available per node on the compute platform, now # configurable for all platforms. # + # TASKTHROTTLE: + # The number of active tasks run simultaneously. For linux/mac setting this + # to 1 makes sense + # # BUILD_MOD_FN: # Name of alternative build module file to use if using an # unsupported platform. Is set automatically for supported machines. 
@@ -158,6 +162,7 @@ platform: # WORKFLOW_MANAGER: "" NCORES_PER_NODE: "" + TASKTHROTTLE: "" BUILD_MOD_FN: 'build_{{ user.MACHINE|lower() }}_{{ workflow.COMPILER }}' WFLOW_MOD_FN: 'wflow_{{ user.MACHINE|lower() }}' BUILD_VER_FN: 'build.ver.{{ user.MACHINE|lower() }}' diff --git a/ush/machine/linux.yaml b/ush/machine/linux.yaml index ec63af445a..cbd6bafc96 100644 --- a/ush/machine/linux.yaml +++ b/ush/machine/linux.yaml @@ -1,6 +1,7 @@ platform: WORKFLOW_MANAGER: rocoto NCORES_PER_NODE: 8 + TASKTHROTTLE: 1 SCHED: slurm CCPA_OBS_DIR: /home/username/DATA/UFS/obs_data/ccpa/proc MRMS_OBS_DIR: /home/username/DATA/UFS/obs_data/mrms/proc diff --git a/ush/machine/macos.yaml b/ush/machine/macos.yaml index 3a89650d8c..8d1b3c66af 100644 --- a/ush/machine/macos.yaml +++ b/ush/machine/macos.yaml @@ -1,6 +1,7 @@ platform: WORKFLOW_MANAGER: rocoto NCORES_PER_NODE: 8 + TASKTHROTTLE: 1 SCHED: slurm CCPA_OBS_DIR: /Users/username/DATA/UFS/obs_data/ccpa/proc MRMS_OBS_DIR: /Users/username/DATA/UFS/obs_data/mrms/proc From b6a5cd35a96e9184169fdddf5dfb4a3babdd9493 Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Thu, 8 Dec 2022 07:15:12 -0700 Subject: [PATCH 13/18] Don't specify number of processes for mpirun. 
--- ush/machine/linux.yaml | 6 +++--- ush/machine/macos.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ush/machine/linux.yaml b/ush/machine/linux.yaml index cbd6bafc96..11a653d350 100644 --- a/ush/machine/linux.yaml +++ b/ush/machine/linux.yaml @@ -10,10 +10,10 @@ platform: MET_BIN_EXEC: bin MET_INSTALL_DIR: "" DOMAIN_PREGEN_BASEDIR: /home/username/DATA/UFS/FV3LAM_pregen - RUN_CMD_FCST: 'mpirun -n ${PE_MEMBER01} ' - RUN_CMD_POST: 'mpirun -n 4 ' + RUN_CMD_FCST: mpirun -n ${PE_MEMBER01} + RUN_CMD_POST: mpirun RUN_CMD_SERIAL: time - RUN_CMD_UTILS: mpirun -n 4 + RUN_CMD_UTILS: mpirun PRE_TASK_CMDS: '{ ulimit -a; }' TEST_EXTRN_MDL_SOURCE_BASEDIR: /home/username/DATA/UFS/input_model_data TEST_PREGEN_BASEDIR: /home/username/DATA/UFS/FV3LAM_pregen diff --git a/ush/machine/macos.yaml b/ush/machine/macos.yaml index 8d1b3c66af..3ea9e29879 100644 --- a/ush/machine/macos.yaml +++ b/ush/machine/macos.yaml @@ -10,10 +10,10 @@ platform: METPLUS_PATH: "" MET_BIN_EXEC: bin MET_INSTALL_DIR: "" - RUN_CMD_FCST: 'mpirun -n ${PE_MEMBER01} ' - RUN_CMD_POST: 'mpirun -n 4 ' + RUN_CMD_FCST: mpirun -n ${PE_MEMBER01} + RUN_CMD_POST: mpirun RUN_CMD_SERIAL: time - RUN_CMD_UTILS: mpirun -n 4 + RUN_CMD_UTILS: mpirun PRE_TASK_CMDS: '{ ulimit -a; }' TEST_EXTRN_MDL_SOURCE_BASEDIR: /Users/username/DATA/UFS/input_model_data TEST_PREGEN_BASEDIR: /Users/username/DATA/UFS/FV3LAM_pregen From b50e8a759d2a50fe1ff1ee67f116ea4bcc595ae0 Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Thu, 8 Dec 2022 13:40:34 -0700 Subject: [PATCH 14/18] Get exit code directly instead of from log file. 
--- ush/rocoto_fake_slurm/sbatch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ush/rocoto_fake_slurm/sbatch b/ush/rocoto_fake_slurm/sbatch index 5972e8d41e..b505c8ab9d 100755 --- a/ush/rocoto_fake_slurm/sbatch +++ b/ush/rocoto_fake_slurm/sbatch @@ -36,8 +36,8 @@ bash -c "\ echo $JOBNAME pid \$$ started \$ds ends \$de >>.job_database; \ \ ${CTIM} ${CMD} &>$LOG; \ + excode=\$?; \ \ - excode=\$(tail -n 1 $LOG | awk '{print \$14}'); \ de=\$(date --utc +%Y-%m-%d:%H:%M:%S); \ echo $JOBNAME pid \$$ ended \$de exitcode \$excode >>.job_database;" & From 15f58f73802f88a53c3a0dec2e7b5d4b9f1907fb Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Thu, 8 Dec 2022 21:54:35 +0000 Subject: [PATCH 15/18] Set taskthrottle to 1000 by default. --- parm/FV3LAM_wflow.xml | 7 +------ ush/config_defaults.yaml | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/parm/FV3LAM_wflow.xml b/parm/FV3LAM_wflow.xml index d7cbd7b8df..fda2b630c7 100644 --- a/parm/FV3LAM_wflow.xml +++ b/parm/FV3LAM_wflow.xml @@ -27,11 +27,6 @@ Workflow task names. {%- else %} {%- endif %} -{%- if taskthrottle %} - -{%- else %} - -{%- endif %} @@ -158,7 +153,7 @@ tasks; and the "FCST" type is used for the RUN_FCST_TN task. ]> - + {# Double quotes are required inside the strftime! Expect an error from reading the template if using single quotes. 
#} {{ cdate_first_cycl.strftime("%M %H %d %m %Y *") }} diff --git a/ush/config_defaults.yaml b/ush/config_defaults.yaml index cd7d1daafa..4974e79f50 100644 --- a/ush/config_defaults.yaml +++ b/ush/config_defaults.yaml @@ -162,7 +162,7 @@ platform: # WORKFLOW_MANAGER: "" NCORES_PER_NODE: "" - TASKTHROTTLE: "" + TASKTHROTTLE: 1000 BUILD_MOD_FN: 'build_{{ user.MACHINE|lower() }}_{{ workflow.COMPILER }}' WFLOW_MOD_FN: 'wflow_{{ user.MACHINE|lower() }}' BUILD_VER_FN: 'build.ver.{{ user.MACHINE|lower() }}' From 03d60f678f8c1f1a4886adc40e3b05654bdf6b5f Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Mon, 9 Jan 2023 13:07:25 -0700 Subject: [PATCH 16/18] Fix linux lmod path bug. --- etc/lmod-setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/lmod-setup.sh b/etc/lmod-setup.sh index 324868afce..c969778c12 100644 --- a/etc/lmod-setup.sh +++ b/etc/lmod-setup.sh @@ -31,7 +31,7 @@ if [ "$L_MACHINE" = macos ]; then module purge elif [ "$L_MACHINE" = linux ]; then - export BASH_ENV="/usr/share/share/lmod/init/bash" + export BASH_ENV="/usr/share/lmod/lmod/init/bash" source $BASH_ENV module purge From 998697f4548b6e3bf576334809ec4861e379bd43 Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Mon, 9 Jan 2023 13:10:41 -0700 Subject: [PATCH 17/18] Set stack size to unlimited for linux/mac. 
--- ush/machine/linux.yaml | 2 +- ush/machine/macos.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ush/machine/linux.yaml b/ush/machine/linux.yaml index 11a653d350..14dafe0e45 100644 --- a/ush/machine/linux.yaml +++ b/ush/machine/linux.yaml @@ -14,7 +14,7 @@ platform: RUN_CMD_POST: mpirun RUN_CMD_SERIAL: time RUN_CMD_UTILS: mpirun - PRE_TASK_CMDS: '{ ulimit -a; }' + PRE_TASK_CMDS: '{ ulimit -a; ulimit -s unlimited; }' TEST_EXTRN_MDL_SOURCE_BASEDIR: /home/username/DATA/UFS/input_model_data TEST_PREGEN_BASEDIR: /home/username/DATA/UFS/FV3LAM_pregen TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS: /home/username/DATA/UFS/dummy_FV3GFS_sys_dir diff --git a/ush/machine/macos.yaml b/ush/machine/macos.yaml index 3ea9e29879..3cca8ecc4c 100644 --- a/ush/machine/macos.yaml +++ b/ush/machine/macos.yaml @@ -14,7 +14,7 @@ platform: RUN_CMD_POST: mpirun RUN_CMD_SERIAL: time RUN_CMD_UTILS: mpirun - PRE_TASK_CMDS: '{ ulimit -a; }' + PRE_TASK_CMDS: '{ ulimit -a; ulimit -s unlimited; }' TEST_EXTRN_MDL_SOURCE_BASEDIR: /Users/username/DATA/UFS/input_model_data TEST_PREGEN_BASEDIR: /Users/username/DATA/UFS/FV3LAM_pregen TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS: /Users/username/DATA/UFS/dummy_FV3GFS_sys_dir From c8ec8fe94d4f428426cc181c87a200b2bcf4e724 Mon Sep 17 00:00:00 2001 From: Daniel Abdi Date: Mon, 9 Jan 2023 13:22:49 -0700 Subject: [PATCH 18/18] Fix unittest. --- ush/set_gridparams_ESGgrid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ush/set_gridparams_ESGgrid.py b/ush/set_gridparams_ESGgrid.py index 6f696b04d4..cf8ddb9ff8 100644 --- a/ush/set_gridparams_ESGgrid.py +++ b/ush/set_gridparams_ESGgrid.py @@ -90,7 +90,7 @@ def test_set_gridparams_ESGgrid(self): dely=3000.0, constants=dict( RADIUS_EARTH=6371200.0, - DEGS_PER_RADIAN=57.2957795131, + DEGS_PER_RADIAN=57.29577951308232087679, ), )